## Model Training

### 1.1 Import Data and Required Packages

In [88]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [89]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings 

In [90]:
# Load the student performance dataset from the relative path below.
csv_path = "data/StudentsPerformance.csv"
df = pd.read_csv(csv_path)

In [91]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### 1.2 Preparing the X and y variables

In [92]:
# Separate the target column from the predictor columns.
target_column = 'math score'
y = df[target_column]
X = df.drop(columns=[target_column])

In [93]:
# Dimensions (rows, columns) of the feature frame built in the previous cell
X.shape

(1000, 7)

In [94]:
# Partition the predictor column names by dtype: numeric columns on one side,
# everything else on the other.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

print("numeric_features:", numeric_features)
print("categorical_features:", categorical_features)

numeric_features: ['reading score', 'writing score']
categorical_features: ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']


In [95]:


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One transformer per column group: one-hot encoding for the categorical
# columns, standard scaling for the numeric ones.
oh_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
numeric_transformer = StandardScaler()

# (name, transformer, columns) triples handed to the ColumnTransformer.
transformer_steps = [
    ("OneHotEncoder", oh_transformer, categorical_features),
    ("StandardScaler", numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformer_steps)

In [96]:
X = preprocessor.fit_transform(X)

In [97]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

### 1.3 Creating an evaluation function that returns all metrics after model training

In [98]:
def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the true target values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values for the same samples.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: the mean absolute error, the square root of the
        mean squared error, and the R2 score, in that order.
    """
    # RMSE is derived from the MSE; the MSE itself is not returned.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return (
        mean_absolute_error(y_true, y_pred),
        rmse,
        r2_score(y_true, y_pred),
    )

In [99]:
# Shared seed for every estimator that uses randomness, so that repeated runs
# of the notebook produce the same scores.
RANDOM_STATE = 42

# Candidate regressors keyed by the display name used in the printed report
# and in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNN Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "XGBoost Regressor": XGBRegressor(random_state=RANDOM_STATE),
    "CatBoost Regressor": CatBoostRegressor(verbose=False, random_state=RANDOM_STATE),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE)
}

# Parallel lists filled by the training loop: model name and its test-set R2.
model_list = []
r2_list = []

In [100]:

# Start from empty result lists so that re-running this cell does not append
# a second copy of every model to the lists.
model_list = []
r2_list = []

for name, model in models.items():
    try:
        # Train
        model.fit(X_train, y_train)

        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Evaluate
        model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
        model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    except Exception as e:
        # Best-effort: report the failure and keep going with the remaining
        # models; NaN is recorded as this model's score.
        print(f"Model {name} failed with error: {e}")
        model_test_r2 = np.nan

    else:
        # Report train and test metrics for a successfully fitted model
        print(name)

        print("Model performance for Training set")
        print(f" RMSE: {model_train_rmse:.4f}")
        print(f" MAE : {model_train_mae:.4f}")
        print(f" R2  : {model_train_r2:.4f}\n")

        print("Model performance for Test set")
        print(f" RMSE: {model_test_rmse:.4f}")
        print(f" MAE : {model_test_mae:.4f}")
        print(f" R2  : {model_test_r2:.4f}")

    # Record exactly one (name, R2) pair per model, in a single place, so the
    # two lists always stay the same length.
    model_list.append(name)
    r2_list.append(model_test_r2)
    print('='*35 + '\n')


Linear Regression
Model performance for Training set
 RMSE: 5.3231
 MAE : 4.2667
 R2  : 0.8743

Model performance for Test set
 RMSE: 5.3940
 MAE : 4.2148
 R2  : 0.8804

Lasso
Model performance for Training set
 RMSE: 6.5938
 MAE : 5.2063
 R2  : 0.8071

Model performance for Test set
 RMSE: 6.5197
 MAE : 5.1579
 R2  : 0.8253

Ridge
Model performance for Training set
 RMSE: 5.3233
 MAE : 4.2650
 R2  : 0.8743

Model performance for Test set
 RMSE: 5.3904
 MAE : 4.2111
 R2  : 0.8806

KNN Regressor
Model performance for Training set
 RMSE: 5.7172
 MAE : 4.5270
 R2  : 0.8550

Model performance for Test set
 RMSE: 7.2553
 MAE : 5.6280
 R2  : 0.7837

Decision Tree
Model performance for Training set
 RMSE: 0.2795
 MAE : 0.0187
 R2  : 0.9997

Model performance for Test set
 RMSE: 8.1716
 MAE : 6.4450
 R2  : 0.7256

Random Forest
Model performance for Training set
 RMSE: 2.3011
 MAE : 1.8242
 R2  : 0.9765

Model performance for Test set
 RMSE: 5.9282
 MAE : 4.6261
 R2  : 0.8556

XGBoost Regresso

In [101]:

# Rank the models by test-set R2, highest first, with a fresh 0..n-1 index
results = (
    pd.DataFrame({"Model": model_list, "R2": r2_list})
    .sort_values(by="R2", ascending=False)
    .reset_index(drop=True)
)
print(results)

                Model        R2
0               Ridge  0.880593
1   Linear Regression  0.880433
2       Random Forest  0.855578
3  CatBoost Regressor  0.851632
4  AdaBoost Regressor  0.851615
5   XGBoost Regressor  0.827797
6               Lasso  0.825320
7       KNN Regressor  0.783681
8       Decision Tree  0.725588
