In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
data=pd.read_csv('data/stud.csv')
data.head(2)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [3]:
x=data.drop('math_score',axis=1)
x.head(2)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88


In [4]:
y=data.math_score

In [5]:
num_col=[num for num in x.columns if x[num].dtype!='O']
cat_col=[cat for cat in x.columns if x[cat].dtype=='O']

In [6]:
num_transformer=StandardScaler()
Oh_transformer=OneHotEncoder()
preprocessor=ColumnTransformer(
    [
        ("OneHotEncoding",Oh_transformer,cat_col),
        ("Standardscalar",num_transformer,num_col)
    ]
)

In [7]:
x=preprocessor.fit_transform(x)

In [8]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape

((800, 19), (200, 19))

In [9]:
def evaluate_model(actual,predicted):
    mae=mean_absolute_error(actual,predicted)
    mse=mean_squared_error(actual,predicted)
    rmse=np.sqrt(mse)
    R2_score=r2_score(actual,predicted)
    return mae,rmse,R2_score

In [10]:
# Lists to store results
model_list = []
best_model_list=[]
r2_list_train = []
r2_list_test = []
rmse_list_train = []
rmse_list_test = []

# Model and parameters dictionary
models = {
    "LinearReg": LinearRegression(),
    "LassoReg": Lasso(),
    "RidgeReg": Ridge(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "XGBReg": XGBRegressor(),
    "CatBoost": CatBoostRegressor(verbose=False),
    "AdaBoostReg": AdaBoostRegressor()
}

params = {
    "LinearReg": {},  # Linear regression doesn't require hyperparameter tuning
    "LassoReg": {'alpha': [0.1, 1, 10, 100, 1000]},
    "RidgeReg": {'alpha': [0.1, 1, 10, 100, 1000]},
    "ElasticNet": {
        'alpha': [0.1, 1, 10, 100, 1000],
        'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0]
    },
    "DecisionTree": {
        "criterion": ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        "splitter": ['best', 'random'],
        "max_features": ['sqrt', 'log2']
    },
    "RandomForest": {'n_estimators': [10, 20, 50, 100, 200]},
    "XGBReg": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [10, 20, 50, 100, 200]
    },
    "CatBoost": {
        'depth': [5, 10, 15],
        'learning_rate': [0.1, 0.05, 0.01, 0.001],
        'iterations': [10, 20, 50, 100]
    },
    "AdaBoostReg": {
        'n_estimators': [10, 20, 50, 100, 200],
        'learning_rate': [0.1, 0.05, 0.01, 0.001]
    }
}

# Loop through each model and perform GridSearchCV
for model_name, model in models.items():
    param_grid = params[model_name]  # Get the hyperparameter grid for the current model
    
    # GridSearchCV for hyperparameter tuning
    gs = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
    gs.fit(x_train, y_train)  # Fit on training data
    
    # Use the best estimator found by GridSearchCV
    best_model = gs.best_estimator_
    
    # Predictions
    y_train_pred = best_model.predict(x_train)
    y_test_pred = best_model.predict(x_test)
    
    # Evaluate on training and test set
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    # Append model name and scores
    model_list.append(model_name)
    best_model_list.append(best_model)
    r2_list_train.append(model_train_r2)
    r2_list_test.append(model_test_r2)
    rmse_list_train.append(model_train_rmse)
    rmse_list_test.append(model_test_rmse)

# Create DataFrame to show results
model_output = pd.DataFrame({
    "Model": model_list,
    "Train_R2_score": r2_list_train,
    "Test_R2_score": r2_list_test,
    "Train_rmse_score": rmse_list_train,
    "Test_rmse_score": rmse_list_test,
    "Best_Model":best_model_list
})

# Output the result
print(model_output)


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Fitting 3 folds for each of 20 candidates, totalling 60 fits
          Model  Train_R2_score  Test_R2_score  Train_rmse_score  \
0     LinearReg        0.874023       0.879046          5.329287   
1      LassoReg        0.870729       0.881559          5.398496   
2      RidgeReg        0.874304       0.880593          5.323325   
3    ElasticNet        0.870729       0.881559          5.398496   
4  DecisionTree        0.999653       0.635672          0.279508   
5  RandomForest        0.976386       0.848716 

In [11]:

# Sort the DataFrame based on the 'Test_R2_score' to get the best model
best_model_row = model_output.loc[model_output['Test_R2_score'].idxmax()]

# Retrieve the best model name
best_model_name = best_model_row['Model']

# Get the best model from the models dictionary
best_model = best_model_row['Best_Model']

# Make predictions using the best model

y_best_pred = best_model.predict(x_test)  # Predict on test data

# You can now evaluate or use 'y_best_pred' as needed
print(f"Best model: {best_model_name}:{r2_score(y_pred=y_best_pred,y_true=y_test)}")
model_output['diff']=model_output.Test_rmse_score-model_output.Train_rmse_score
model_output.sort_values(by='Test_R2_score',ascending=False,inplace=True)
model_output_drop=model_output.drop('Best_Model',axis=1)
model_output_drop



Best model: LassoReg:0.8815586971937939


Unnamed: 0,Model,Train_R2_score,Test_R2_score,Train_rmse_score,Test_rmse_score,diff
1,LassoReg,0.870729,0.881559,5.398496,5.368549,-0.029947
3,ElasticNet,0.870729,0.881559,5.398496,5.368549,-0.029947
2,RidgeReg,0.874304,0.880593,5.323325,5.390387,0.067062
0,LinearReg,0.874023,0.879046,5.329287,5.425185,0.095898
7,CatBoost,0.898532,0.861084,4.782854,5.814094,1.03124
6,XGBReg,0.945695,0.860795,3.499005,5.820131,2.321126
5,RandomForest,0.976386,0.848716,2.307322,6.067381,3.76006
8,AdaBoostReg,0.841753,0.841967,5.972981,6.20125,0.228269
4,DecisionTree,0.999653,0.635672,0.279508,9.415678,9.13617


In [26]:
best_model_row.Test_R2_score

0.8815586971937939

In [12]:
model_name_wogs=[]
rmse_wogs_train=[]
r2_wogs_train=[]
rmse_wogs_test=[]
r2_wogs_test=[]
for model_name, model in models.items():
    model.fit(x_train,y_train)
    y_pred_wogs_train=model.predict(x_train)
    y_pred_wogs_test=model.predict(x_test)
    model_train_mae_wogs,model_train_rmse_wogs,model_train_r2_wogs=evaluate_model(y_train,y_pred_wogs_train)
    model_test_mae_wogs,model_test_rmse_wogs,model_test_r2_wogs=evaluate_model(y_test,y_pred_wogs_test)
    model_name_wogs.append(model_name)
    rmse_wogs_train.append(model_train_rmse_wogs)
    r2_wogs_train.append(model_train_r2_wogs)
    rmse_wogs_test.append(model_test_rmse_wogs)
    r2_wogs_test.append(model_test_r2_wogs)
model_output_wogs = pd.DataFrame({
    "Model": model_name_wogs,
    "Train_R2_score": r2_wogs_train,
    "Test_R2_score": r2_wogs_test,
    "Train_rmse_score": rmse_wogs_train,
    "Test_rmse_score": rmse_wogs_test
})



In [13]:
model_out_put_wogs=model_output_wogs.sort_values(by='Test_R2_score',ascending=False)
model_output_wogs['diff']=model_output_wogs.Train_rmse_score-model_out_put_wogs.Test_rmse_score
model_out_put_wogs

Unnamed: 0,Model,Train_R2_score,Test_R2_score,Train_rmse_score,Test_rmse_score
2,RidgeReg,0.874304,0.880593,5.323325,5.390387
0,LinearReg,0.874023,0.879046,5.329287,5.425185
5,RandomForest,0.976769,0.852497,2.288519,5.991093
7,CatBoost,0.958936,0.851632,3.042664,6.008632
8,AdaBoostReg,0.847822,0.843429,5.857322,6.172501
6,XGBReg,0.9955,0.827797,1.007282,6.473307
1,LassoReg,0.807146,0.82532,6.593816,6.519695
4,DecisionTree,0.999653,0.759532,0.279508,7.64951
3,ElasticNet,0.733779,0.739624,7.747189,7.959863


In [14]:
model_output

Unnamed: 0,Model,Train_R2_score,Test_R2_score,Train_rmse_score,Test_rmse_score,Best_Model,diff
1,LassoReg,0.870729,0.881559,5.398496,5.368549,Lasso(alpha=0.1),-0.029947
3,ElasticNet,0.870729,0.881559,5.398496,5.368549,"ElasticNet(alpha=0.1, l1_ratio=1.0)",-0.029947
2,RidgeReg,0.874304,0.880593,5.323325,5.390387,Ridge(alpha=1),0.067062
0,LinearReg,0.874023,0.879046,5.329287,5.425185,LinearRegression(),0.095898
7,CatBoost,0.898532,0.861084,4.782854,5.814094,<catboost.core.CatBoostRegressor object at 0x0...,1.03124
6,XGBReg,0.945695,0.860795,3.499005,5.820131,"XGBRegressor(base_score=None, booster=None, ca...",2.321126
5,RandomForest,0.976386,0.848716,2.307322,6.067381,"(DecisionTreeRegressor(max_features=1.0, rando...",3.76006
8,AdaBoostReg,0.841753,0.841967,5.972981,6.20125,"(DecisionTreeRegressor(max_depth=3, random_sta...",0.228269
4,DecisionTree,0.999653,0.635672,0.279508,9.415678,DecisionTreeRegressor(criterion='absolute_erro...,9.13617
