In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [2]:
df = pd.read_csv('eda_stud.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,female,group B,bachelor's degree,standard,none,72,72,74
1,1,female,group C,some college,standard,completed,69,90,88
2,2,female,group B,master's degree,standard,none,90,95,93
3,3,male,group A,associate's degree,free/reduced,none,47,57,44
4,4,male,group C,some college,standard,none,76,78,75


In [3]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
X = df.drop('math score', axis=1)
X.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88


In [5]:
y = df['math score']
y.head(2)

0    72
1    69
Name: math score, dtype: int64

In [6]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', oh_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features)
    ]
)

In [7]:
X = preprocessor.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [9]:
def evaluate_model(true_val, pred):
    mae = mean_absolute_error(true_val, pred)
    mse = mean_squared_error(true_val, pred)
    rmse = root_mean_squared_error(true_val, pred)
    r2score = r2_score(true_val, pred)
    return mae, mse, rmse, r2score

In [10]:
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_mse, model_train_rmse, model_train_r2score = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2score = evaluate_model(y_test, y_test_pred)

    print(model_name)

    print("Model Performance for training data:")
    print(f"MAE: {model_train_mae}")
    print(f"MSE: {model_train_mse}")
    print(f"RMSE: {model_train_rmse}")
    print(f"r2_score: {model_train_r2score}")
    print("\nModel Performance for test data:")
    print(f"MAE: {model_test_mae}")
    print(f"MSE: {model_test_mse}")
    print(f"RMSE: {model_test_rmse}")
    print(f"r2_score: {model_test_r2score}")

    print('-' * 100)


LinearRegression
Model Performance for training data:
MAE: 4.266711846071957
MSE: 28.33487038064859
RMSE: 5.323050852720514
r2_score: 0.8743172040139593

Model Performance for test data:
MAE: 4.21476314247485
MSE: 29.095169866715487
RMSE: 5.393993869732843
r2_score: 0.8804332983749565
----------------------------------------------------------------------------------------------------
Lasso
Model Performance for training data:
MAE: 5.206302661246526
MSE: 43.47840400585579
RMSE: 6.593815587795566
r2_score: 0.8071462015863456

Model Performance for test data:
MAE: 5.157881810347763
MSE: 42.5064168384116
RMSE: 6.519694535667419
r2_score: 0.8253197323627853
----------------------------------------------------------------------------------------------------
Ridge
Model Performance for training data:
MAE: 4.26498782372598
MSE: 28.33778823308244
RMSE: 5.323324922741654
r2_score: 0.8743042615212909

Model Performance for test data:
MAE: 4.211100688014261
MSE: 29.056272192348302
RMSE: 5.39038701