In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [22]:
# Read the student-score table (presumably produced by an earlier EDA step,
# going by the file name) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='eda_stud.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total_score,average_score
0,0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [23]:
# Remove the columns that should not reach the model:
#   - 'Unnamed: 0'  : looks like a row index that was saved into the CSV -- TODO confirm
#   - 'total_score' : presumably the sum of the three subject scores (72 + 72 + 74 = 218
#                     in the preview), i.e. a direct multiple of the target
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without a KeyError once the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'total_score'], errors='ignore')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667
1,female,group C,some college,standard,completed,69,90,88,82.333333
2,female,group B,master's degree,standard,none,90,95,93,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333
4,male,group C,some college,standard,none,76,78,75,76.333333


In [24]:
# Feature frame: every remaining column except the regression target.
# NOTE(review): the three subject scores stay in X while the target looks like
# their mean (e.g. (72 + 72 + 74) / 3 = 72.67 in the preview), so the target may
# be fully determined by the features -- confirm this is intended.
X = df.drop(columns=['average_score'])
X.head(n=2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [25]:
# Regression target: the 'average_score' column as a Series.
y = df.loc[:, 'average_score']
y.head(n=2)

0    72.666667
1    82.333333
Name: average_score, dtype: float64

In [26]:
# Partition the feature columns by dtype. Selecting on "numeric vs. everything
# else" does not depend on whether text columns are stored as `object` or as a
# dedicated string dtype.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

# Scale numeric columns to zero mean / unit variance.
numeric_transformer = StandardScaler()
# One-hot encode the remaining columns. handle_unknown='ignore' makes transform()
# emit an all-zero group for a category that was not seen during fit instead of
# raising.
oh_transformer = OneHotEncoder(handle_unknown='ignore')

# Output column order follows the list order: one-hot columns first, then the
# scaled numeric columns.
preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', oh_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features),
    ]
)

In [27]:
X = preprocessor.fit_transform(X)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((800, 20), (200, 20))

In [29]:
def evaluate_model(true_val, pred):
    """Score a set of predictions against the ground truth.

    Args:
        true_val: Observed target values.
        pred: Predicted target values, aligned with ``true_val``.

    Returns:
        A 4-tuple ``(mae, mse, rmse, r2)`` holding the mean absolute error,
        mean squared error, root mean squared error and R^2 score, in that order.
    """
    return (
        mean_absolute_error(true_val, pred),
        mean_squared_error(true_val, pred),
        root_mean_squared_error(true_val, pred),
        r2_score(true_val, pred),
    )

In [30]:
# Candidate regressors, all with library-default hyper-parameters. Estimators that
# use randomness get a fixed seed so that re-running the cell reproduces the
# same numbers.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

# Parallel lists: model name and its test-set R^2, filled in the loop below so the
# models can be compared afterwards. Re-created on every run of the cell.
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Each result is a (mae, mse, rmse, r2) tuple from evaluate_model.
    train_metrics = evaluate_model(y_train, model.predict(X_train))
    test_metrics = evaluate_model(y_test, model.predict(X_test))

    print(model_name)

    # Same report layout for both splits; the test header carries a leading
    # newline to separate it from the training block.
    for header, (mae, mse, rmse, r2score) in (
        ("Model Performance for training data:", train_metrics),
        ("\nModel Performance for test data:", test_metrics),
    ):
        print(header)
        print(f"MAE: {mae}")
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"r2_score: {r2score}")

    print('-' * 100)

    # Keep the name and the held-out R^2 (last element of the metrics tuple).
    model_list.append(model_name)
    r2_list.append(test_metrics[3])


LinearRegression
Model Performance for training data:
MAE: 1.4659384817150566e-14
MSE: 3.8160357429161226e-28
RMSE: 1.9534676201350568e-14
r2_score: 1.0

Model Performance for test data:
MAE: 1.4725998198628075e-14
MSE: 4.173389732981241e-28
RMSE: 2.0428875967564247e-14
r2_score: 1.0
----------------------------------------------------------------------------------------------------
Lasso
Model Performance for training data:
MAE: 0.8506012933972135
MSE: 1.1331525964023317
RMSE: 1.0644964050678292
r2_score: 0.994318597201073

Model Performance for test data:
MAE: 0.8768807904727821
MSE: 1.2413568666700592
RMSE: 1.1141619571094945
r2_score: 0.9942091862272519
----------------------------------------------------------------------------------------------------
Ridge
Model Performance for training data:
MAE: 0.006499357373334163
MSE: 6.438220039522209e-05
RMSE: 0.00802385196742949
r2_score: 0.9999996772003924

Model Performance for test data:
MAE: 0.006696634238849892
MSE: 7.765406367228233