In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

In [55]:
df = pd.read_csv('studPerf.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [56]:
df['total score'] = df['math score'] + df['reading score'] + df['writing score']
df['average score'] = df['total score'] / 3

In [None]:
#df = df.drop(columns=['math score', 'reading score', 'writing score'], axis=1)

In [57]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [None]:
#df = df.drop(columns=['lunch'], axis=1)

In [58]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [59]:
y = df['average score'] 
X = df.drop(columns=['average score'], axis=1)

In [60]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229


In [61]:
y.head()

0    72.666667
1    82.333333
2    92.666667
3    49.333333
4    76.333333
Name: average score, dtype: float64

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [63]:
print(type(X_train))
print(type(X_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [64]:
num_features = X_train.select_dtypes(exclude='object').columns
cat_features = X_train.select_dtypes(include='object').columns
print("Numerical Features:", num_features)
print("Categorical Features:", cat_features)

Numerical Features: Index(['math score', 'reading score', 'writing score', 'total score'], dtype='object')
Categorical Features: Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course'],
      dtype='object')


In [65]:
X_train.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score
82,male,group A,some college,free/reduced,completed,50,47,54,151
991,female,group B,some high school,standard,completed,65,82,78,225
789,female,group C,master's degree,free/reduced,none,52,65,61,178
894,female,group E,associate's degree,standard,none,59,62,69,190
398,male,group B,some high school,standard,none,74,63,57,194


In [66]:
num_features = X_train.select_dtypes(exclude='object').columns
cat_features = X_train.select_dtypes(include='object').columns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

scaler = StandardScaler()
ohe = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("StandardScaler", scaler, num_features),
        ("OneHotEncoder", ohe, cat_features)
    ]
)

In [67]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
models = {
    'LinearRegression' : LinearRegression(),
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'SVR' : SVR(),
    'KNN' : KNeighborsRegressor(),
    'DecisionTreeRegressor' : DecisionTreeRegressor(),
    'Random Forest Regressor' : RandomForestRegressor(),
    'AdaBoot Regressor' : AdaBoostRegressor(),
    'Gradient Boosting Regressor' : GradientBoostingRegressor(),
    'XGBoost Regressor' : XGBRegressor()
}

print("Predicting Average Score using different models...")
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    print(f"Model : {list(models.keys())[i]}")
    print("Train Data : ")
    print("Mean Absolute Error: ", mean_absolute_error(y_train, y_train_pred))
    print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_train, y_train_pred)))
    print("R2 Score: ", r2_score(y_train, y_train_pred))
    print("\n")
    print("Test Data : ")
    print("Mean Absolute Error: ", mean_absolute_error(y_test, y_test_pred))
    print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test, y_test_pred)))
    print("R2 Score: ", r2_score(y_test, y_test_pred))
    print("="*40)
    

Predicting Math Score using different models...
Model : LinearRegression
Train Data : 
Mean Absolute Error:  8.34650866939531e-15
Root Mean Squared Error:  1.1056495517622623e-14
R2 Score:  1.0


Test Data : 
Mean Absolute Error:  7.787548383930699e-15
Root Mean Squared Error:  1.0345658394727079e-14
R2 Score:  1.0


Model : Ridge
Train Data : 
Mean Absolute Error:  0.00487525195170529
Root Mean Squared Error:  0.006029770143598173
R2 Score:  0.9999998134230809


Test Data : 
Mean Absolute Error:  0.005170436781149278
Root Mean Squared Error:  0.00668769054114807
R2 Score:  0.9999998000891418


Model : Lasso
Train Data : 
Mean Absolute Error:  0.8029801644215713
Root Mean Squared Error:  1.0000000000000047
R2 Score:  0.9948683573814506


Test Data : 
Mean Absolute Error:  0.8475248278441536
Root Mean Squared Error:  1.0839870682379984
R2 Score:  0.9947479089273638


Model : SVR
Train Data : 
Mean Absolute Error:  0.8376308880846521
Root Mean Squared Error:  2.909731619837817
R2 Score: 