In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore') 

In [3]:
# Read the student scores CSV; the path is relative to the working directory.
df=pd.read_csv('data/stud.csv')
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Target is the math score; every remaining column is used as a feature.
y = df['math_score']
x = df.drop(columns=['math_score'])

In [5]:
# Preview the feature frame to confirm math_score is no longer among the columns.
x.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [6]:
# List the distinct values of each categorical column, one line per column.
for column_name in (
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
):
    # Label and values share a line: the label print ends with a space, not a newline.
    print(f"Cotegories in {column_name} variable", end=" ")
    print(df[column_name].unique())

Cotegories in gender variable ['female' 'male']
Cotegories in race_ethnicity variable ['group B' 'group C' 'group A' 'group D' 'group E']
Cotegories in parental_level_of_education variable ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Cotegories in lunch variable ['standard' 'free/reduced']
Cotegories in test_preparation_course variable ['none' 'completed']


In [7]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric. These index objects are later
# handed to the column transformer.
cat_feature = x.select_dtypes(include='object').columns
num_feature = x.select_dtypes(exclude='object').columns

print("Numeric Feature: ", num_feature)
print('Categorical Feature : ', cat_feature)


Numeric Feature:  Index(['reading_score', 'writing_score'], dtype='object')
Categorical Feature :  Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')


In [8]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# Preprocessing step: one-hot encode the categorical columns and standardise
# the numeric ones. The step names are runtime identifiers and are kept verbatim.
oh_transformer = OneHotEncoder()
num_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', oh_transformer, cat_feature),
        ("StandardScalar", num_transformer, num_feature),
    ]
)

In [9]:
# Encode/scale the features. The input is rebuilt from `df` instead of reusing
# `x`, because this cell rebinds `x` to the transformed matrix: feeding `x`
# back in on a second run would pass an array to a transformer that selects
# columns by name, which fails. Deriving the input from `df` keeps the cell
# idempotent.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split, so the scaler statistics include test rows — consider fitting on the
# training split only.
x=preprocessor.fit_transform(df.drop(columns=['math_score']))

In [10]:
# Check the (rows, columns) of the transformed feature matrix.
x.shape

(1000, 19)

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Confirm the sizes of the training and test feature matrices.
x_train.shape,x_test.shape

((800, 19), (200, 19))

Create a model-evaluation function that returns all metrics after a model has been trained

In [17]:
def evaluate_model(true,predicted):
    """Score predictions against the observed targets with four regression metrics.

    Parameters
    ----------
    true
        Observed target values.
    predicted
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mse, mae, rmse, r2)`` in that order.
    """
    return (
        mean_squared_error(true, predicted),
        mean_absolute_error(true, predicted),
        root_mean_squared_error(true, predicted),
        r2_score(true, predicted),
    )

In [15]:
# Candidate regressors, keyed by a display name.
# Estimators with internal randomness are given a fixed seed (the same value
# used for the train/test split) so that a fresh Restart & Run All reproduces
# the same metrics. CatBoost's per-iteration training log is silenced to keep
# the cell output readable.
models={
    'Linear Regression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'Decision Tree Regressor':DecisionTreeRegressor(random_state=42),
    'KNeighbors Regressor':KNeighborsRegressor(),
    'Random Forest Regressor':RandomForestRegressor(random_state=42),
    'AdaBoost Regressor':AdaBoostRegressor(random_state=42),
    'CatBoost Regressor':CatBoostRegressor(random_seed=42,verbose=False),
    'XGB  Regressor':XGBRegressor(random_state=42)
}

In [22]:
# Fit every candidate model and print its train/test metrics.
model_list=[]   # estimators, in the order they were evaluated
r2_list=[]      # test-set R2 for the matching entry of model_list
for model in models.values():
    model.fit(x_train,y_train)

    # Predict on both splits so over-/under-fitting shows up as a train/test gap.
    x_train_pred=model.predict(x_train)
    x_test_pred=model.predict(x_test)

    model_train_mse,model_train_mae,model_train_rmse,model_train_r2=evaluate_model(y_train,x_train_pred)
    model_test_mse,model_test_mae,model_test_rmse,model_test_r2=evaluate_model(y_test,x_test_pred)

    print("Model Name :" ,model)
    model_list.append(model)
    # Record the test R2 alongside the model; previously r2_list was
    # initialised but never filled, leaving it empty after the loop.
    r2_list.append(model_test_r2)

    print("Model Performance For Training Set")
    print("Mean Squired Error : {:.4f}".format(model_train_mse))
    print("Mean absolute Error : {:.4f}".format(model_train_mae))
    print("Root mean Squired Error : {:.4f}".format(model_train_rmse))
    print("R2 Score : {:.4f}".format(model_train_r2))

    print("-"*50)
    print("Model Performance For Testing Set")
    print("Mean Squired Error : {:.4f}".format(model_test_mse))
    print("Mean absolute Error : {:.4f}".format(model_test_mae))
    print("Root mean Squired Error : {:.4f}".format(model_test_rmse))
    print("R2 Score : {:.4f}".format(model_test_r2))
    print('='*70)



Model Name : LinearRegression()
Model Performance For Training Set
Mean Squired Error : 28.3349
Mean absolute Error : 4.2667
Root mean Squired Error : 5.3231
R2 Score : 0.8743
--------------------------------------------------
Model Performance For Testing Set
Mean Squired Error : 29.0952
Mean absolute Error : 4.2148
Root mean Squired Error : 5.3940
R2 Score : 0.8804
Model Name : Ridge()
Model Performance For Training Set
Mean Squired Error : 28.3378
Mean absolute Error : 4.2650
Root mean Squired Error : 5.3233
R2 Score : 0.8743
--------------------------------------------------
Model Performance For Testing Set
Mean Squired Error : 29.0563
Mean absolute Error : 4.2111
Root mean Squired Error : 5.3904
R2 Score : 0.8806
Model Name : Lasso()
Model Performance For Training Set
Mean Squired Error : 43.4784
Mean absolute Error : 5.2063
Root mean Squired Error : 6.5938
R2 Score : 0.8071
--------------------------------------------------
Model Performance For Testing Set
Mean Squired Error : 