In [34]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score
from sklearn.linear_model import   LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('data/students/StudentsPerformance.csv')

In [3]:
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


In [4]:
x=df.drop('math score',axis=1)

In [5]:
y=df['math score']

In [6]:
x

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [7]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [8]:
num_features=x.select_dtypes(exclude=object).columns
categorical_features=x.select_dtypes(include=object).columns

In [9]:
num_features

Index(['reading score', 'writing score'], dtype='object')

In [10]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [11]:
num_tr=StandardScaler()
cat_tr=OneHotEncoder()
preprocess=ColumnTransformer([('StandardScaler',num_tr,num_features),
                              ('OneHotEncoder',cat_tr,categorical_features),])

In [12]:
x.shape

(1000, 7)

In [13]:
x=preprocess.fit_transform(x)

In [14]:
x.shape

(1000, 19)

In [15]:
from sklearn.model_selection import StratifiedKFold

In [16]:
n_splits=5
sc=StratifiedKFold(n_splits)
for train,test in sc.split(x,y):
    x_train,x_test=x[train],x[test]
    y_train,y_test=y[train],y[test]

In [17]:
models={
    "LinearRegression":LinearRegression(),'Ridge':Ridge(),'Lasso':Lasso(),
    'KNeighborsRegressor':KNeighborsRegressor(),'DecisionTreeRegressor':DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),'XGBRegressor':XGBRegressor(),'ExtraTreesRegressor':ExtraTreesRegressor(),
    "CatBoostRegressor":CatBoostRegressor()
}

In [38]:
def evaluate(true,predicted):
    score=r2_score(true,predicted)
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    mase=np.sqrt(mae) 
    return mse,mae,mase,score

In [55]:
mod=[]
for i in range(len(models.keys())):
    model=list(models.values())[i]
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    mse,mae,mase,score=evaluate(y_test,y_pred)
    mod.append(evaluate(y_test,y_pred))
    print(model)
    print('mean_squared_error',mse)
    print('mean_absolute_error',mae)
    print('mean_absolute_square_error',mase)
    print('r2_score',score)
    print('*'*35)    

LinearRegression()
mean_squared_error 30.47471214591611
mean_absolute_error 4.434829497881288
mean_absolute_square_error 2.1059034873140052
r2_score 0.8673836121545815
***********************************
Ridge()
mean_squared_error 30.50923941569264
mean_absolute_error 4.434417810390381
mean_absolute_square_error 2.105805738996449
r2_score 0.8672333603071479
***********************************
Lasso()
mean_squared_error 48.438037575055276
mean_absolute_error 5.47083764860783
mean_absolute_square_error 2.338982182191183
r2_score 0.7892128546853125
***********************************
KNeighborsRegressor()
mean_squared_error 48.623999999999995
mean_absolute_error 5.26
mean_absolute_square_error 2.293468988235943
r2_score 0.7884036045452929
***********************************
DecisionTreeRegressor()
mean_squared_error 61.425
mean_absolute_error 6.265
mean_absolute_square_error 2.5029982021567654
r2_score 0.7326976680074575
***********************************
RandomForestRegressor()
mean_squ

In [56]:
final_model=LinearRegression()

In [57]:
final_model.fit(x_train,y_train)

In [58]:
y_predict=final_model.predict(x_test)

In [59]:
r2_score(y_test,y_predict)

0.8673836121545815

In [69]:
data=pd.DataFrame({'original_value':y_test,'predicted_value':y_predict,'difference':y_test-y_predict})

In [70]:
data

Unnamed: 0,original_value,predicted_value,difference
17,18,21.312060,-3.312060
306,99,89.715824,9.284176
470,83,77.532133,5.467867
492,83,80.811469,2.188531
618,95,86.581673,8.418327
...,...,...,...
995,88,87.056877,0.943123
996,62,58.539019,3.460981
997,59,53.374083,5.625917
998,68,67.071056,0.928944


In [68]:
y_predict.shape

(200,)