In [131]:
import pandas as pd 
from sklearn.impute import  SimpleImputer

In [132]:
dataset = pd.read_csv('./data/StudentPerformanceFactors.csv')

In [133]:
dataset.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [134]:
# independent and dependent varaible
x = dataset.drop(columns=['Exam_Score'],axis=1)
y = dataset['Exam_Score']

## EDA 

In [135]:
# checking null value
x.isnull().sum()

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
dtype: int64

So here we can see that there is some null in some columns. So we need to fix it.

In [136]:
dataset['Teacher_Quality'].value_counts()

Teacher_Quality
Medium    3925
High      1947
Low        657
Name: count, dtype: int64

In [137]:
x['Teacher_Quality'].fillna(x['Teacher_Quality'].mode,inplace=True)

In [138]:
x['Parental_Education_Level'].fillna(dataset['Parental_Education_Level'].mode,inplace=True)

In [139]:
x['Distance_from_Home'].value_counts()

Distance_from_Home
Near        3884
Moderate    1998
Far          658
Name: count, dtype: int64

In [140]:
x['Distance_from_Home'].fillna(x['Distance_from_Home'].mode,inplace=True)

In [141]:
x.isnull().sum()

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
dtype: int64

In [142]:
cat_fet = [f for f in x.columns if x[f].dtype == "O"]
num_fet = [f for f in x.columns if x[f].dtype != "O"]

In [143]:
for col in cat_fet:
    types = x[col].apply(lambda v: type(v).__name__).unique()
    print(f"{col}: {types}")


Parental_Involvement: ['str']
Access_to_Resources: ['str']
Extracurricular_Activities: ['str']
Motivation_Level: ['str']
Internet_Access: ['str']
Family_Income: ['str']
Teacher_Quality: ['str' 'method']
School_Type: ['str']
Peer_Influence: ['str']
Learning_Disabilities: ['str']
Parental_Education_Level: ['str' 'method']
Distance_from_Home: ['str' 'method']
Gender: ['str']


In [144]:
x.drop(columns=['Teacher_Quality','Parental_Education_Level','Distance_from_Home'],axis=True,inplace=True)

## Encoding and Scaling the dataset

In [151]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

num_encoder = StandardScaler()
fet_encoder = OneHotEncoder()

cat_fet = [f for f in x.columns if x[f].dtype == "O"]
num_fet = [f for f in x.columns if x[f].dtype != "O"]

preprocessor = ColumnTransformer([
    ('OneHotEncoder',fet_encoder,cat_fet),
    ('StandardScalar',num_encoder,num_fet),
])

Here is some columns which contain str and num value so we are going to drop this columns 


In [165]:
cat_fet

['Parental_Involvement',
 'Access_to_Resources',
 'Extracurricular_Activities',
 'Motivation_Level',
 'Internet_Access',
 'Family_Income',
 'School_Type',
 'Peer_Influence',
 'Learning_Disabilities',
 'Gender']

In [166]:
num_fet

['Hours_Studied',
 'Attendance',
 'Sleep_Hours',
 'Previous_Scores',
 'Tutoring_Sessions',
 'Physical_Activity']

## Train Test Split

In [154]:
x = preprocessor.fit_transform(x)

In [155]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [163]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [157]:
def eval_model(y_test,y_pred):
    mse = mean_squared_error(y_test,y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    score = r2_score(y_test,y_pred)
    return mse,mae,score

In [159]:
models = {
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "SupportVectorRegressor":SVR(),
    "AdaboostRegressor":AdaBoostRegressor(),
    "GradientBoostingRegressor":GradientBoostingRegressor(),
}

In [160]:
r2_list = [] # for test dataset
for m in list(models):
   model = models[m]
   model.fit(x_train,y_train)
   y_train_pred = model.predict(x_train)
   y_test_pred = model.predict(x_test)

   # for training dataset
   y_train_mse,y_train_mae,y_train_score = eval_model(y_train,y_train_pred)
   # for test dataset
   y_test_mse,y_test_mae,y_test_score = eval_model(y_test,y_test_pred)

   r2_list.append(y_test_score)

   print(f"{m}")
   print("--"*30)
   print("Traing dataset")
   print(f"Mean Squared Error:{y_train_mse}")
   print(f"Mean Absolute Error:{y_train_mae}")
   print(f"R2 score:{y_train_score}")
   print("--"*30)
   print("Test data")
   print(f"Mean Squared Error:{y_test_mse}")
   print(f"Mean Absolute Error:{y_test_mae}")
   print(f"R2 score:{y_test_score}")
   print("=="*30)
   print("\n")

LinearRegression
------------------------------------------------------------
Traing dataset
Mean Squared Error:4.688513483861311
Mean Absolute Error:0.751113114947966
R2 score:0.695174254782924
------------------------------------------------------------
Test data
Mean Squared Error:3.6129578837107603
Mean Absolute Error:0.6782810136157338
R2 score:0.7443976399415164


Lasso
------------------------------------------------------------
Traing dataset
Mean Squared Error:9.248188499325941
Mean Absolute Error:1.960603118928058
R2 score:0.398724998676273
------------------------------------------------------------
Test data
Mean Squared Error:7.950068322537174
Mean Absolute Error:1.9063940444811696
R2 score:0.4375643748773437


Ridge
------------------------------------------------------------
Traing dataset
Mean Squared Error:4.6865527750564855
Mean Absolute Error:0.7462074189413999
R2 score:0.6953017311194306
------------------------------------------------------------
Test data
Mean Squ

In [161]:
pd.DataFrame(data=list(zip(list(models),r2_list)),columns=['Model name','R2 score']).sort_values(by=['R2 score'],ascending=False )

Unnamed: 0,Model name,R2 score
2,Ridge,0.744687
0,LinearRegression,0.744398
5,SupportVectorRegressor,0.736326
7,GradientBoostingRegressor,0.724395
3,KNeighborsRegressor,0.567717
1,Lasso,0.437564
4,DecisionTreeRegressor,0.181391
6,AdaboostRegressor,-0.843759
