In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle

In [222]:
spf_df = pd.read_csv('SPF.csv')
spf_df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [165]:
spf_df.drop(columns=['School_Type','Parental_Education_Level','Gender','Internet_Access','Learning_Disabilities','Distance_from_Home','Extracurricular_Activities','Parental_Involvement','Family_Income','Teacher_Quality'], inplace=True)

In [166]:
spf_df['Exam_Score'].replace(101,100,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spf_df['Exam_Score'].replace(101,100,inplace=True)


In [167]:
spf_df.shape

(6607, 10)

In [168]:
spf_df.head()

Unnamed: 0,Hours_Studied,Attendance,Access_to_Resources,Sleep_Hours,Previous_Scores,Motivation_Level,Tutoring_Sessions,Peer_Influence,Physical_Activity,Exam_Score
0,23,84,High,7,73,Low,0,Positive,3,67
1,19,64,Medium,8,59,Low,2,Negative,4,61
2,24,98,Medium,7,91,Medium,2,Neutral,4,74
3,29,89,Medium,8,98,Medium,1,Negative,4,71
4,19,92,Medium,6,65,Medium,3,Neutral,4,70


In [169]:
spf_df.describe()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
count,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0
mean,19.975329,79.977448,7.02906,75.070531,1.493719,2.96761,67.235508
std,5.990594,11.547475,1.46812,14.399784,1.23057,1.031231,3.889161
min,1.0,60.0,4.0,50.0,0.0,0.0,55.0
25%,16.0,70.0,6.0,63.0,1.0,2.0,65.0
50%,20.0,80.0,7.0,75.0,1.0,3.0,67.0
75%,24.0,90.0,8.0,88.0,2.0,4.0,69.0
max,44.0,100.0,10.0,100.0,8.0,6.0,100.0


In [170]:
x_train, x_test, y_train, y_test = train_test_split(spf_df.iloc[:,:9], spf_df.iloc[:,9], test_size=0.2,random_state=42)

In [171]:
x_train.head()

Unnamed: 0,Hours_Studied,Attendance,Access_to_Resources,Sleep_Hours,Previous_Scores,Motivation_Level,Tutoring_Sessions,Peer_Influence,Physical_Activity
5810,27,79,High,8,63,High,2,Negative,5
1268,16,86,Medium,7,94,Medium,2,Neutral,3
414,22,87,Medium,8,83,Low,1,Neutral,1
4745,18,100,Medium,10,86,Medium,1,Neutral,3
654,35,78,Low,10,99,Medium,1,Positive,2


In [172]:
x_train.shape

(5285, 9)

In [173]:
oe_atr = OrdinalEncoder(categories=[['Low','Medium','High']])
x_train_atr = oe_atr.fit_transform(x_train[['Access_to_Resources']])
x_test_atr = oe_atr.transform(x_test[['Access_to_Resources']])


In [174]:
oe_ml = OrdinalEncoder(categories=[['Low','Medium','High']])
x_train_ml = oe_ml.fit_transform(x_train[['Motivation_Level']])
x_test_ml = oe_ml.transform(x_test[['Motivation_Level']])


In [175]:
oe_pi = OrdinalEncoder(categories=[['Negative','Neutral','Positive']])
x_train_pi = oe_pi.fit_transform(x_train[['Peer_Influence']])
x_test_pi = oe_pi.transform(x_test[['Peer_Influence']])

In [176]:
x_train_rem = x_train.drop(columns=['Access_to_Resources','Motivation_Level','Peer_Influence'])
x_test_rem = x_test.drop(columns=['Access_to_Resources','Motivation_Level','Peer_Influence'])

In [177]:
x_train_main = np.concatenate((x_train_rem,x_train_atr,x_train_ml,x_train_pi),axis=1)
x_test_main = np.concatenate((x_test_rem,x_test_atr,x_test_ml,x_test_pi),axis=1)

In [178]:
lr = LinearRegression()
lr.fit(x_train_main,y_train)
y_pred_lr = lr.predict(x_test_main)
r2_score(y_test, y_pred_lr).__format__('.2f')


'0.69'

In [179]:
dt = DecisionTreeRegressor()
dt.fit(x_train_main,y_train)
y_pred_dt = dt.predict(x_test_main)
r2_score(y_test, y_pred_dt).__format__('.2f')


'0.22'

In [180]:
rf = RandomForestRegressor()
rf.fit(x_train_main,y_train)
y_pred_rf = rf.predict(x_test_main)
r2_score(y_test, y_pred_rf).__format__('.2f')


'0.61'

In [181]:
pickle.dump(oe_atr, open('oe_atr.pkl', 'wb'))
pickle.dump(oe_ml, open('oe_ml.pkl', 'wb'))
pickle.dump(oe_pi, open('oe_pi.pkl', 'wb'))
pickle.dump(lr, open('lr_model.pkl', 'wb'))