### Read the data and perform basic EDA

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the student-performance dataset from a CSV located in the working directory.
sp = pd.read_csv('Student_Performance.csv')

In [3]:
sp

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,12,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


In [4]:
sp['Performance Index'].max()

np.float64(100.0)

In [5]:
sp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [6]:
sp['Extracurricular Activities']

0       Yes
1        No
2       Yes
3       Yes
4        No
       ... 
9995    Yes
9996    Yes
9997    Yes
9998    Yes
9999     No
Name: Extracurricular Activities, Length: 10000, dtype: object

### Preparing the Data for Modelling

In [7]:
# Encode the Yes/No flag as 1/0 so the regression model can consume it.
# The mapping also accepts already-encoded 1/0 values, so re-running this cell
# is a no-op instead of silently turning every row into 0, and any unexpected
# label (typo, missing value) raises instead of being silently treated as "No".
activity_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
encoded_activities = sp['Extracurricular Activities'].map(activity_codes)
if encoded_activities.isna().any():
    raise ValueError(
        "Unexpected value in 'Extracurricular Activities'; expected 'Yes' or 'No'"
    )
sp['Extracurricular Activities'] = encoded_activities.astype('int64')

In [8]:
sp['Extracurricular Activities']

0       1
1       0
2       1
3       1
4       0
       ..
9995    1
9996    1
9997    1
9998    1
9999    0
Name: Extracurricular Activities, Length: 10000, dtype: int64

In [9]:
sp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  int64  
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 468.9 KB


In [10]:
# Feature matrix: every column except the target.
x = sp.drop(columns=['Performance Index'])

In [11]:
# Target vector: the column the model is fitted to predict.
y = sp['Performance Index']

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [15]:
# Standardise the features, then fit a linear regression on the scaled values.
# Both steps live in one Pipeline, and the pipeline is fitted on the training
# split only, so the scaler statistics never see the held-out rows.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LinearRegression()),
    ]
)

pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [16]:
x_train.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
4901,5,49,0,5,5
4375,7,88,1,4,9
6698,3,94,1,7,1
9805,9,54,1,5,9
1101,4,56,1,8,6


In [17]:
x_test.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
6252,5,69,0,8,2
4684,2,46,1,4,8
1731,7,56,1,7,5
4742,6,42,1,8,5
4521,7,53,0,4,6


In [18]:
pipeline.score(x_train, y_train)

0.9886395368813107

In [19]:
pipeline.score(x_test, y_test)

0.9890503599491122

In [20]:
# Per-feature statistics the scaler learned when the pipeline was fitted.
fitted_scaler = pipeline.named_steps['scaler']
print(fitted_scaler.mean_)
print(fitted_scaler.scale_)

[ 4.97933333 69.64066667  0.49453333  6.544       4.61906667]
[ 2.59542666 17.33605336  0.49997011  1.69903816  2.86106445]


# Training a Linear Regression Model

In [21]:
# NOTE: disabled, kept for reference only. This standalone (unscaled)
# LinearRegression fit is superseded by the `pipeline` fitted earlier.
# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(x_train, y_train)
# y_pred = model.predict(x_train)
# y_pred

In [22]:
# NOTE: disabled, kept for reference only. If re-enabled, `y_pred` comes from
# the disabled model cell and `y_pred_test` is not defined in the visible
# notebook (it would need something like `model.predict(x_test)`).
# from sklearn.metrics import r2_score
# train_R2 = r2_score(y_train, y_pred)
# train_R2
# test_R2 = r2_score(y_test, y_pred_test)
# test_R2

# R2 Score
## Linear Regression computes the R2 score by default via its `.score()` method

In [23]:
# model.score(x_train, y_train)

In [24]:
# model.score(x_test, y_test)

# Adjusted R2 score

In [25]:
# m = number of rows (samples), d = number of columns (features) per split.
m1, d1 = x_train.shape
m2, d2 = x_test.shape

In [26]:
m1,  d1,  m2,  d2

(7500, 5, 2500, 5)

In [27]:
def adjusted_r2(r2, m, d):
    """Return the adjusted R2 score.

    Computes ``1 - (1 - r2) * (m - 1) / (m - d - 1)``, which penalises the
    plain R2 score for the number of features used.

    Parameters
    ----------
    r2 : float
        Plain R2 score.
    m : int
        Number of samples the score was computed on.
    d : int
        Number of features used by the model.

    Returns
    -------
    float
        The adjusted R2 score.

    Raises
    ------
    ValueError
        If ``m - d - 1 <= 0``. The formula divides by that quantity, so the
        result would be a division by zero or a sign-flipped, meaningless value.
    """
    degrees_of_freedom = m - d - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            "adjusted R2 is undefined unless m > d + 1 "
            f"(got m={m}, d={d})"
        )
    return 1 - ((1 - r2) * (m - 1)) / degrees_of_freedom

In [28]:
# Adjusted R2 on the training split. The R2 comes from the fitted pipeline
# because `train_R2` is only assigned in disabled code.
adjusted_r2(pipeline.score(x_train, y_train), m1, d1)

In [29]:
# Adjusted R2 on the held-out split. The R2 comes from the fitted pipeline
# because `test_R2` is only assigned in disabled code.
adjusted_r2(pipeline.score(x_test, y_test), m2, d2)

In [30]:
# A single hand-written sample, using the same feature columns (in the same
# order) as the training frame, to try the fitted pipeline on.
p = pd.DataFrame(
    {
        'Hours Studied': [4],
        'Previous Scores': [83.8],
        'Extracurricular Activities': [1],
        'Sleep Hours': [6],
        'Sample Question Papers Practiced': [2],
    }
)

In [31]:
pipeline.predict(p)

array([66.54931149])

In [32]:
import joblib
# Persist the fitted pipeline (scaler + regression model) to a file in the
# working directory so it can be reloaded later without retraining.
# NOTE(review): joblib artifacts are presumably pickle-based, so they should
# only be loaded from trusted sources. Confirm against the joblib docs.
joblib.dump(pipeline, 'LinearRegression.joblib')

['LinearRegression.joblib']