### **Importing all libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import joblib

### **Read the data and perform basic EDA**

In [2]:
# Load the dataset from a CSV path given relative to the current working
# directory, then preview the first rows.
df = pd.read_csv('Student_Performance.csv')
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,12,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(10000, 6)

In [4]:
# Column names, non-null counts and dtypes in one summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [5]:
# Number of missing values in each column.
df.isna().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [6]:
# Number of rows that are exact repeats of an earlier row.
# NOTE(review): duplicates are only counted here; no later cell in this
# notebook drops them, so they remain in the data that is split and modelled.
# Confirm that keeping them is intentional.
df.duplicated().sum()

np.int64(127)

### **Preparing the data for modelling**

In [7]:
# Columns stored with object dtype, i.e. the non-numeric candidates for encoding.
df.select_dtypes(include = 'object').columns

Index(['Extracurricular Activities'], dtype='object')

In [8]:
# How often each distinct value occurs in the column.
df['Extracurricular Activities'].value_counts()

Extracurricular Activities
No     5052
Yes    4948
Name: count, dtype: int64

In [9]:
def encoder(x):
    """Map a single value to a binary flag.

    Returns 1 when ``x`` equals the exact string ``'Yes'`` and 0 for every
    other value (including ``'No'``, differently cased spellings, and
    non-string inputs).
    """
    return 1 if x == 'Yes' else 0

In [10]:
# Encode the Yes/No column as 1/0.
# The dtype guard keeps this cell safe to re-run: encoder() returns 0 for every
# value other than 'Yes', so applying it a second time to the already-encoded
# integers would silently turn every 1 into 0.
if not pd.api.types.is_numeric_dtype(df['Extracurricular Activities']):
    df['Extracurricular Activities'] = df['Extracurricular Activities'].apply(encoder)
df['Extracurricular Activities'].value_counts()

Extracurricular Activities
0    5052
1    4948
Name: count, dtype: int64

In [11]:
# Columns with int/float dtype, to check which columns are numeric at this point.
df.select_dtypes(include = ['int', 'float']).columns

Index(['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
       'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index'],
      dtype='object')

### **Dividing the data into X and Y**

In [12]:
# Feature matrix: every column of df except the 'Performance Index' column.
x = df.drop(columns='Performance Index')
x.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,12,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5


In [13]:
# Target vector: the 'Performance Index' column on its own.
y = df['Performance Index']
y.head()

0    91.0
1    65.0
2    45.0
3    36.0
4    66.0
Name: Performance Index, dtype: float64

### **Splitting the data into x_train, x_test, y_train and y_test**

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state is fixed so the same rows
# land in each split on every run; without it the split, the scores below and
# the saved model/scaler change on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [15]:
# Shapes of the training features and training target.
x_train.shape, y_train.shape

((8000, 5), (8000,))

In [16]:
# Shapes of the test features and test target.
x_test.shape, y_test.shape

((2000, 5), (2000,))

### **Scaling**

In [17]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Always scale from the untouched rows of `x`, looked up by the index labels of
# each split. This keeps the cell idempotent: re-running it can never refit the
# scaler on data that has already been scaled, which would leave a
# near-identity scaler to be saved later.
x_train_raw = x.loc[x_train.index]
x_test_raw = x.loc[x_test.index]
# Fit on the training rows only, then reuse those statistics for the test rows
# so no information from the test split reaches the scaler.
# New DataFrames are built (instead of assigning columns in place) so the
# frames returned by train_test_split are not mutated and the int -> float
# dtype change is explicit.
x_train = pd.DataFrame(
    scaler.fit_transform(x_train_raw),
    columns=x_train_raw.columns,
    index=x_train_raw.index,
)
x_test = pd.DataFrame(
    scaler.transform(x_test_raw),
    columns=x_test_raw.columns,
    index=x_test_raw.index,
)
x_train.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
3507,0.393773,-0.317983,1.00778,0.28367,1.193342
6888,0.779684,0.428878,-0.99228,1.462561,-1.599428
4240,1.165594,1.23319,-0.99228,-0.305775,1.542438
327,-1.535778,-0.835041,1.00778,0.873116,-1.250332
6619,0.393773,-0.892492,-0.99228,-0.89522,1.193342


In [18]:
# Preview the test features after scaling.
x_test.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
9345,0.779684,1.635346,-0.99228,-1.484665,-0.203043
8854,1.165594,1.003387,-0.99228,0.28367,1.193342
2139,1.551504,-1.294648,1.00778,-0.305775,-0.901236
3493,1.551504,-0.892492,1.00778,1.462561,-1.599428
4460,-0.378047,-1.007394,1.00778,1.462561,-1.599428


In [19]:
# Fit a linear regression on the scaled training features (fit() returns the
# estimator itself), then report its score on the same training data.
model = LinearRegression().fit(x_train, y_train)
model.score(x_train, y_train)

0.988935213979212

In [20]:
# Score (R^2 for LinearRegression) on the held-out test split.
model.score(x_test, y_test)

0.9879920447282405

In [21]:
# Persist the fitted model to disk in the current working directory.
joblib.dump(model, 'linear_model.joblib')

['linear_model.joblib']

In [28]:
# Persist the fitted scaler alongside the model. The model was trained on
# scaled features, so new inputs must go through this same scaler (after the
# Yes/No encoding) before being passed to model.predict.
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']