## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset

In [2]:
# Read the student-performance CSV into a DataFrame; the path is relative,
# so the file must sit in the notebook's working directory.
students = pd.read_csv('Student_Performance.csv')

In [3]:
students.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


### Replacing spaces in column names with underscores

In [4]:
# Build the renamed labels with a comprehension. The loop variable is a real
# name rather than `_`: assigning to `_` in IPython shadows the built-in
# "last output" variable, which then stops being updated for the session.
columns = [name.replace(' ', '_') for name in students.columns]
students.columns = columns

In [5]:
# NOTE(review): this repeats the assignment already made at the end of the
# previous cell, so it changes nothing and the cell can be deleted.
students.columns = columns

In [6]:
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours_Studied                     10000 non-null  int64  
 1   Previous_Scores                   10000 non-null  int64  
 2   Extracurricular_Activities        10000 non-null  object 
 3   Sleep_Hours                       10000 non-null  int64  
 4   Sample_Question_Papers_Practiced  10000 non-null  int64  
 5   Performance_Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [7]:
students.describe()

Unnamed: 0,Hours_Studied,Previous_Scores,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


## Dividing the Dataset into Independent Features and the Dependent Feature

In [8]:
# Feature matrix: every column except the regression target.
X = students.drop('Performance_Index', axis='columns')
# Target vector: the performance index that the model is trained to predict.
y = students.loc[:, 'Performance_Index']

In [9]:
X.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced
0,7,99,Yes,9,1
1,4,82,No,4,2
2,8,51,Yes,7,2
3,5,52,Yes,5,2
4,7,75,No,8,5


In [10]:
y.head()

0    91.0
1    65.0
2    45.0
3    36.0
4    66.0
Name: Performance_Index, dtype: float64

## Converting Yes/No into 1 and 0

In [11]:
# Encode the Yes/No flag as 1/0. The mapping reads from the `students` frame
# (whose column still holds the original labels) rather than from X itself, so
# re-running this cell gives the same result; mapping the already-encoded X
# column a second time would turn every value into NaN.
X['Extracurricular_Activities'] = students['Extracurricular_Activities'].map({'Yes': 1, 'No': 0})

# Any label other than 'Yes'/'No' maps to NaN; fail here instead of passing NaN downstream.
assert X['Extracurricular_Activities'].notna().all(), 'unexpected value in Extracurricular_Activities'

In [12]:
X.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced
0,7,99,1,9,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5


## Splitting the Data into training and testing

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
X_train.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced
9069,4,99,1,6,1
2603,6,90,1,9,3
7738,8,57,0,6,1
1579,6,92,1,8,7
5058,6,64,1,8,4


In [15]:
y_train.head()

9069    82.0
2603    79.0
7738    50.0
1579    81.0
5058    52.0
Name: Performance_Index, dtype: float64

## Standardizing the features (scaler fitted on the training set only)

In [16]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler instance; it only learns its statistics when it is fitted in a later cell.
scaler = StandardScaler()

In [17]:
# Fit the scaler on the training split only, then replace X_train with its scaled
# version (a NumPy array, so the column names are no longer attached).
# NOTE(review): X_train is overwritten, so re-running this cell re-fits the scaler
# on already-scaled data; restart the kernel and run all cells in order to reset.
X_train = scaler.fit_transform(X_train)

In [18]:
X_train

array([[-0.37921729,  1.69543433,  1.01962103, -0.31944271, -1.26226064],
       [ 0.39110222,  1.17637842,  1.01962103,  1.44869036, -0.56482402],
       [ 1.16142173, -0.72682657, -0.98075655, -0.31944271, -1.26226064],
       ...,
       [ 1.54658149, -1.24588248, -0.98075655,  0.26993498,  0.4813309 ],
       [-1.53469656, -1.30355536, -0.98075655,  1.44869036, -1.61097894],
       [-1.1495368 , -1.36122824, -0.98075655, -0.31944271,  0.4813309 ]])

In [19]:
# Scale the test split with the already-fitted scaler (transform only, no re-fitting).
# NOTE(review): X_test is overwritten, so running this cell twice scales the data twice.
X_test = scaler.transform(X_test)

In [20]:
X_test

array([[ 0.00594246, -0.03475203, -0.98075655,  0.85931267, -0.91354233],
       [-1.1495368 , -1.36122824,  1.01962103, -1.49819809,  1.17876751],
       [ 0.77626198, -0.78449945,  1.01962103,  0.26993498,  0.13261259],
       ...,
       [ 0.00594246, -1.24588248,  1.01962103, -0.31944271, -0.21610572],
       [-0.76437705,  0.94568691,  1.01962103,  1.44869036,  0.13261259],
       [ 0.39110222, -1.53424687,  1.01962103, -0.31944271, -0.21610572]])

## Apply Linear Regression 

In [21]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator created with scikit-learn's default settings.
lr = LinearRegression()

In [22]:
# Train the model on the scaled training features and their matching targets.
lr.fit(X_train, y_train)

In [23]:
lr.intercept_

55.39971428571428

In [24]:
lr.coef_

array([ 7.42276987, 17.60827303,  0.29080153,  0.8138699 ,  0.54598268])

In [26]:
# Predict the performance index for the scaled, held-out test features.
y_predict = lr.predict(X_test)

In [27]:
y_predict

array([54.74728554, 22.61884434, 47.93665124, ..., 33.42451751,
       67.92578967, 31.20587077])

In [28]:
y_test

6252    51.0
4684    20.0
1731    46.0
4742    28.0
4521    41.0
        ... 
8014    32.0
1074    40.0
3063    33.0
6487    69.0
4705    29.0
Name: Performance_Index, Length: 3000, dtype: float64

In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Both metrics compare the held-out targets with the model's predictions for them.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
mse = mean_squared_error(y_true=y_test, y_pred=y_predict)

In [31]:
# RMSE is the square root of the MSE, which puts the error back in the target's own units.
rmse = np.sqrt(mse)

In [33]:
# Report the three test-set error metrics, one per line.
for label, value in (
    ('Mean Squared Error: ', mse),
    ('Root Mean Squared Error: ', rmse),
    ('Mean Absolute Error: ', mae),
):
    print(label, value)

Mean Squared Error:  4.066563824092649
Root Mean Squared Error:  2.0165722957763377
Mean Absolute Error:  1.6090437564045172
