# **Importing required libraries**

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# **Loading Data**

In [4]:
# Read the student-performance dataset; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv("Student_Performance.csv")
# Bare last expression: renders the frame (pandas truncates it to head/tail rows).
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


# **Preparing the data**

## Converting non-numeric to numeric

In [5]:
# Encode the Yes/No flag as 1/0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: with a
# plain {'Yes': 1, 'No': 0} mapping, a second execution would find no matching
# keys in the already-encoded column and turn every value into NaN.
df['Extracurricular Activities'] = df['Extracurricular Activities'].map(
    {'Yes': 1, 'No': 0, 1: 1, 0: 0}
)
# Confirm that every column is numeric now.
df.dtypes

Hours Studied                         int64
Previous Scores                       int64
Extracurricular Activities            int64
Sleep Hours                           int64
Sample Question Papers Practiced      int64
Performance Index                   float64
dtype: object

## Separating x and y

In [6]:
# Target: the score the model learns to predict.
y = df.loc[:, 'Performance Index']

In [7]:
# Features: every column except the target.
x = df.drop(columns=['Performance Index'])

In [8]:
# Preview the feature matrix (the target column is no longer present).
x

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,9,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5
...,...,...,...,...,...
9995,1,49,1,4,2
9996,7,64,1,8,5
9997,6,83,1,8,5
9998,9,97,1,7,0


## Data Normalization

In [9]:
# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so the test rows influence the min/max used
# for the training features. Fitting on the training rows only would change
# every downstream number, so it is flagged here rather than changed.
scaler = MinMaxScaler()
new_x = scaler.fit_transform(x)
# Take the labels from x itself instead of a hand-typed column list, so the
# scaled frame cannot drift out of sync with the features (and keeps x's index,
# which keeps it aligned with y).
x = pd.DataFrame(new_x, columns=x.columns, index=x.index)
x

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,0.750,1.000000,1.0,1.0,0.111111
1,0.375,0.711864,0.0,0.0,0.222222
2,0.875,0.186441,1.0,0.6,0.222222
3,0.500,0.203390,1.0,0.2,0.222222
4,0.750,0.593220,0.0,0.8,0.555556
...,...,...,...,...,...
9995,0.000,0.152542,1.0,0.0,0.222222
9996,0.750,0.406780,1.0,0.8,0.555556
9997,0.625,0.728814,1.0,0.8,0.555556
9998,1.000,0.966102,1.0,0.6,0.000000


## Data Splitting

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

In [11]:
# Preview the held-out features; the row labels are the original (shuffled) indices.
x_test

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
8018,0.125,0.813559,1.0,0.4,0.666667
9225,0.500,0.406780,1.0,0.0,0.333333
3854,0.500,0.152542,0.0,1.0,1.000000
2029,0.875,0.983051,1.0,0.2,1.000000
3539,1.000,0.796610,0.0,0.6,0.777778
...,...,...,...,...,...
6923,0.625,0.813559,1.0,0.6,0.666667
1207,0.000,0.169492,1.0,0.6,0.666667
7960,0.000,0.694915,0.0,0.4,0.666667
2339,0.500,0.118644,0.0,0.8,1.000000


# **Model Building**

## Cost Function

In [20]:
def cost(x_train,y_train, m0_old,m1_old,m2_old,m3_old,m4_old,m5_old):
    """Return the mean squared error of the linear model on the given data.

    The model is ``m0 + m1*x1 + m2*x2 + m3*x3 + m4*x4 + m5*x5``.

    Parameters
    ----------
    x_train : DataFrame or 2-D array-like
        Feature rows; must have exactly five columns, read positionally.
    y_train : Series or 1-D array-like
        Target values, one per row of ``x_train`` (matched by position).
    m0_old : float
        Intercept.
    m1_old, m2_old, m3_old, m4_old, m5_old : float
        Coefficients for the five feature columns, in column order.

    Returns
    -------
    float
        Mean of the squared residuals.

    Raises
    ------
    ValueError
        If ``x_train`` has no rows (the mean is undefined) or does not have
        five columns.
    """
    # Work on plain arrays: one matrix product replaces a Python loop with a
    # per-row ``.iloc`` lookup, which dominates the runtime of training.
    features = np.asarray(x_train, dtype=float)
    targets = np.asarray(y_train, dtype=float)
    if len(features) == 0:
        raise ValueError("cost() needs at least one row to compute a mean squared error")

    weights = np.array([m1_old, m2_old, m3_old, m4_old, m5_old], dtype=float)
    residuals = targets - (m0_old + features @ weights)
    return np.mean(residuals ** 2)


## Gradient Function

In [21]:
def gradient(x_train,y_train, m0_old,m1_old,m2_old,m3_old,m4_old,m5_old, L):
    """Take one full-batch gradient-descent step on the mean squared error.

    Despite the name, this returns the *updated parameters*, not the gradient.

    Parameters
    ----------
    x_train : DataFrame or 2-D array-like
        Feature rows; must have exactly five columns, read positionally.
    y_train : Series or 1-D array-like
        Target values, one per row of ``x_train`` (matched by position).
    m0_old : float
        Current intercept.
    m1_old, m2_old, m3_old, m4_old, m5_old : float
        Current coefficients for the five feature columns, in column order.
    L : float
        Learning rate (step size).

    Returns
    -------
    tuple
        ``(m0, m1, m2, m3, m4, m5)`` after the step. With no rows there is no
        gradient, so the parameters are returned unchanged.
    """
    features = np.asarray(x_train, dtype=float)
    targets = np.asarray(y_train, dtype=float)
    n = len(features)
    if n == 0:
        return m0_old, m1_old, m2_old, m3_old, m4_old, m5_old

    weights = np.array([m1_old, m2_old, m3_old, m4_old, m5_old], dtype=float)
    residuals = targets - (m0_old + features @ weights)

    # d(MSE)/d(m0) = (-2/n) * sum(residual); d(MSE)/d(mk) = (-2/n) * sum(residual * xk).
    # The matrix product computes all five feature gradients at once instead of
    # accumulating them row by row through ``.iloc``.
    m0_grad = (-2 / n) * residuals.sum()
    weight_grads = (-2 / n) * (features.T @ residuals)

    m0 = m0_old - L * m0_grad
    m1, m2, m3, m4, m5 = weights - L * weight_grads

    return m0, m1, m2, m3, m4, m5

## Using all the functions to build model

#### Building the model means fitting the parameter values that minimise the error

In [22]:
# Start from an all-zero model (intercept m0 and five coefficients m1..m5).
m0,m1,m2,m3,m4,m5 =0,0,0,0,0,0
L=0.1  # learning rate passed to gradient()
epochs = 1000  # upper bound on the number of gradient steps

# Baseline cost for the first convergence comparison in the loop.
prev_cost = cost(x_train,y_train, m0,m1,m2,m3,m4,m5)

for i in range(epochs):
  # One full-batch update of all six parameters.
  m0,m1,m2,m3,m4,m5 =gradient(x_train,y_train,m0,m1,m2,m3,m4,m5, L)
  curr_cost = cost(x_train,y_train, m0,m1,m2,m3,m4,m5)
  # Stop early once the cost changes by less than 1e-6 between consecutive steps.
  if(abs(curr_cost-prev_cost)< 1e-6):
    break
  prev_cost = curr_cost


In [26]:
# Show the fitted intercept (m0) and coefficients (m1..m5).
# The trailing comments record the values printed by the saved run.
print('m0 = ', m0) #11.504165403690507
print('m1 = ', m1) #22.750790411272785
print('m2 = ', m2) #60.1036918904586
print('m3 = ', m3) #0.6101715193600866
print('m4 = ', m4) #2.367650173281325
print('m5 = ', m5) #1.7280457220438492


m0 =  11.504165403690507
m1 =  22.750790411272785
m2 =  60.1036918904586
m3 =  0.6101715193600866
m4 =  2.367650173281325
m5 =  1.7280457220438492


In [28]:
# Same held-out features as displayed after the split; shown again before prediction.
x_test

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
8018,0.125,0.813559,1.0,0.4,0.666667
9225,0.500,0.406780,1.0,0.0,0.333333
3854,0.500,0.152542,0.0,1.0,1.000000
2029,0.875,0.983051,1.0,0.2,1.000000
3539,1.000,0.796610,0.0,0.6,0.777778
...,...,...,...,...,...
6923,0.625,0.813559,1.0,0.6,0.666667
1207,0.000,0.169492,1.0,0.6,0.666667
7960,0.000,0.694915,0.0,0.4,0.666667
2339,0.500,0.118644,0.0,0.8,1.000000


In [12]:
# Optional block
m0 =  11.504165403690507
m1 =  22.750790411272785
m2 =  60.1036918904586
m3 =  0.6101715193600866
m4 =  2.367650173281325
m5 =  1.7280457220438492

# **Prediction**

In [13]:
def predict(x_test, m0,m1,m2,m3,m4,m5):
    """Predict the target for every row with the fitted linear model.

    Parameters
    ----------
    x_test : DataFrame or 2-D array-like
        Feature rows; must have exactly five columns, read positionally.
    m0 : float
        Intercept.
    m1, m2, m3, m4, m5 : float
        Coefficients for the five feature columns, in column order.

    Returns
    -------
    list of float
        One prediction per row, in row order; empty when there are no rows.
    """
    features = np.asarray(x_test, dtype=float)
    if len(features) == 0:
        return []

    # A single matrix product replaces the per-row ``.iloc`` loop.
    weights = np.array([m1, m2, m3, m4, m5], dtype=float)
    return (m0 + features @ weights).tolist()
        

In [14]:
# Score the held-out rows with the current parameters.
y_pred = predict(
    x_test,
    m0, m1, m2, m3, m4, m5,
)

In [15]:
# Actual vs. predicted scores side by side, plus the signed error (actual - predicted).
result = pd.DataFrame(
    {'Y Actual': y_test, 'Y Predicted': y_pred}
).assign(
    difference=lambda frame: frame['Y Actual'] - frame['Y Predicted']
)
    

In [16]:
# Display the comparison table built in the previous cell.
result

Unnamed: 0,Y Actual,Y Predicted,difference
8018,65.0,65.955195,-0.955195
9225,51.0,48.514707,2.485293
3854,35.0,36.143616,-1.143616
2029,97.0,93.307840,3.692160
3539,87.0,84.898794,2.101206
...,...,...,...
6923,75.0,77.804120,-2.804120
1207,26.0,24.874024,1.125976
7960,55.0,55.370228,-0.370228
2339,35.0,33.632673,1.367327


# **Model Evaluation**

In [17]:
# Both sklearn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# but R2 is not, so both calls use the same order for consistency.
# mean_squared_error / r2_score are imported in the imports cell at the top.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

In [18]:
# Report the hold-out metrics computed in the previous cell.
print('R2 Score on the test data : ', test_r2)
print('Mean Squared Error on the test data : ', test_mse)

R2 Score on the test data :  0.9885564752158252
Mean Squared Error on the test data :  4.241243210220372


In [19]:
def calculate_accuracy(y_true, y_pred, tolerance):
    """Return the fraction of predictions within ``tolerance`` of the actual value.

    Values are compared by position. Both inputs are converted to numpy arrays
    first, so a pandas index on either side cannot re-align (and thereby
    mismatch) the pairs, and plain Python lists are accepted as well.

    Parameters
    ----------
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    tolerance : float
        Largest absolute error that still counts as a correct prediction.

    Returns
    -------
    float
        Share of predictions with ``|y_true - y_pred| <= tolerance``, in [0, 1].
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    within_tolerance = np.abs(actual - predicted) <= tolerance
    return np.mean(within_tolerance)

In [27]:
# A prediction counts as correct when it is within `tolerance` Performance Index
# points of the actual value.
tolerance = 4.5
accuracy_test = calculate_accuracy(y_test, y_pred, tolerance)

print(f'Test Accuracy within a tolerance of {tolerance}: {accuracy_test * 100:.2f}%') # saved run: 96.60% at tolerance 4.5

Test Accuracy within a tolerance of 4.5: 96.60%
