In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
class LinearRegression():
  def __init__(self, learning_rate = 0.01, iteration=1000, print_info=False):
    self.learning_rate = learning_rate
    self.iteration = iteration
    self.print_info = print_info

  def predictions(self, X):
    return np.dot(X, self.weights) + self.bias

  def mse(self, y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

  def gradient(self, X, y_true, y_pred):
    m = len(y_true)
    dw = -(2/m) * np.dot(X.T, (y_true - y_pred))
    db = -(2/m) * np.sum(y_true - y_pred)
    return dw, db

  def fit(self, X, y):

    n_features = X.shape[1]
    self.weights = 0.01 * np.random.rand(n_features)
    self.bias = 0.01

    for i in range(self.iteration):
      y_pred = self.predictions(X)

      dw, db = self.gradient(X, y, y_pred)

      self.weights -= self.learning_rate*dw
      self.bias -= self.learning_rate*db

      loss = self.mse(y, y_pred)

      if self.print_info == True:
        print(f"Iteration : {i} and loss : {loss}")

In [3]:
df = pd.read_csv("/content/Student_Performance.csv")

In [4]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [5]:
df.isna().sum()

Unnamed: 0,0
Hours Studied,0
Previous Scores,0
Extracurricular Activities,0
Sleep Hours,0
Sample Question Papers Practiced,0
Performance Index,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [7]:
df["Extracurricular Activities"] = df["Extracurricular Activities"].replace({'Yes': 1, 'No': 0}).astype(int)
# df["Extracurricular Activities"] = df["Extracurricular Activities"].astype(int)

  df["Extracurricular Activities"] = df["Extracurricular Activities"].replace({'Yes': 1, 'No': 0}).astype(int)


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  int64  
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 468.9 KB


In [9]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [10]:
X = df[['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced']]
y = df['Performance Index']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [12]:
cols_to_scale = ['Hours Studied', 'Previous Scores', 'Sleep Hours', 'Sample Question Papers Practiced']

In [13]:
sc = StandardScaler()
X_train[cols_to_scale] = sc.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = sc.transform(X_test[cols_to_scale])

In [14]:
X_train.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
8459,-1.162102,-1.295836,1,-0.309844,-0.551045
5737,1.155631,-1.641292,1,-0.8996,-1.599657
6918,-0.389524,-1.295836,1,0.279913,-1.599657
5708,-0.775813,-0.201891,0,-0.309844,0.148029
2518,-0.775813,-0.835227,0,1.459427,-0.201508


In [15]:
model = LinearRegression(iteration=5000)

In [16]:
model.fit(X_train, y_train)

In [17]:
model.weights

array([ 7.38188924, 17.69631537,  0.62594911,  0.82093108,  0.55948636])

In [18]:
model.bias

55.020737218498084

In [19]:
y_pred = model.predictions(X_test)

In [20]:
model.mse(y_test, y_pred)

4.063208494408181