In [445]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2

# Seed handed to train_test_split so the train/test partition is reproducible.
RANDOM_STATE = 42


In [446]:
class HandmadeLinearRegression:
    """Base class for the hand-written linear regressors in this notebook.

    The fitted parameters live in ``self.w``: ``w[0]`` is the intercept and
    ``w[1:]`` holds one coefficient per feature column. Subclasses provide
    ``fit``; ``predict`` is shared.
    """

    def __init__(self):
        # Stays None until a subclass's fit() stores the weight vector.
        self.w = None

    def predict(self, X):
        """Return ``X.dot(w[1:]) + w[0]`` for every row of ``X``.

        Parameters
        ----------
        X : object exposing ``.dot`` (e.g. NumPy array or pandas DataFrame)
            One row per sample, one column per feature.

        Raises
        ------
        RuntimeError
            If called before ``fit`` has populated ``self.w``.
        """
        if self.w is None:
            # Fail with an actionable message instead of the opaque
            # "'NoneType' object is not subscriptable" from indexing None.
            raise RuntimeError(
                f"{type(self).__name__} is not fitted yet; "
                "call fit(X, Y) before predict(X)."
            )
        return X.dot(self.w[1:]) + self.w[0]

    def fit(self, X, Y):
        """Estimate ``self.w`` from ``X`` and ``Y``; subclasses must override.

        Raises
        ------
        NotImplementedError
            Always, on the base class: there is no default fitting strategy,
            and silently doing nothing would leave the model unusable.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement fit(X, Y)."
        )

In [447]:
class GradientDescent(HandmadeLinearRegression):
    """Linear regression fitted by full-batch gradient descent on squared error.

    Parameters
    ----------
    alpha : float
        Step size. The gradient is summed over samples (not averaged), so the
        effective step grows with the number of training rows.
    maxLoop : int
        Number of gradient steps to take.
    randomState : int
        Seed for the small random initial weights.
    """

    def __init__(self, alpha = 0.0001, maxLoop=5000, randomState=42):
        super().__init__()
        self.alpha = alpha
        self.maxLoop = maxLoop
        self.randomState = randomState

    def fit(self, X, Y):
        """Run ``maxLoop`` gradient steps and store the result in ``self.w``.

        Parameters
        ----------
        X : array-like of shape (nSamples, nFeatures)
        Y : array-like with nSamples values (flattened to 1-D)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``Y`` does not have one value per row of ``X``.

        Notes
        -----
        If the weights become NaN/Inf (divergence), a message is printed and
        fitting stops early, leaving the non-finite weights in ``self.w``.
        """
        # Convert once to plain float arrays. With pandas inputs, subtracting
        # a Series from the predictions aligns on index labels, so an X and Y
        # with different indexes would produce NaN; positional arrays avoid
        # that and also skip pandas dispatch on every iteration.
        features = np.asarray(X, dtype=float)
        # Flatten so a column-shaped Y cannot broadcast diff to (n, n).
        targets = np.asarray(Y, dtype=float).ravel()

        if features.ndim != 2:
            raise ValueError(
                f"X must be 2-D (samples x features), got {features.ndim}-D input"
            )
        if targets.shape[0] != features.shape[0]:
            raise ValueError(
                f"X has {features.shape[0]} rows but Y has {targets.shape[0]} values"
            )

        nFeatures = features.shape[1]
        numberGenerator = np.random.RandomState(self.randomState)
        # w[0] is the intercept, w[1:] the per-feature coefficients.
        self.w = numberGenerator.normal(loc=0.0, scale=0.01, size=1 + nFeatures)

        for i in range(self.maxLoop):
            diff = self.predict(features) - targets
            self.w[0] -= self.alpha * np.sum(diff)
            self.w[1:] -= self.alpha * np.dot(features.T, diff)

            # Divergence guard: stop as soon as any weight is NaN or +/-Inf.
            if not np.all(np.isfinite(self.w)):
                print(f"NaN or Inf detected at iteration {i}")
                return


In [448]:
class NormalEquation(HandmadeLinearRegression):
    """Closed-form least squares solved through the normal equation."""

    def addBias(self, X):
        """Return ``X`` with a column of ones prepended (the intercept term)."""
        onesColumn = np.ones((X.shape[0], 1))
        return np.hstack((onesColumn, X))

    def _fit(self, X, Y):
        """Store ``w = pinv(X^T X) (X^T Y)`` for an already bias-augmented ``X``."""
        gram = X.T @ X
        moment = X.T @ Y
        self.w = np.linalg.pinv(gram) @ moment

    def fit(self, X, Y):
        """Prepend the bias column to ``X`` and solve for ``self.w``."""
        self._fit(self.addBias(X), Y)

In [449]:
# Read the raw table, then impute gaps: numeric columns get their column mean
# first, and whatever is still missing afterwards gets the per-column mode.
studentDataset = (
    pd.read_csv('StudentPerformanceFactors.csv')
    .pipe(lambda frame: frame.fillna(frame.mean(numeric_only=True)))
    .pipe(lambda frame: frame.fillna(frame.mode().iloc[0]))
)

# Feature scaler; it is fitted in a later cell.
scaler = StandardScaler()


In [450]:
# Snapshot of the dataset's column labels; not referenced by the cells visible here.
columnNames = studentDataset.columns.values

# Shared scales for the categorical columns. Each column gets its own copy so
# editing one column's mapping never leaks into another.
lowMediumHigh = {'Low': -1, 'Medium': 0, 'High': 1}
noYes = {'No': -1, 'Yes': 1}

# Column name -> {category label -> numeric code}.
ordinalMap = {
    'Parental_Involvement': dict(lowMediumHigh),
    'Access_to_Resources': dict(lowMediumHigh),
    'Extracurricular_Activities': dict(noYes),
    'Motivation_Level': dict(lowMediumHigh),
    'Internet_Access': dict(noYes),
    'Family_Income': dict(lowMediumHigh),
    'Teacher_Quality': dict(lowMediumHigh),
    'School_Type': {'Public': 0, 'Private': 1},
    'Peer_Influence': {'Negative': -1, 'Neutral': 0, 'Positive': 1},
    'Learning_Disabilities': dict(noYes),
    'Parental_Education_Level': {'High School': 1, 'College': 2, 'Postgraduate': 3},
    'Distance_from_Home': {'Near': 1, 'Moderate': 0, 'Far': -1},
    'Gender': {'Male': 0, 'Female': 1},
}


In [451]:

# Categorical columns are ordinal-encoded with ordinalMap
# (one-hot encoding via pd.get_dummies(..., drop_first=True) is the alternative).

targetColumn = 'Exam_Score'
featureColumns = [col for col in studentDataset.columns if col != targetColumn]


for col, mapping in ordinalMap.items():
    # Already-numeric columns were encoded by a previous run of this cell.
    # Mapping them again would look numbers up in a string-keyed dict and
    # turn the whole column into NaN, so skip them.
    if pd.api.types.is_numeric_dtype(studentDataset[col]):
        continue

    encoded = studentDataset[col].map(mapping)

    # Series.map yields NaN for labels absent from the mapping; surface those
    # here instead of letting NaN flow silently into scaling and fitting.
    unmappedMask = encoded.isna() & studentDataset[col].notna()
    if unmappedMask.any():
        unknown = list(studentDataset.loc[unmappedMask, col].unique())
        raise ValueError(
            f"Column {col!r} contains values missing from ordinalMap: {unknown}"
        )

    studentDataset[col] = encoded

# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split that happens in a later cell.
studentDataset[featureColumns] = scaler.fit_transform(studentDataset[featureColumns])

studentDataset.head(5)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,0.504942,0.348375,-1.562146,1.288574,-1.214685,-0.019796,-0.1438,-1.302866,0.285825,-1.213934,-1.060721,-0.327233,-0.661006,1.07055,0.031411,-0.342867,-0.893742,0.748407,-0.855746,67
1,-0.162822,-1.383736,-1.562146,-0.143488,-1.214685,0.661399,-1.11611,-1.302866,0.285825,0.411451,0.285971,-0.327233,-0.661006,-1.575587,1.001199,-0.342867,0.390223,-0.743665,1.16857,61
2,0.671882,1.560853,-0.124267,-0.143488,0.823259,-0.019796,1.106313,0.134442,0.285825,0.411451,0.285971,-0.327233,-0.661006,-0.252518,1.001199,-0.342867,1.674187,0.748407,-0.855746,74
3,1.506587,0.781403,-1.562146,-0.143488,0.823259,0.661399,1.592469,0.134442,0.285825,-0.401242,0.285971,-0.327233,-0.661006,-1.575587,1.001199,-0.342867,-0.893742,-0.743665,-0.855746,71
4,-0.162822,1.04122,-0.124267,-0.143488,0.823259,-0.70099,-0.699406,0.134442,0.285825,1.224144,0.285971,1.348757,-0.661006,-0.252518,1.001199,-0.342867,0.390223,0.748407,1.16857,70


In [452]:
# Separate the predictors from the target, then hold out 20% of the rows for testing.
X, Y = studentDataset[featureColumns], studentDataset[targetColumn]
XTrain, XTest, YTrain, YTest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [453]:
# Reference fit: scikit-learn's LinearRegression, scored on the held-out rows.
model = LinearRegression()
model.fit(XTrain, YTrain)
YPred = model.predict(XTest)
for label, metric in (('MAE: ', MAE), ('MSE: ', MSE), ('R2S: ', R2)):
    print(label, round(metric(YTest, YPred), 8))

MAE:  0.44428946
MSE:  3.23794591
R2S:  0.77092824


In [454]:
# Hand-written normal-equation fit, scored with the same three metrics.
model = NormalEquation()
model.fit(XTrain, YTrain)
YPred = model.predict(XTest)
for label, metric in (('MAE: ', MAE), ('MSE: ', MSE), ('R2S: ', R2)):
    print(label, round(metric(YTest, YPred), 8))

MAE:  0.44428946
MSE:  3.23794591
R2S:  0.77092824
