In [1]:
# Mount Google Drive into the Colab runtime at /content/drive
# (force_remount=True asks for a fresh mount even if one already exists).
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# IPython magic: switch the working directory to the study folder so the
# relative CSV path used when loading the data resolves.
%cd "/content/drive/MyDrive/GSDS/MLDL_study"

Mounted at /content/drive
/content/drive/MyDrive/GSDS/MLDL_study


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import warnings

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression

# NOTE(review): this installs a blanket "ignore" filter, so every warning
# (including pandas / scikit-learn ones) is hidden from here on.
warnings.filterwarnings("ignore")

# Implementation of Linear Regression

In [3]:
class LinReg(object):
    """Ordinary (optionally weighted) least-squares linear regression.

    Mirrors the core of scikit-learn's ``LinearRegression`` interface.

    Parameters
    ----------
    fit_intercept : bool
        When True a constant column is added to the design matrix and the
        fitted constant is stored in ``intercept_``.
    copy_X : bool
        Kept for API compatibility. Inputs are always converted with
        ``np.array(...)``, which returns a new array, and no in-place
        operation is performed on it, so the caller's data is never modified.

    Attributes
    ----------
    rank_ : int
        Rank of the feature matrix passed to ``fit`` (intercept column excluded).
    coef_ : np.ndarray or None
        Fitted slope coefficients; ``None`` until ``fit`` has been called.
    intercept_ : float
        Fitted constant term; ``0.0`` when ``fit_intercept`` is False.
    """

    def __init__(self, fit_intercept:bool=True, copy_X:bool=True) -> None:
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.rank_ = 0
        self.coef_ = None
        self.intercept_ = 0.0

    @staticmethod
    def _as_float_array(data:object) -> np.ndarray:
        """Convert ``data`` to a float64 array, raising TypeError if impossible."""
        try:
            return np.array(data, dtype=np.float64)
        except (TypeError, ValueError) as exc:
            # Only conversion failures are translated; a bare ``except`` would
            # also swallow KeyboardInterrupt / SystemExit.
            raise TypeError("Input has invalid type.") from exc

    @staticmethod
    def _check_sample_weight(sample_weight:object, n_samples:int) -> np.ndarray:
        """Return ``sample_weight`` as a validated 1-D float array of length ``n_samples``."""
        weights = LinReg._as_float_array(sample_weight)
        if weights.ndim == 0:
            # A scalar weight applies equally to every sample.
            weights = np.full(n_samples, float(weights))
        if weights.shape != (n_samples,):
            raise ValueError("Dimension mismatch.")
        if not np.all(weights >= 0):  # also rejects NaN
            raise ValueError("sample_weight must be non-negative.")
        return weights

    def fit(self, X:object, y:object, sample_weight:object=None) -> object:
        """Estimate the coefficients by (weighted) least squares.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix. A 1-D input is treated as a single feature column.
        y : array-like, shape (n_samples,) or (n_samples, n_targets)
            Target values.
        sample_weight : array-like of shape (n_samples,), scalar, or None
            Non-negative per-sample weights. ``None`` means equal weights.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If an input cannot be converted to a float array.
        ValueError
            If the sample counts of ``X``, ``y`` or ``sample_weight`` disagree,
            or a weight is negative.
        """
        X, y = self._as_float_array(X), self._as_float_array(y)
        if X.ndim == 1:
            X = X[:, np.newaxis]

        if X.shape[0] != y.shape[0]:
            raise ValueError("Dimension mismatch.")

        self.rank_ = int(np.linalg.matrix_rank(X))

        if self.fit_intercept:
            ## prepend a column of ones so the first coefficient is the intercept
            design = np.column_stack([np.ones(X.shape[0]), X])
        else:
            design = X
        target = y

        if sample_weight is not None:
            # Weighted least squares: minimising sum(w_i * r_i**2) is the same
            # as ordinary least squares after scaling every row of the design
            # matrix (intercept column included) and of y by sqrt(w_i).
            root_w = np.sqrt(self._check_sample_weight(sample_weight, X.shape[0]))
            design = design * root_w[:, np.newaxis]
            target = y * root_w.reshape((-1,) + (1,) * (y.ndim - 1))

        # lstsq solves via SVD: it avoids forming and inverting X.T @ X and
        # returns the minimum-norm solution when the design is rank deficient.
        beta = np.linalg.lstsq(design, target, rcond=None)[0]

        if self.fit_intercept:
            self.intercept_ = beta[0]
            self.coef_ = beta[1:]
        else:
            self.intercept_ = 0.0
            self.coef_ = beta

        return self

    def get_params(self) -> dict:
        """Return the constructor parameters as a dict."""
        return {"fit_intercept": self.fit_intercept, "copy_X": self.copy_X}

    def predict(self, X:object) -> np.ndarray:
        """Return ``X @ coef_ + intercept_`` for the given samples.

        Raises
        ------
        TypeError
            If ``X`` cannot be converted to a float array.
        ValueError
            If the model has not been fitted yet.
        """
        X = self._as_float_array(X)
        if self.coef_ is None:
            raise ValueError("Model is not fitted yet; call fit() first.")

        return X@self.coef_ + self.intercept_

    def score(self, X:object, y:object, sample_weight:np.ndarray=None):
        """Return the coefficient of determination R^2 of the prediction.

        R^2 = 1 - SS_res / SS_tot. With ``sample_weight`` both sums of squares
        are weighted per sample and the mean of ``y`` is the weighted mean.

        Raises
        ------
        TypeError
            If an input cannot be converted to a float array.
        ValueError
            If the sample counts of ``X``, ``y`` or ``sample_weight`` disagree.
        """
        X, y = self._as_float_array(X), self._as_float_array(y)

        if X.shape[0] != y.shape[0]:
            raise ValueError("Dimension mismatch.")

        y_hat = self.predict(X)
        residual_sq = (y - y_hat) ** 2

        if sample_weight is None:
            numerator = residual_sq.sum()
            denominator = ((y - y.mean()) ** 2).sum()
        else:
            weights = self._check_sample_weight(sample_weight, y.shape[0])
            # Shape the weights so they broadcast along the sample axis of y.
            weights = weights.reshape((-1,) + (1,) * (y.ndim - 1))
            y_bar = (weights * y).sum() / np.broadcast_to(weights, y.shape).sum()
            numerator = (weights * residual_sq).sum()
            denominator = (weights * (y - y_bar) ** 2).sum()

        return 1 - (numerator / denominator)

# Import & preprocess data

In [4]:
# Read the student-performance table (path is relative to the working
# directory) and preview its first rows.
df = pd.read_csv(
    "./StudentsPerformance.csv",
    engine="python",
)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [6]:
# List the distinct values of every column to see the categorical levels
# and the range of each score.
print("unique values")
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name}\n\t{distinct_values}")

unique values
gender
	['female' 'male']
race/ethnicity
	['group B' 'group C' 'group A' 'group D' 'group E']
parental level of education
	["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
lunch
	['standard' 'free/reduced']
test preparation course
	['none' 'completed']
math score
	[ 72  69  90  47  76  71  88  40  64  38  58  65  78  50  18  46  54  66
  44  74  73  67  70  62  63  56  97  81  75  57  55  53  59  82  77  33
  52   0  79  39  45  60  61  41  49  30  80  42  27  43  68  85  98  87
  51  99  84  91  83  89  22 100  96  94  48  35  34  86  92  37  28  24
  26  95  36  29  32  93  19  23   8]
reading score
	[ 72  90  95  57  78  83  43  64  60  54  52  81  53  75  89  32  42  58
  69  73  71  74  70  65  87  56  61  84  55  44  41  85  59  17  39  80
  37  63  51  49  26  68  45  47  86  34  79  66  67  91 100  76  77  82
  92  93  62  88  50  28  48  46  23  38  94  97  99  31  96  24  29  40]
writing score
	[ 74  8

In [7]:
# Predictors: three categorical attributes plus the math score.
X = df[["gender", "lunch", "test preparation course", "math score"]]
# Target: per-student average of the reading and writing scores.
y = df["reading score"].add(df["writing score"]).div(2)

display(X.head(), y.head())

Unnamed: 0,gender,lunch,test preparation course,math score
0,female,standard,none,72
1,female,standard,completed,69
2,female,standard,none,90
3,male,free/reduced,none,47
4,male,standard,none,76


0    73.0
1    89.0
2    94.0
3    50.5
4    76.5
dtype: float64

In [8]:
# X was sliced out of df, so writing columns into it directly is a chained
# assignment (pandas' SettingWithCopyWarning). Take an explicit copy first so
# the encoding below has well-defined semantics and df itself stays untouched.
X = X.copy()

# Binary-encode every categorical column; "math score" is already numeric.
categorical_cols = [col for col in X.columns if col != "math score"]
for col in categorical_cols:
    encoded = OneHotEncoder().fit_transform(X[[col]]).toarray()
    # Keeping a single indicator column is only lossless for two-level
    # features, so fail loudly if a column ever has a different level count.
    assert encoded.shape[1] == 2, f"{col!r} is expected to have exactly two levels"
    # OneHotEncoder sorts the levels, so column 1 flags the second level in sort order.
    X[col] = encoded[:, 1]

display(X)

Unnamed: 0,gender,lunch,test preparation course,math score
0,0.0,1.0,1.0,72
1,0.0,1.0,0.0,69
2,0.0,1.0,1.0,90
3,1.0,0.0,1.0,47
4,1.0,1.0,1.0,76
...,...,...,...,...
995,0.0,1.0,0.0,88
996,1.0,0.0,1.0,62
997,0.0,0.0,0.0,59
998,0.0,1.0,0.0,68


In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=24790,
)

# Shapes of the training pair, then of the test pair.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

display(X_train)

(800, 4) (800,)
(200, 4) (200,)


Unnamed: 0,gender,lunch,test preparation course,math score
191,1.0,1.0,0.0,76
9,0.0,0.0,1.0,38
996,1.0,0.0,1.0,62
486,1.0,0.0,1.0,61
581,0.0,1.0,1.0,77
...,...,...,...,...
477,1.0,1.0,1.0,80
697,0.0,1.0,1.0,59
237,0.0,1.0,0.0,64
621,1.0,0.0,1.0,62


# Fit & Comparison with Scikit-learn

## With Intercept

In [10]:
# Fit the custom estimator with its default settings (intercept enabled);
# fit() returns the estimator itself.
lr_custom = LinReg()
out = lr_custom.fit(X_train, y_train)
print(out.coef_, out.intercept_, sep="\n")

[-12.62676639  -1.87584229  -3.83106408   0.85124836]
22.07976998325483


In [11]:
# Reference fit with scikit-learn on the same training data.
lr_scikit = LinearRegression()
out_scikit = lr_scikit.fit(X_train, y_train)
print(out_scikit.coef_, out_scikit.intercept_, sep="\n")

[-12.62676639  -1.87584229  -3.83106408   0.85124836]
22.079769983255105


In [12]:
## predict on the held-out rows with both fitted models
y_pred, y_pred_sk = out.predict(X_test), out_scikit.predict(X_test)

print(
    f"Predicted result with custom class: {y_pred}",
    f"Predicted result with scikit-learn: {y_pred_sk}",
    sep="\n",
)

Predicted result with custom class: [ 76.81149682  65.88722714  85.75005103  67.1959328   90.85754116
  75.25095905  68.29901327  58.93528137  76.81149682  69.15026163
  83.79482924  75.10900011  78.94006419  68.44097221  70.85275834
  47.15976333  53.96975017  77.66274518  47.33310891  85.75005103
  68.01579451  71.70400669  79.50720082  94.68771228  83.19630597
  55.84559246  50.42279782  60.38594596  66.59651656  66.3132978
  41.05906591  68.44097221  85.32398037  66.7384755   70.17485556
  81.20969753  41.23241149  68.44097221  48.04239833  66.7384755
  29.45689345  86.74325832  65.74526821  49.7135084   53.2918474
  82.77023531  87.02647708  62.48223372  77.66274518  60.81112365
  66.76986214  62.34027479  57.54808917  49.7135084   54.99434411
  50.99082741  71.70400669  56.52349524  95.11378294  64.18473043
  61.63098537  80.35844918  65.74526821  42.9349082   59.92848866
  95.11378294  92.70199681  54.56916641  59.92848866  90.14825175
  72.27203629  87.45254774  78.51399353  71

In [13]:
## score: R-squared of both models on the test split
print(
    f"R-squared with custom class: {out.score(X_test, y_test)}",
    f"R-squared with scikit-learn: {out_scikit.score(X_test, y_test)}",
    sep="\n",
)

R-squared with custom class: 0.8622048983499487
R-squared with scikit-learn: 0.8622048983499492


In [14]:
## params: constructor settings reported by each estimator
print(
    f"params of the custom class: {out.get_params()}",
    f"params of the scikit-learn: {out_scikit.get_params()}",
    sep="\n",
)

params of the custom class: {'fit_intercept': True, 'copy_X': True}
params of the scikit-learn: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': 'deprecated', 'positive': False}


## Without Intercept

In [15]:
# Fit the custom estimator without an intercept term.
lr_custom_noC = LinReg(fit_intercept=False)
out_noC = lr_custom_noC.fit(X_train, y_train)
print(out_noC.coef_, out_noC.intercept_, sep="\n")

[-12.39765014  -2.6002726    0.38083811   1.13691153]
0.0


In [16]:
# Reference scikit-learn fit without an intercept term.
lr_scikit_noC = LinearRegression(fit_intercept=False)
out_scikit_noC = lr_scikit_noC.fit(X_train, y_train)
print(out_scikit_noC.coef_, out_scikit_noC.intercept_, sep="\n")

[-12.39765014  -2.6002726    0.38083811   1.13691153]
0.0


In [17]:
## predict on the held-out rows with both no-intercept models
y_pred_noC, y_pred_sk_noC = out_noC.predict(X_test), out_scikit_noC.predict(X_test)

print(
    f"Predicted result with custom class: {y_pred_noC}",
    f"Predicted result with scikit-learn: {y_pred_sk_noC}",
    sep="\n",
)

Predicted result with custom class: [ 78.50128424  68.37745717  84.94191532  60.25631117  91.76338451
  80.88348401  67.13216893  54.62614208  78.50128424  68.26908046
  87.92302603  76.22746118  75.84662307  71.78819176  70.54290352
  43.36540347  52.46069573  79.63819577  43.69185301  84.94191532
  65.722796    71.67981505  86.56804167 102.3764264   81.53118073
  55.06096833  43.25702677  51.16101892  64.85834586  63.44897293
  30.75099992  71.78819176  89.87039956  69.5143687   69.73244153
  88.84186474  31.07744946  71.78819176  40.17274171  69.5143687
  19.81671085  90.73484969  63.72143433  46.77613807  51.65023373
  86.45966496  92.14422262  63.82981104  79.63819577  57.22641468
  65.1847954   59.17378821  57.33479139  46.77613807  53.9240568
  42.98456537  71.67981505  55.87143032  97.44794217  66.10363411
  62.69289951  87.7049532   63.72143433  33.35127252  60.41907645
  97.44794217  98.69323041  47.85866103  60.41907645  95.28249582
  71.40735366  87.21573839  80.77510731  7

In [18]:
## score: R-squared of both no-intercept models on the test split
print(
    f"R-squared with custom class: {out_noC.score(X_test, y_test)}",
    f"R-squared with scikit-learn: {out_scikit_noC.score(X_test, y_test)}",
    sep="\n",
)

R-squared with custom class: 0.8083094130735747
R-squared with scikit-learn: 0.8083094130735746


In [19]:
## params: constructor settings reported by each no-intercept estimator
print(
    f"params of the custom class: {out_noC.get_params()}",
    f"params of the scikit-learn: {out_scikit_noC.get_params()}",
    sep="\n",
)

params of the custom class: {'fit_intercept': False, 'copy_X': True}
params of the scikit-learn: {'copy_X': True, 'fit_intercept': False, 'n_jobs': None, 'normalize': 'deprecated', 'positive': False}
