## Multiple Linear Regression From Scratch
We often wonder how the regression coefficients are calculated using the 'fit' function in 'sklearn' or 'OLS' in                'statsmodels'. This notebook finds the coefficients from (XT.X)-1.XT.Y and compares them with the ones obtained using          sklearn, statsmodels libraries in Python.
   
Note: The models below are not optimized for performance i.e. this code is only intended to demo how to find the regression coefficients without the using built in functions  


In [3]:
# Import all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from statsmodels.regression.linear_model import OLS
from numpy.linalg import inv

In [23]:
# Import the dataset
# This dataset contains GPAs of 100 students and other variables such as college, work hours etc.. 
data=pd.read_csv("D:/Excel sheets/Freshmen1.csv")
# Display top 5 records
df.head()

Unnamed: 0,GPA,Miles from Home,College,Accommodations,Years Off,Part-Time Work Hours,Attends Office Hours,High School GPA,Unnamed: 8
0,0.73,253,Social Sciences,Dorm,4,35,Sometimes,3.23,
1,1.6,143,Social Sciences,Dorm,5,30,Never,2.35,
2,2.17,171,Social Sciences,Dorm,0,25,Never,3.95,
3,1.02,332,Sciences,Off-campus,5,30,Sometimes,3.44,
4,3.14,112,Business,Dorm,0,25,Sometimes,3.2,


In [8]:
# Check if there are any null values in the dataframe
data.isnull().any()
# There are no null values in the data as shown below

GPA                     False
Miles from Home         False
College                 False
Accommodations          False
Years Off               False
Part-Time Work Hours    False
Attends Office Hours    False
High School GPA         False
Unnamed: 8               True
dtype: bool

In [24]:
# Cross check the data types of all the columns before modeling
data.dtypes
# 'College','Accommodations' and 'Attends Office Hours' need to be changed to categorical

GPA                     float64
Miles from Home           int64
College                  object
Accommodations           object
Years Off                 int64
Part-Time Work Hours      int64
Attends Office Hours     object
High School GPA         float64
dtype: object

In [26]:
# Change the data types of the categorical variables accordingly
data.College=data.College.astype('category')
data.Accommodations=data.Accommodations.astype('category')
data['Attends Office Hours']=data['Attends Office Hours'].astype('category')

In [28]:
data.dtypes

GPA                      float64
Miles from Home            int64
College                 category
Accommodations          category
Years Off                  int64
Part-Time Work Hours       int64
Attends Office Hours    category
High School GPA          float64
dtype: object

In [27]:
# Generate dummy values of the categorical variables and drop one (i.e. n-1 dummies for n categories)
data_dummies=pd.get_dummies(data,drop_first=True)
# The equation of a straight line is y=c+mx where c is the intercept. 
# When coverted to the matrix form the coefficient of the intercept needs to be 1. 
# Hence sticking 100 (length of the dataframe) ones to the dataframe
data_dummies.insert(1,'Intercept',[1]*len(data_dummies))
# Display top 5 records
data_dummies.head()

Unnamed: 0,GPA,Intercept,Miles from Home,Years Off,Part-Time Work Hours,High School GPA,College_Engineering,College_Liberal Arts,College_Sciences,College_Social Sciences,Accommodations_Off-campus,Accommodations_Other,Attends Office Hours_Regularly,Attends Office Hours_Sometimes
0,0.73,1,253,4,35,3.23,0,0,0,1,0,0,0,1
1,1.6,1,143,5,30,2.35,0,0,0,1,0,0,0,0
2,2.17,1,171,0,25,3.95,0,0,0,1,0,0,0,0
3,1.02,1,332,5,30,3.44,0,0,1,0,1,0,0,1
4,3.14,1,112,0,25,3.2,0,0,0,0,0,0,0,1


In [29]:
# Split the whole data into X and Y sets
X_train=data_dummies.iloc[:,1:]
Y_train=data_dummies.GPA

In [30]:
# Display top 5 records from train data 
X_train.head()

Unnamed: 0,Intercept,Miles from Home,Years Off,Part-Time Work Hours,High School GPA,College_Engineering,College_Liberal Arts,College_Sciences,College_Social Sciences,Accommodations_Off-campus,Accommodations_Other,Attends Office Hours_Regularly,Attends Office Hours_Sometimes
0,1,253,4,35,3.23,0,0,0,1,0,0,0,1
1,1,143,5,30,2.35,0,0,0,1,0,0,0,0
2,1,171,0,25,3.95,0,0,0,1,0,0,0,0
3,1,332,5,30,3.44,0,0,1,0,1,0,0,1
4,1,112,0,25,3.2,0,0,0,0,0,0,0,1


In [31]:
# fit sklearn model
model_sklearn=linear_model.LinearRegression().fit(X_train,Y_train)

In [32]:
# Coefficients from the sklearn model
model_sklearn.coef_

array([ 0.        , -0.00104171,  0.03850818, -0.00081253,  0.23876306,
       -0.12119174,  0.0338269 , -0.20859109,  0.10459972, -0.24059864,
       -0.30758173,  0.741165  ,  0.13650608])

In [33]:
# Intercept from the sklearn model
model_sklearn.intercept_

1.5950254174249923

In [34]:
# fit statsmodels model
model_statsmodels=OLS(Y_train,X_train).fit()
# Intercept and coefficients from OLS
model_statsmodels.params

Intercept                         1.595025
Miles from Home                  -0.001042
Years Off                         0.038508
Part-Time Work Hours             -0.000813
High School GPA                   0.238763
College_Engineering              -0.121192
College_Liberal Arts              0.033827
College_Sciences                 -0.208591
College_Social Sciences           0.104600
Accommodations_Off-campus        -0.240599
Accommodations_Other             -0.307582
Attends Office Hours_Regularly    0.741165
Attends Office Hours_Sometimes    0.136506
dtype: float64

### Model without using built in functions

In [35]:
# Convert X and Model without using built in functionsY train sets to matrices
A=np.array(X_train)
b=np.array(Y_train)

In [36]:
# Calculate coefficients from (XT.X)-1.XT.Y
coefficients=inv(A.transpose().dot(A)).dot(A.transpose()).dot(b)

In [37]:
# Coefficients from the equation
coefficients

array([ 1.59502542e+00, -1.04171269e-03,  3.85081837e-02, -8.12525712e-04,
        2.38763063e-01, -1.21191742e-01,  3.38268965e-02, -2.08591086e-01,
        1.04599722e-01, -2.40598637e-01, -3.07581733e-01,  7.41165003e-01,
        1.36506079e-01])

In [38]:
# Display all three functions' coefficients together
model_sklearn.coef_[0]=model_sklearn.intercept_
allcoeficients=pd.DataFrame({'Sklearn':model_sklearn.coef_,'OLS':model_statsmodels.params,
                            'Equation':coefficients},index=model_statsmodels.params.index)

In [39]:
# Coefficients from all the three models
allcoeficients
# All of them are exactly equal as you can see below

Unnamed: 0,Sklearn,OLS,Equation
Intercept,1.595025,1.595025,1.595025
Miles from Home,-0.001042,-0.001042,-0.001042
Years Off,0.038508,0.038508,0.038508
Part-Time Work Hours,-0.000813,-0.000813,-0.000813
High School GPA,0.238763,0.238763,0.238763
College_Engineering,-0.121192,-0.121192,-0.121192
College_Liberal Arts,0.033827,0.033827,0.033827
College_Sciences,-0.208591,-0.208591,-0.208591
College_Social Sciences,0.1046,0.1046,0.1046
Accommodations_Off-campus,-0.240599,-0.240599,-0.240599
