### Multiple Linear Regression from scratch 

### Problem Statement:
Using cars.csv data create a algorithm to predict MPG value using ['Weight','Horsepower','Displacement'] as predictors from scratch

In [7]:
#libraries
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm

#reading file
os.chdir(r'C:\Users\shameel\Desktop\Praxis')
cars = pd.read_csv("cars.csv")
#removing anomalies
cars2 = cars.loc[(cars.MPG != 0), ]
#assigning predictors and target
predictors = cars2[['Weight','Horsepower','Displacement']]
target = cars2["MPG"]

In [3]:
b = np.array(target)         

In [8]:
# converting dataframe to array and adding a constant column in predictors
predictors = sm.add_constant(predictors)        #adding constant column of 1 
a = np.array(predictors)                         # assigning perdictors matrix
a

array([[1.000e+00, 3.504e+03, 1.300e+02, 3.070e+02],
       [1.000e+00, 3.693e+03, 1.650e+02, 3.500e+02],
       [1.000e+00, 3.436e+03, 1.500e+02, 3.180e+02],
       ...,
       [1.000e+00, 2.295e+03, 8.400e+01, 1.350e+02],
       [1.000e+00, 2.625e+03, 7.900e+01, 1.200e+02],
       [1.000e+00, 2.720e+03, 8.200e+01, 1.190e+02]])

In [2]:
#the formula to find the regression parameter using matrix method
# b = (xT*x)^-1*xTy

In [9]:
first = np.matmul(np.transpose(a),a)              #finding multiplication predictors and transpose of predictors

In [10]:
firstinv = np.linalg.inv(np.matmul(np.transpose(a),a))  #finding the inverse of first

In [11]:
firstinv                                                

array([[ 7.63198280e-02, -3.92722252e-05, -1.59794249e-04,
         3.06524983e-04],
       [-3.92722252e-05,  2.75897678e-08, -5.96109560e-08,
        -1.88948153e-07],
       [-1.59794249e-04, -5.96109560e-08,  6.60783367e-06,
        -1.77352410e-06],
       [ 3.06524983e-04, -1.88948153e-07, -1.77352410e-06,
         2.26038568e-06]])

In [11]:
second = np.matmul(b,a)                            # #finding multiplication predictors and target

In [12]:
second

array([9.35880000e+03, 2.56140409e+07, 8.68718800e+05, 1.55003940e+06])

In [13]:
#final b vector formula
final = np.matmul(second,firstinv)                    

In [14]:
#parameters for x and y
final

array([ 4.46511571e+01, -5.51769938e-03, -3.10427759e-02, -8.02655853e-03])

In [15]:
model = stm.OLS(target,predictors).fit()         #finding all the parameters using fuction 

In [16]:
model.params

const           44.651157
Weight          -0.005518
Horsepower      -0.031043
Displacement    -0.008027
dtype: float64

We can say that the parameters found from scratch and the parameters found using the function is same.

In [13]:
#function to predict MPG 
def predict(a,b,c):
    y = 4.46511571e+01 - 5.51769938e-03 * a - 3.10427759e-02 * b - 8.02655853e-03 * c 
    return y
    

In [147]:
predictors

Unnamed: 0,const,Weight,Horsepower,Displacement
0,1.0,3504,130,307.0
1,1.0,3693,165,350.0
2,1.0,3436,150,318.0
3,1.0,3433,150,304.0
4,1.0,3449,140,302.0
...,...,...,...,...
401,1.0,2790,86,140.0
402,1.0,2130,52,97.0
403,1.0,2295,84,135.0
404,1.0,2625,79,120.0


In [16]:
predictors['actual'] = target

In [14]:
#predicting the MPG values using predictors
final = predict(cars2.Weight,cars2.Horsepower,cars2.Displacement)
final

0      18.817424
1      16.342940
2      18.483480
3      18.612405
4      18.850603
         ...    
401    25.463379
402    30.505657
403    28.296858
404    26.751630
405    26.142347
Length: 398, dtype: float64

In [17]:
predictors['predicted_MPG'] = final

In [18]:
#compare the predicted and actual values of MPG
predictors

Unnamed: 0,const,Weight,Horsepower,Displacement,actual,predicted_MPG
0,1.0,3504,130,307.0,18.0,18.817424
1,1.0,3693,165,350.0,15.0,16.342940
2,1.0,3436,150,318.0,18.0,18.483480
3,1.0,3433,150,304.0,16.0,18.612405
4,1.0,3449,140,302.0,17.0,18.850603
...,...,...,...,...,...,...
401,1.0,2790,86,140.0,27.0,25.463379
402,1.0,2130,52,97.0,44.0,30.505657
403,1.0,2295,84,135.0,32.0,28.296858
404,1.0,2625,79,120.0,28.0,26.751630


In [20]:
error = np.mean((predictors['actual'] - predictors['predicted_MPG'])**2)
print("RMSE error :",error)

RMSE error : 18.037721140526408
