# Multiple Linear Regression from scratch

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [16]:
#dataset = pd.read_csv('50_Startups.csv')
dataset = pd.read_csv('https://raw.githubusercontent.com/tanvipenumudy/ML-Stream-Interns-Summer-21/main/Kunal%20Chhikara/Task%209/50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [17]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [18]:
dataset.shape

(50, 5)

In [19]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
R&D Spend,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
Administration,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
Marketing Spend,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
Profit,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


In [27]:
dataset['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [32]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values.reshape(50,1)

In [33]:
print(X.shape)
print(y.shape)

(50, 4)
(50, 1)


## Encoding categorical data

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [35]:
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

##  In linear regression we make our predictions by this equation :
### y_pred = θ n x n + θ n−1 x n−1 + θ n−2 x n−2 +...+ θ 2 x 2 + θ1 x 1 + θ 0

### We can write above equation as the matrix multiplication of theta and X
### y_pred = matrix_mul(X,theta)

In [36]:
# Now, to do a proper matrix multiplication of X and theta, we will need to add a column of 1s before all the features of X. 
# The reason for doing so is that we are multiplying θ2 with x2 , θ1 with x1 and there is no X0 to multiple with θ0 . 
# So we will add 1 at the place of X0.

### Adding a column of ones at the beginning

In [39]:
ones = np.ones((50,1))
X = np.concatenate((ones,X),axis=1)

In [40]:
print(X)

[[1.0 0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [1.0 0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [1.0 0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [1.0 0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [1.0 0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [1.0 0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [1.0 0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 1.0 0.0 0.0 76253.86 113867.3 29866

# _________________

# Linear regression overview

### Straight line
y_pred = θ n x n + θ n−1 x n−1 + θ n−2 x n−2 +...+ θ 2 x 2 + θ1 x 1 + θ 0

y_pred = matrix_mul(X,theta)

### Cost Function

cost = 1/2n * (Σ[(y_pred - y)^2])

### Gradient Descent

The derivarive of theta: d_theta = 1/n * [matrix_mul( Xᵀ, y_pred-y )]

theta = theta - α * d_theta   ;   where α is the learning rate defined for the model

# _________________________________________________

# Splitting the dataset into the Training set and Test set

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# _____________________________________________________________

# Preparing the multiple variable linear regression model

In [57]:
def model(X, Y, learning_rate, iteration):
    m = Y.size
    theta = np.zeros((X.shape[1], 1))
    cost_list = []
    
    for i in range(iteration):
        y_pred = np.dot(X, theta)
        cost = (1/(2*m))*np.sum(np.square(y_pred - Y))
        d_theta = (1/m)*np.dot(X.T, y_pred - Y)
        theta = theta - learning_rate*d_theta
        cost_list.append(cost)
    
    return theta, cost_list

# Calling the model

In [63]:
iteration = 1000
learning_rate = 0.01
theta, cost_list = model(X_train, y_train, learning_rate = learning_rate, iteration = iteration)

  cost = (1/(2*m))*np.sum(np.square(y_pred - Y))


In [64]:
print(cost_list)

[6807997862.101886, 3.557054579067899e+31, 1.9822266472205513e+53, 1.1047248601999978e+75, 6.156799899610342e+96, 3.4312783563195094e+118, 1.912303688692013e+140, 1.0657559714005828e+162, 5.9396203505359305e+183, 3.3102408858322327e+205, 1.8448476629060382e+227, 1.0281617008286707e+249, 5.730101754774243e+270, 3.1934729813027935e+292, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
