# Multiple Linear Regression for Startups with Feature Scaling

## Importing the libraries

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

To make things simple, we usually structure the dataset so that the target variable column is the last column in the table

*  **X typically denotes the feature variables**, which are all the columns in the table except the last one
* **y typically denotes the single target variable**, which is the last column in the table

In [20]:
# Read the raw startup table from the CSV file.
dataset = pd.read_csv('Startups.csv')

# Column-wise split: every column except the last is a feature (X),
# and the last column is the target (y). Both are taken as NumPy arrays.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [21]:
# Caption and feature matrix, each on its own line.
print("Feature variables of the entire dataset", X, sep="\n")

Feature variables of the entire dataset
[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.5

## Encoding categorical data

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Dummy-encode the state column (index 3) and pass every other column through
# unchanged. drop='first' omits one category, so the state column becomes
# 2 indicator columns, placed ahead of the passthrough columns.
# Plain one-hot encoding (OneHotEncoder() without `drop`) would instead keep
# one indicator column per category; dummy encoding is the preferred approach here.
ct = ColumnTransformer(
    transformers=[
        ('encoder',
         OneHotEncoder(categories='auto', drop='first', sparse_output=False),
         [3]),
    ],
    remainder='passthrough',
)

# Encode from the raw feature columns of `dataset` instead of from X itself.
# The original `X = f(X)` form was not idempotent: re-running the cell would
# try to encode column 3 of the already-encoded array.
X = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))

In [23]:
# Caption and encoded feature matrix, each on its own line.
print("Dataset with the state categorical column dummy encoded into 2 columns", X, sep="\n")

Dataset with the state categorical column dummy encoded into 2 columns
[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 93863.75 127320.38 249839.44]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 1.0 86419.7 153514.11 0.0]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 6

## Splitting the dataset into the Training set and Test set

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [25]:
# Training-set feature matrix produced by the split.
print(X_train)

[[1.0 0.0 55493.95 103057.49 214634.81]
 [0.0 1.0 46014.02 85047.44 205517.64]
 [1.0 0.0 75328.87 144135.98 134050.07]
 [0.0 0.0 46426.07 157693.92 210797.67]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 1000.23 124153.04 1903.93]
 [0.0 1.0 542.05 51743.15 0.0]
 [0.0 1.0 65605.48 153032.06 107138.38]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 61994.48 115641.28 91131.24]
 [0.0 0.0 63408.86 129219.61 46085.25]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 23640.93 96189.63 148001.11]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 15505.73 127382.3 35534.17]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 0.0 64664.71 139553.16 137962.62]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 0.0 28754.33 118546.05 172795.67]
 [0.0 0.0 0.0 116983.8 45173.06]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 93863.75 1273

## Feature scaling on both the training and test data set

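Remember that this must be done after the training and test split, and that the scaler must be fitted on the training set only (the test set is then transformed with that same fitted scaler), in order to avoid data leakage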

Notice that we also only scale the training and test data set, but not the target variable y



In [26]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only. Only the last
# 3 columns are scaled, since the first 2 columns are categorical column data.
sc_X = StandardScaler()
sc_X.fit(X_train[:, -3:])

# Training set: work on a copy so X_train itself stays untouched.
X_train_scaled = X_train.copy()
X_train_scaled[:, -3:] = sc_X.transform(X_train[:, -3:])

# Test set: reuse the scaler fitted on the training set (no re-fitting).
X_test_scaled = X_test.copy()
X_test_scaled[:, -3:] = sc_X.transform(X_test[:, -3:])


In [27]:
# Training features after standardizing the last 3 columns.
print(X_train_scaled)

[[1.0 0.0 -0.3500645436227844 -0.7854710924793271 0.1011968019362538]
 [0.0 1.0 -0.555303187426314 -1.481174262628151 0.02734979174277092]
 [1.0 0.0 0.07935762307586282 0.8013338146656704 -0.551521323997471]
 [0.0 0.0 -0.5463823849331263 1.3250581707161837 0.07011683779235604]
 [1.0 0.0 0.4348537132854595 -0.3559866348200946 0.7514851578736048]
 [1.0 0.0 1.2694314288195354 0.855185185174973 0.986031184474382]
 [1.0 0.0 1.045250070905001 1.2807704710942711 0.44039999942558483]
 [0.0 1.0 -1.529843000700978 0.029420649133439825 -1.6218751012780783]
 [0.0 1.0 -1.5397625087372082 -2.767672641730695 -1.6372965026688253]
 [0.0 1.0 -0.13115188245559178 1.144977005989267 -0.7694999122824163]
 [0.0 1.0 0.927916133722245 -0.029920619212990152 0.4830316172654914]
 [1.0 0.0 -0.20932933131592557 -0.29937679537926 -0.8991541175619614]
 [0.0 0.0 -0.17870827894287686 0.2251351979981532 -1.2640164249123271]
 [0.0 0.0 0.13747089788308053 -0.06929437355020689 0.5038466577007928]
 [0.0 0.0 -1.0396762417942

In [28]:
# Test features after applying the scaler fitted on the training set.
print(X_test_scaled)

[[1.0 0.0 -0.12149520306216502 2.2889053032012234 -0.6803228661410794]
 [0.0 0.0 0.6280306007656555 -1.2206949887864658 0.38557773677354845]
 [1.0 0.0 0.6549006077802048 -0.49434194923946795 0.21885524294941494]
 [1.0 0.0 -0.947621483893497 -1.494179356244993 -0.30512103616996794]
 [1.0 0.0 1.770481112122021 -0.8593266741758989 1.666880795073106]
 [0.0 1.0 0.009617753042540427 0.17279111579172604 1.2234122937277723]
 [0.0 1.0 -1.1135310904221214 -2.218961756571209 -0.13669119113063422]
 [0.0 1.0 -0.22790702970059823 1.1322241571857976 -0.922748838200462]
 [1.0 0.0 0.0504700706558041 -0.023511746780132174 0.819520742715403]
 [1.0 0.0 1.5250985323520592 -1.2361016229924873 1.3285846911963288]]


## Training the Multiple Linear Regression model 

Here we use the scaled training dataset to fit the model 

In [29]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training features;
# the target y_train is used as-is (it was not scaled).
regressor = LinearRegression()
regressor.fit(X=X_train_scaled, y=y_train)

## Generating predictions using the scaled test dataset

Note that here we still get the same predicted results as in the case of the model that did not use scaled
training data

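This is because ordinary least squares compensates for any linear rescaling of the features: the fitted coefficients and intercept change to absorb the scaling. Since the model was also trained using the original (unscaled) target variable values in both cases, the predictions come out with the same values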

So even though this model is a completely different function internally than the original model for the case of normal, unscaled training data, it still produces the same output given the scaled test data as input.


 


In [30]:
# Predict the target for the held-out rows from their scaled features.
y_pred_scaled = regressor.predict(X_test_scaled)

# Show predictions (left column) next to the actual target values (right column).
# The 2-decimal precision is scoped to this print via the context manager, so it
# does not change how arrays are printed by any other cell.
with np.printoptions(precision=2):
    print(np.column_stack((y_pred_scaled, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


## Common metrics for model evaluation in regression
* R-squared (R²)
* Mean Squared Error (MSE) / Root Mean Squared Error (RMSE)
* Mean Absolute Error (MAE)

Notice that MSE / RMSE / MAE are still identical to the values of the model where we did not feature scale the training / test dataset

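This is because MSE / RMSE / MAE are computed from the differences between the actual and predicted values of the target variable (y). We did not scale y, and the predictions are the same as before, so the errors are unchanged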


In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Compute each metric once on the held-out test set.
r2 = r2_score(y_test, y_pred_scaled)
mse = mean_squared_error(y_test, y_pred_scaled)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
mae = mean_absolute_error(y_test, y_pred_scaled)

# Print one "label value" line per metric.
for label, value in (
    ("R2 score : ", r2),
    ("MSE: ", mse),
    ("RMSE : ", rmse),
    ("MAE : ", mae),
):
    print(label, value)

R2 score :  0.9347068473282423
MSE:  83502864.03257768
RMSE :  9137.99015279496
MAE :  7514.293659640616
