# Multiple Linear Regression for Startups: NumPy arrays version

## Importing the libraries

In [50]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

To make things simple, we usually structure the dataset so that the target variable column is the last column in the table

*  **X typically denotes the feature variables**, which are all the columns in the table except the last one
* **y typically denotes the single target variable**, which is the last column in the table

In [51]:
# Read the startup records from the CSV file into a DataFrame.
dataset = pd.read_csv('Startups.csv')

# Split by position: every column except the last is a feature (X),
# and the last column is the target (y). Both are taken as NumPy arrays.
X, y = (
    dataset.iloc[:, :-1].values,
    dataset.iloc[:, -1].values,
)

In [52]:
# Show a heading, the raw feature matrix, and its Python type, one per line.
print(
    "Feature variables of the entire dataset",
    X,
    type(X),
    sep="\n",
)

Feature variables of the entire dataset
[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 281574.31 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [7

In [53]:
# Show a heading, the target vector, and its Python type, one per line.
print(
    "Target variable of the entire dataset",
    y,
    type(y),
    sep="\n",
)

Target variable of the entire dataset
[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51
 155752.6  152211.77 149759.96 146121.95 144259.4  141585.52 134307.35
 132602.65 129917.04 126992.93 125370.37 124266.9  122776.86 118474.03
 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31
 103282.38 101004.64  99937.59  97483.56  97427.84  96778.92  96712.8
  96479.51  90708.19  89949.14  81229.06  81005.76  78239.91  77798.83
  71498.49  69758.98  65200.33  64926.08  49490.75  42559.73  35673.41
  14681.4 ]
<class 'numpy.ndarray'>


## Perform dummy encoding on categorical variables in dataset

Here, there is only one categorical variable column, State.

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot (dummy) encode the State column, which sits at index 3 of the raw
# feature matrix. drop='first' omits the first category so the remaining dummy
# columns are not perfectly collinear. The other columns pass through
# unchanged and are placed after the dummy columns in the result.
ct = ColumnTransformer(
    transformers=[
        ('encoder',
         OneHotEncoder(categories='auto', drop='first', sparse_output=False),
         [3]),
    ],
    remainder='passthrough',
)

# Encode from the raw feature columns of `dataset` rather than from X itself.
# X is overwritten here, so feeding X back in would make a second run of this
# cell try to encode the already-encoded matrix.
X = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))

In [55]:
# Show the feature matrix after the State column was replaced by dummy columns.
print(
    "Feature variables after dummy encoding on the State column",
    X,
    sep="\n",
)

Feature variables after dummy encoding on the State column
[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 93863.75 127320.38 249839.44]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 1.0 86419.7 153514.11 281574.31]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 67532.5

## Splitting original dataset into the Training set and Test set


In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [57]:
# Report the training split: row count, then the features, then the targets.
print(
    f"There are {len(X_train)} rows in the training dataset\n",
    "The feature variable values are",
    X_train,
    "\nThe target variable values are",
    y_train,
    sep="\n",
)

There are 40 rows in the training dataset

The feature variable values are
[[1.0 0.0 55493.95 103057.49 214634.81]
 [0.0 1.0 46014.02 85047.44 205517.64]
 [1.0 0.0 75328.87 144135.98 134050.07]
 [0.0 0.0 46426.07 157693.92 210797.67]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 1000.23 124153.04 1903.93]
 [0.0 1.0 542.05 51743.15 165265.1]
 [0.0 1.0 65605.48 153032.06 107138.38]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 61994.48 115641.28 91131.24]
 [0.0 0.0 63408.86 129219.61 46085.25]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 23640.93 96189.63 148001.11]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 15505.73 127382.3 35534.17]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 0.0 64664.71 139553.16 137962.62]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 0.0 28754.33 118546.05 172795.67]
 [0.0 0.0 59808.

In [58]:
# Report the test split: row count, then the features, then the targets.
print(
    f"There are {len(X_test)} rows in the test dataset\n",
    "The feature variable values are",
    X_test,
    "\nThe target variable values are",
    y_test,
    sep="\n",
)

There are 10 rows in the test dataset

The feature variable values are
[[1.0 0.0 66051.52 182645.56 118148.2]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 27892.92 84710.77 164470.71]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 72107.6 127864.55 353183.81]
 [0.0 1.0 20229.59 65947.93 185265.1]
 [0.0 1.0 61136.38 152701.92 88218.23]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 142107.34 91391.77 366168.42]]

The target variable values are
[103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06
  97483.56 110352.25 166187.94]


## Training the Multiple Linear Regression model using the training dataset

In [59]:
from sklearn.linear_model import LinearRegression
# Create an ordinary least squares linear regression model with default settings.
regressor = LinearRegression()
# Fit the model on the training split only; the test split stays unseen
# so it can be used for evaluation afterwards.
regressor.fit(X_train, y_train)

## Generating predictions using the test dataset

We generate predictions and then compare these visually with the actual values from the test dataset by printing them out

For a real-life ML project, this step is not necessary. We would only need to check the regression metrics (next cell) to 
evaluate how accurate this model is


In [60]:
# Predict the target for every row of the held-out test features.
y_pred = regressor.predict(X_test)

# Print floats with two decimals. This changes NumPy's global print options,
# so it also affects arrays printed by later cells.
np.set_printoptions(precision=2)

# Side-by-side table: left column = predicted value, right column = actual value.
print(np.column_stack((y_pred, y_test)))

[[101624.21 103282.38]
 [126931.06 144259.4 ]
 [128394.7  146121.95]
 [ 70903.06  77798.83]
 [176773.07 191050.39]
 [121351.27 105008.31]
 [ 67273.41  81229.06]
 [ 94761.86  97483.56]
 [116616.43 110352.25]
 [164883.45 166187.94]]


## Common metrics for model evaluation in regression
* R Squared 
* Mean Square Error(MSE) / Root Mean Square Error(RMSE)
* Mean Absolute Error(MAE)

When we evaluate metrics, we use the target variable (Profit Column) in test dataset as the ground truth
and compare that with the predictions generated by the model for the feature variables in the test dataset

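R Squared is at most 1 (perfect accuracy). A value of 0 means the model is no better than always predicting the mean (totally useless), and it can even be negative on test data when the model does worse than that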

RMSE and MAE are both expressed in the same units as the target variable (Profit column)
Here, RMSE / MAE are around 10,000 while Profit values are mostly around 100,000

Therefore, RMSE / MAE will have a different range of values, depending on the magnitude of the target variable column
In other words, they are dataset specific

They are good for **comparing between different models trained on the same dataset**

Typically used in  data science competitions such as Kaggle !



R Squared, on the other hand, is unitless and has a maximum of 1 (typically falling between 0 and 1 for a reasonable model), regardless of the magnitude of the target variable column

It is good for **assessing the accuracy of the model as a whole, and is therefore a more popular metric**

It is also easier to explain and justify to non data-scientists ! 

 


In [61]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the test-set predictions against the actual target values.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
mae = mean_absolute_error(y_test, y_pred)

print("R2 score : ", r2)
print("MSE: ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)

R2 score :  0.8921823057957278
MSE:  137887142.69482076
RMSE :  11742.535616076315
MAE :  9847.582188257618


## Using the model to generate predictions

Assuming we are happy with the metrics for the model, we are now ready to use the model by providing it some values for the feature variables and generating a prediction for the target variable

For example, assume we want to predict the profit for a company in New York with the following spending


RnD: 165000


Admin: 135000


Marketing: 475000


To encode New York, we use State_Florida = 0 and State_New York = 1

In [62]:
# The input we want a prediction for must have the same format as the
# training dataset that was used to train the model: the State dummy columns
# first, followed by the RnD, Admin and Marketing values.
#
# Instead of hand-coding the dummy values, describe the company in the raw
# column order (RnD, Admin, Marketing, State) and run it through the already
# fitted encoder `ct`, so the encoding always matches the one used in training.
# dtype=object keeps the numbers numeric instead of coercing them to strings.
raw_sample = np.array([[165000, 135000, 475000, 'New York']], dtype=object)

# This is a 2D NumPy array equivalent to [[0, 1, 165000, 135000, 475000]]
sample_to_predict = np.array(ct.transform(raw_sample))
profit_pred = regressor.predict(sample_to_predict)
print ("The predicted target variable value (profit) for that sample data point is ", profit_pred)

The predicted target variable value (profit) for that sample data point is  [194529.35]
