# Backward Elimination Example for Model Selection
Normally you do not need to do this by hand when using sklearn, but working through it step by step helps you understand the manual process.

Step 1: Select a significance level, e.g. SL = 0.05 (95% confidence)

Step 2: Fit the full model with all possible predictors

Step 3: Consider the predictor with the HIGHEST P-value. If P-value > SL, go to step 4, otherwise go to FINISH (model is ready)

Step 4: Remove the predictor

Step 5: Refit the model without this variable, then return to Step 3 and repeat until no remaining predictor has a P-value above SL

(link belongs to SuperDataScience)
https://www.dropbox.com/sh/pknk0g9yu4z06u7/AADSTzieYEMfs1HHxKHt9j1ba?dl=0

# Multiple Linear Regression

# Import the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import the Dataset

In [2]:
# Load the startup data: every column but the last is a feature, the last
# column is the target.
dataset = pd.read_csv('50_Startups.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

# Encoding Categorical Data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 3 (the state name) and pass the numeric columns
# through unchanged; the encoded dummy columns come first in the result.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

# Avoiding the Dummy Variable Trap

In [4]:
# Drop one dummy column by hand (column 0) to avoid the dummy variable trap.
# Sklearn's linear regression should handle this for us, but it is done
# manually here to demonstrate how.
# NOTE: re-running this cell on its own removes another column each time.
X = np.delete(X, 0, axis=1)
# The two dummy columns that remain are FL and NY
print(X)

[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 93863.75 127320.38 249839.44]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 1.0 86419.7 153514.11 0.0]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 67532.53 105751.03 304768.73]
 [0.0 1.0 77044.01 99281.34 140574.81]
 [0

# Splitting the data into training and testing data

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fitting the multiple linear regression to the training set

In [6]:
from sklearn.linear_model import LinearRegression
# Fit a multiple linear regression on the training split. The fit call is
# left as the last expression so the cell still displays the fitted estimator.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Predicting the Test set results

In [7]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
# Side-by-side comparison: predicted values (left column) against the
# actual test targets (right column).
print(np.column_stack((y_pred, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


# Building the optimal model using Backward elimination

In [8]:
import statsmodels.api as sm
# statsmodels does not add the intercept term b0 on its own, so a leading
# column of 1s is prepended to X. The model then reads
# y = b0*x0 + b1*x1 + ... with x0 = 1.
X = sm.add_constant(X, prepend=True, has_constant='skip')
print(X)

[[1.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 1.0 0.0 153441.51 101145.55 407934.54]
 [1.0 0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 1.0 0.0 142107.34 91391.77 366168.42]
 [1.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 1.0 0.0 130298.13 145530.06 323876.68]
 [1.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 1.0 0.0 119943.24 156547.42 256512.92]
 [1.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [1.0 0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 1.0 0.0 91749.16 114175.79 294919.57]
 [1.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [1.0 0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 1.0 0.0 73994.56 122782.75 3

  import pandas.util.testing as tm


In [25]:
# Build the optimal matrix of features, i.e. keep only the statistically
# significant ones. Start with every column, then remove one feature at a
# time: after each fit, drop the feature with the highest p-value if it is
# above .05 and refit.
#
# Column 0 of X is the intercept column of 1s added earlier; OLS does NOT
# include an intercept by default, so it is kept in every step.


def fit_ols_on_columns(columns):
    """Fit an OLS model of y on the selected columns of X and print its summary.

    Parameters
    ----------
    columns : list of int
        Column indices of X to use as regressors (index 0 is the intercept).

    Returns
    -------
    tuple
        ``(X_subset, fitted)``: the float matrix that was used as ``exog``
        and the fitted statsmodels results object.
    """
    # X holds Python objects, so the selection is cast to float before
    # it is handed to OLS.
    X_subset = X[:, columns].astype(float)
    fitted = sm.OLS(endog=y, exog=X_subset).fit()
    print(fitted.summary())
    return X_subset, fitted


# Step 1: full model with all features.
X_opt, regressor_ols = fit_ols_on_columns([0, 1, 2, 3, 4, 5])

# The X2 will be removed at P=.99, X2 is the state of NY dummy variable
X_opt, regressor_ols = fit_ols_on_columns([0, 1, 3, 4, 5])

# The X1 will be removed at P=.94, X1 is the state of FL dummy variable
X_opt, regressor_ols = fit_ols_on_columns([0, 3, 4, 5])

# X2 will be removed at p=.602, X2 is the admin spend
X_opt, regressor_ols = fit_ols_on_columns([0, 3, 5])

# X2 will be removed at p=.06, X2 is the marketing spend
X_opt, regressor_ols = fit_ols_on_columns([0, 3])

# The R&D spend is significant and we stop removal of features
# The model is now complete

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     169.9
Date:                Wed, 24 Jun 2020   Prob (F-statistic):           1.34e-27
Time:                        21:05:57   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1063.
Df Residuals:                      44   BIC:                             1074.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.013e+04   6884.820      7.281      0.0