In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 1. Data Preprocessing

In [28]:
# Load the startup data from the working directory and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer="50_Startups.csv")
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [29]:
# Feature matrix: every column except the last one, as a NumPy array.
X = dataset.iloc[:, 0:-1].values
# Target vector: the column at position 4 (shown as "Profit" in the preview above).
y = dataset.iloc[:, 4].values

In [30]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Column 3 of X holds the state name; replace it in place with integer codes.
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

# One-hot encode only that column. The former
# `OneHotEncoder(categorical_features=[3])` spelling relied on a keyword that
# scikit-learn deprecated in 0.20 and scheduled for removal in 0.22; encoding
# the single column directly works on both old and new releases.
onehotencoder = OneHotEncoder()
state_dummies = onehotencoder.fit_transform(X[:, [3]]).toarray()

# Dummy columns first, then the remaining columns in their original order.
# This is the layout `categorical_features=[3]` produced, so the column
# indices used further down keep their meaning. The cast turns the
# object-dtype stack back into a float matrix.
X = np.hstack((state_dummies, np.delete(X, 3, axis=1))).astype(float)

#### Avoiding the Dummy Variable Trap

- Dropping one dummy column avoids multicollinearity among the dummy variables
- The Python linear regression library already takes care of this

In [31]:
# Drop the first dummy column (index 0) and keep everything else.
X = np.delete(X, 0, axis=1)

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

### 2. Fit the Multiple Linear Regression

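- Feature scaling is not needed here: ordinary least squares compensates for each feature's scale through its coefficient, so scikit-learn's `LinearRegression` can be fit on the raw values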

In [33]:
from sklearn.linear_model import LinearRegression
# Build and fit in one step (fit() returns the estimator itself), then echo
# the fitted estimator as the cell output.
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### 3. Predict the Test Result

In [34]:
# Predict the target for the held-out rows and display the predictions.
y_pred = regressor.predict(X=X_test)
y_pred

array([ 103015.20159796,  132582.27760815,  132447.73845175,
         71976.09851258,  178537.48221056,  116161.24230166,
         67851.69209676,   98791.73374687,  113969.43533013,
        167921.06569551])

### 4. Building the optimal model using Backward Elimination

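- Add a column of ones to X so that the statsmodels OLS model has an intercept term (b0); unlike scikit-learn, statsmodels does not add one automatically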

In [35]:
# The array-based OLS class is documented under statsmodels.api; the
# statsmodels.formula.api namespace is intended for the lowercase formula
# interface (e.g. `ols`).
import statsmodels.api as sm

# Prepend a column of ones (the intercept term b0). The row count is taken
# from X itself instead of being hard-coded to 50, so the cell keeps working
# if the data set changes size.
# NOTE: re-running this cell prepends another column of ones each time.
X = np.append(arr=np.ones((X.shape[0], 1), dtype=int), values=X, axis=1)


In [46]:
### Manual backward elimination, p-value significance level: SL = 0.05

# Each entry lists the columns of X kept for one round. Every round after the
# first drops the predictor that had the highest p-value in the previous fit.
for kept_columns in (
    [0, 1, 2, 3, 4, 5],  # initial model: every column
    [0, 1, 3, 4, 5],
    [0, 3, 4, 5],
    [0, 3, 5],
):
    X_opt = X[:, kept_columns]
    regressor_ols = sm.OLS(endog=y, exog=X_opt).fit()

# A cell renders only its last expression, so this shows the final model's
# summary; the intermediate summaries were never displayed.
regressor_ols.summary()



0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Fri, 20 Jul 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,13:41:23,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04 5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713 0.880
x2,0.0299,0.016,1.927,0.060,-0.001 0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


### Additional 1: Backward Elimination with p-values only:

In [50]:
import statsmodels.formula.api as sm
def backwardElimination(x, sl, endog=None):
    """Backward elimination driven by p-values only.

    Repeatedly fits an OLS model on ``x`` and removes the single column with
    the highest p-value, until no remaining p-value exceeds ``sl`` (or no
    column is left).

    Parameters
    ----------
    x : 2-D array-like
        Design matrix, one column per candidate predictor.
    sl : float
        Significance level; a column is removed while its p-value is the
        largest one and is greater than ``sl``.
    endog : array-like, optional
        Response vector. When omitted, the notebook-level variable ``y`` is
        used, which is what the original two-argument call relied on.

    Returns
    -------
    numpy.ndarray
        ``x`` restricted to the columns that survived elimination.
    """
    # Imported locally so the function works regardless of which statsmodels
    # namespace the notebook bound to the global name `sm`.
    import statsmodels.api as sm

    response = y if endog is None else endog
    x = np.asarray(x)

    while x.shape[1] > 0:
        # Refit after every removal so the p-values always describe the
        # current set of columns.
        pvalues = np.asarray(sm.OLS(response, x).fit().pvalues, dtype=float)
        worst = int(np.argmax(pvalues))  # first index wins on ties
        if not pvalues[worst] > sl:
            break
        # Remove exactly one column per fit; continuing to scan with the old
        # p-values would misalign indices once a column is gone.
        x = np.delete(x, worst, 1)
    return x
 
# Significance level for the p-value test.
SL = 0.05
# Start from the first six columns of X (indices 0..5).
X_opt = X[:, list(range(6))]
X_Modeled = backwardElimination(x=X_opt, sl=SL)

### Additional 2: Backward Elimination with p-values and Adjusted R Squared:

In [49]:
import statsmodels.formula.api as sm
def backwardElimination(x, SL, endog=None):
    """Backward elimination using p-values, guarded by adjusted R-squared.

    At each step the column with the highest p-value is a removal candidate
    if that p-value exceeds ``SL``. The removal is kept only when it strictly
    increases the adjusted R-squared; otherwise the summary of the model
    before the removal is printed and the matrix from before the removal is
    returned unchanged.

    Parameters
    ----------
    x : 2-D array-like
        Design matrix, one column per candidate predictor.
    SL : float
        Significance level for the p-value test.
    endog : array-like, optional
        Response vector. When omitted, the notebook-level variable ``y`` is
        used, which is what the original two-argument call relied on.

    Returns
    -------
    numpy.ndarray
        The selected columns of ``x``, with their original values and dtype.
    """
    # Imported locally so the function works regardless of which statsmodels
    # namespace the notebook bound to the global name `sm`.
    import statsmodels.api as sm

    response = y if endog is None else endog
    x = np.asarray(x)

    while x.shape[1] > 0:
        model = sm.OLS(response, x).fit()
        pvalues = np.asarray(model.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))  # first index wins on ties
        if not pvalues[worst] > SL:
            break

        # Try the removal on a copy. `x` itself is the rollback state, so no
        # fixed-size or integer scratch buffer is needed and the feature
        # values are never truncated.
        candidate = np.delete(x, worst, 1)
        adjR_before = float(model.rsquared_adj)
        adjR_after = float(sm.OLS(response, candidate).fit().rsquared_adj)

        if adjR_before >= adjR_after:
            # Removing the column did not improve the fit: keep it and stop.
            print(model.summary())
            return x
        x = candidate
    return x
 
# Significance level for the p-value test.
SL = 0.05
# Start from the first six columns of X (indices 0..5).
X_opt = X[:, list(range(6))]
X_Modeled = backwardElimination(x=X_opt, SL=SL)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     450.8
Date:                Fri, 20 Jul 2018   Prob (F-statistic):           2.16e-31
Time:                        14:16:41   Log-Likelihood:                -525.54
No. Observations:                  50   AIC:                             1057.
Df Residuals:                      47   BIC:                             1063.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       4.698e+04   2689.933     17.464      0.0