In [70]:
import numpy as np
import pandas as pd
from pandas import DataFrame
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [71]:
# Importing the dataset
# Features are every column except the last one; the target is the last column.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
# Select the target with the same "last column" rule used for X instead of a
# hard-coded position, so X and y can never overlap or skip a column.
y = dataset.iloc[:, -1].values

In [72]:
# Preview the first rows to check that the columns loaded as expected.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [73]:
# Encoding categorical data
# Column 3 of X holds the State strings. It is turned into integer codes and
# then into one-hot (dummy) columns, which are placed in front of the three
# numeric columns (the same layout the old categorical_features option produced).
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
state_codes = labelencoder.fit_transform(X[:, 3])
# OneHotEncoder(categorical_features=...) was deprecated and then removed from
# scikit-learn, so the State column is encoded on its own and stacked back
# manually. This form works on both old and new releases.
onehotencoder = OneHotEncoder()
state_dummies = onehotencoder.fit_transform(state_codes.reshape(-1, 1)).toarray()
# X is an object array because it mixes strings and numbers; cast the numeric
# columns to float so the final design matrix is purely numeric.
X = np.hstack([state_dummies, X[:, :3].astype(float)])

In [74]:
# Avoid dummy variable trap
# Dropping column 0 removes what is presumably the first one-hot State column,
# so the remaining dummies are not perfectly collinear with an intercept term.
X = X[:, 1:]

In [75]:
# Splitting the dataset into the Training set and Test set
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import train_test_split
# 20% of the rows are held out for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [76]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit() is left as the cell's last expression, so the notebook echoes the
# fitted estimator's repr below the cell.
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [77]:
# Predicting the Test set results
# y_pred holds the fitted model's predictions for the held-out rows in X_test.
y_pred = regressor.predict(X_test)

In [78]:
# Show the predicted values for the test rows.
print(y_pred)

[103015.20159796 132582.27760815 132447.73845175  71976.09851258
 178537.48221056 116161.24230166  67851.69209676  98791.73374687
 113969.43533013 167921.06569551]


In [79]:
# Building the optimal model using Backward Elimination
# statsmodels.api exposes the array-based OLS class; newer statsmodels
# releases no longer export OLS from statsmodels.formula.api.
import statsmodels.api as sm

def backwardElimination(x, sl, target=None):
    """Prune predictor columns by backward elimination on OLS p-values.

    Repeatedly fits an ordinary-least-squares model of ``target`` on ``x`` and
    removes the single column with the largest p-value, until that largest
    p-value is no longer above ``sl``. The summary of the last fitted model is
    printed before returning.

    Parameters
    ----------
    x : 2-D array
        Design matrix, one column per candidate predictor. No intercept is
        added here; include a column of ones in ``x`` if one is wanted.
    sl : float
        Significance level; a column is dropped while its p-value exceeds it.
    target : 1-D array, optional
        Response values. When omitted, the notebook-level variable ``y`` is
        used, which keeps existing two-argument calls working.

    Returns
    -------
    2-D array
        ``x`` with the eliminated columns removed. If every column is
        eliminated the result has zero columns and the printed summary is that
        of the last one-column model.
    """
    if target is None:
        # Backward-compatible default: fall back to the notebook-global `y`.
        target = y
    numVars = np.shape(x)[1]
    regressor_OLS = None
    # At most one column can be removed per pass, so numVars passes suffice.
    for _ in range(numVars):
        regressor_OLS = sm.OLS(target, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] > sl:
            # Remove exactly one column per fit: the p-value indices are only
            # valid for the matrix that was just fitted, so deleting more than
            # one (e.g. on tied p-values) would hit the wrong column.
            x = np.delete(x, worst, 1)
        else:
            # Every remaining column is significant; refitting would only
            # reproduce the same model.
            break
    if regressor_OLS is not None:
        print(regressor_OLS.summary())
    return x

In [80]:
# Append dummy feature column with ones
# The column of ones acts as the intercept term, which statsmodels' OLS does
# not add on its own. Its height is taken from X rather than hard-coded to 50
# rows, so the cell keeps working if the dataset size changes.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)

In [81]:
# Significance level to stay in the model
SL = 0.05
# Start from all six columns of X (indices 0-5) and let backward elimination
# prune the ones whose p-value exceeds SL.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, SL)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     849.8
Date:                Tue, 14 May 2019   Prob (F-statistic):           3.50e-32
Time:                        15:50:58   Log-Likelihood:                -527.44
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      48   BIC:                             1063.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.903e+04   2537.897     19.320      0.0

In [82]:
# Show the columns that survived elimination, skipping column 0.
# NOTE(review): column 0 is presumably the intercept column of ones — confirm.
print(DataFrame(X_Modeled[:,1:]))

            0
0   165349.20
1   162597.70
2   153441.51
3   144372.41
4   142107.34
5   131876.90
6   134615.46
7   130298.13
8   120542.52
9   123334.88
10  101913.08
11  100671.96
12   93863.75
13   91992.39
14  119943.24
15  114523.61
16   78013.11
17   94657.16
18   91749.16
19   86419.70
20   76253.86
21   78389.47
22   73994.56
23   67532.53
24   77044.01
25   64664.71
26   75328.87
27   72107.60
28   66051.52
29   65605.48
30   61994.48
31   61136.38
32   63408.86
33   55493.95
34   46426.07
35   46014.02
36   28663.76
37   44069.95
38   20229.59
39   38558.51
40   28754.33
41   27892.92
42   23640.93
43   15505.73
44   22177.74
45    1000.23
46    1315.46
47       0.00
48     542.05
49       0.00
