In [12]:
import numpy as np
import pandas as pd

# Ask NumPy to prefer fixed-point over scientific notation when printing floats.
np.set_printoptions(suppress=True)

In [13]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
# Features are every column except the last; the target is the last column.
# Both slices are expressed relative to the end of the frame so they cannot
# drift apart if the CSV gains or loses a feature column.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [14]:
# Preview the first rows to check the column layout used by the positional slices.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [15]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Column 3 of X holds the State strings. Map them to integer codes first so
# the one-hot step also works on scikit-learn releases whose OneHotEncoder
# only accepts integer input.
labelencoder = LabelEncoder()
state_codes = labelencoder.fit_transform(X[:, 3])

# OneHotEncoder(categorical_features=[3]) was removed in scikit-learn 0.22,
# so the State column is encoded on its own and the matrix is rebuilt by hand.
onehotencoder = OneHotEncoder()
state_dummies = onehotencoder.fit_transform(state_codes.reshape(-1, 1)).toarray()

# Same layout the old API produced: dummy columns first, followed by the
# remaining (numeric) features, everything as floats.
X = np.hstack((state_dummies, np.delete(X, 3, axis=1).astype(float)))

In [16]:
# Avoid dummy variable trap
# Drops column 0 of X. NOTE(review): presumably that is the first State dummy
# produced by the encoding cell — confirm. The cell rebinds X, so running it
# twice removes a second column.
X = X[:, 1:]

In [17]:
# Splitting the dataset into the Training set and Test set
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [18]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# fit() is the last expression of the cell, so the notebook echoes the fitted
# estimator's repr as the cell output.
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Predicting the Test set results
# Uses the regressor fitted on the training split to score the held-out rows.
y_pred = regressor.predict(X_test)

In [20]:
# Show the predictions computed for X_test.
print(y_pred)

[103015.20159796 132582.27760815 132447.73845175  71976.09851258
 178537.48221056 116161.24230166  67851.69209676  98791.73374687
 113969.43533013 167921.06569551]


In [21]:
# Building the optimal model using Backward Elimination
# OLS is exposed by statsmodels.api; recent statsmodels releases no longer
# export it from statsmodels.formula.api.
import statsmodels.api as sm

def backwardElimination(x, sl, target=None):
    """Drop the least significant column of ``x`` until all p-values pass ``sl``.

    An ordinary least squares model is fitted on the current design matrix.
    If the largest p-value exceeds ``sl``, the column it belongs to is removed
    and the model is refitted. This repeats until every remaining column has a
    p-value of at most ``sl`` or no columns are left.

    Parameters
    ----------
    x : 2-D array-like
        Design matrix, one column per candidate predictor. Any intercept
        column must already be present; it is treated like every other column
        and can be eliminated too.
    sl : float
        Significance level a column must meet to stay in the model.
    target : 1-D array-like, optional
        Response values. When omitted, the notebook-level variable ``y`` is
        used, which is what the original two-argument call relied on.

    Returns
    -------
    numpy.ndarray
        ``x`` restricted to the columns that survived elimination.

    Side effects
    ------------
    Prints the statsmodels summary of the final fitted model, if any column
    survived.
    """
    endog = y if target is None else target
    x = np.asarray(x)

    final_fit = None
    while x.shape[1] > 0:
        candidate_fit = sm.OLS(endog, x).fit()
        pvalues = np.asarray(candidate_fit.pvalues, dtype=float)
        # argmax picks exactly one column even when several p-values tie, so
        # a single fit can never trigger more than one deletion.
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > sl:
            # Every remaining column is significant: this fit describes the
            # matrix that is returned.
            final_fit = candidate_fit
            break
        x = np.delete(x, worst, 1)

    if final_fit is not None:
        # A bare summary() inside a function is discarded; print it so the
        # notebook actually shows the final model.
        print(final_fit.summary())
    return x

In [22]:
# Append dummy feature column with ones
# Size the intercept column from X itself rather than hard-coding 50 rows, so
# the cell keeps working if the dataset grows or shrinks. The cell rebinds X,
# so running it twice prepends a second column of ones.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)

In [23]:
# Significance level to stay in the model
SL = 0.05
# Start from columns 0-5 of X. Indexing with a list makes X_opt a copy, and
# the hard-coded list requires X to have at least six columns.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
# Keep only the columns that backwardElimination leaves in at level SL.
X_Modeled = backwardElimination(X_opt, SL)

In [26]:
# Show the surviving columns rounded to whole numbers. np.round works on the
# ndarray directly, without reaching through the discouraged np.matrix class.
print(np.round(X_Modeled, 0))

[[1.00000e+00 1.65349e+05]
 [1.00000e+00 1.62598e+05]
 [1.00000e+00 1.53442e+05]
 [1.00000e+00 1.44372e+05]
 [1.00000e+00 1.42107e+05]
 [1.00000e+00 1.31877e+05]
 [1.00000e+00 1.34615e+05]
 [1.00000e+00 1.30298e+05]
 [1.00000e+00 1.20543e+05]
 [1.00000e+00 1.23335e+05]
 [1.00000e+00 1.01913e+05]
 [1.00000e+00 1.00672e+05]
 [1.00000e+00 9.38640e+04]
 [1.00000e+00 9.19920e+04]
 [1.00000e+00 1.19943e+05]
 [1.00000e+00 1.14524e+05]
 [1.00000e+00 7.80130e+04]
 [1.00000e+00 9.46570e+04]
 [1.00000e+00 9.17490e+04]
 [1.00000e+00 8.64200e+04]
 [1.00000e+00 7.62540e+04]
 [1.00000e+00 7.83890e+04]
 [1.00000e+00 7.39950e+04]
 [1.00000e+00 6.75330e+04]
 [1.00000e+00 7.70440e+04]
 [1.00000e+00 6.46650e+04]
 [1.00000e+00 7.53290e+04]
 [1.00000e+00 7.21080e+04]
 [1.00000e+00 6.60520e+04]
 [1.00000e+00 6.56050e+04]
 [1.00000e+00 6.19940e+04]
 [1.00000e+00 6.11360e+04]
 [1.00000e+00 6.34090e+04]
 [1.00000e+00 5.54940e+04]
 [1.00000e+00 4.64260e+04]
 [1.00000e+00 4.60140e+04]
 [1.00000e+00 2.86640e+04]
 