In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Loading data

dataset = pd.read_csv("Org_data.csv")

print(dataset.head())

X = dataset.iloc[:, :-1].values
# print(X)

y = dataset.iloc[:, -1].values
# print(y)

dataset.describe()

    Research  Operation  Marketing       State     Profit
0  165349.20  136897.80  471784.10    New York  192261.83
1  162597.70  151377.59  443898.53  California  191792.06
2  153441.51  101145.55  407934.54     Florida  191050.39
3  144372.41  118671.85  383199.62    New York  182901.99
4  142107.34   91391.77  366168.42     Florida  166187.94


Unnamed: 0,Research,Operation,Marketing,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [3]:
# Encoding categorical data - 'State'

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(transformers=[('one_hot_encoder', OneHotEncoder(categories='auto'), [3])],
                      remainder='passthrough')

X = np.array(ct.fit_transform(X), dtype=np.float)


In [5]:
# Avoiding dummy variable trap - dropping the first column of the encoded data

X = X[:, 1:]

In [7]:
# Split dataset into train and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)

(40, 5)
(10, 5)


In [9]:
# Fitting

from sklearn.linear_model import LinearRegression

regressor = LinearRegression()

regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Predictions

y_pred = regressor.predict(X_test)

print(y_pred)
print(y_test)

[103015.20159796 132582.27760815 132447.73845175  71976.09851258
 178537.48221056 116161.24230166  67851.69209676  98791.73374687
 113969.43533013 167921.06569551]
[103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06
  97483.56 110352.25 166187.94]


In [25]:
# Optimizing the model using backward elimination

import statsmodels.api as sm

# add a columns of ones (1s) to the features
X = np.append(arr=np.ones((50,1)).astype(int), values=X, axis=1)
# print(X)

# all possible predictors - matrix of features
X_opt = X[:, [0,1,2,3,4,5]]

regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Wed, 27 May 2020",Prob (F-statistic):,
Time:,17:12:14,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.867e+04,950.026,19.651,0.000,1.68e+04,2.06e+04
x1,1.867e+04,950.026,19.651,0.000,1.68e+04,2.06e+04
x2,1.867e+04,950.026,19.651,0.000,1.68e+04,2.06e+04
x3,1.867e+04,950.026,19.651,0.000,1.68e+04,2.06e+04
x4,1.867e+04,950.026,19.651,0.000,1.68e+04,2.06e+04
x5,1.867e+04,950.026,19.651,0.000,1.68e+04,2.06e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,1.97e+80


In [31]:
X_opt = X[:, [0,1,3,4,5]]

regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Wed, 27 May 2020",Prob (F-statistic):,
Time:,17:14:48,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x1,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x2,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x3,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04
x4,2.24e+04,1140.031,19.651,0.000,2.01e+04,2.47e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,1.61e+64


In [30]:
X_opt = X[:, [0,3,4,5]]

regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,inf
Date:,"Wed, 27 May 2020",Prob (F-statistic):,
Time:,17:14:37,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04
x1,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04
x2,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04
x3,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,1.47e+48


In [32]:
X_opt = X[:, [0,3,5]]

regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Wed, 27 May 2020",Prob (F-statistic):,
Time:,17:15:44,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04
x1,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04
x2,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,4.0600000000000004e+32


In [33]:
X_opt = X[:, [0,3]]

regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Wed, 27 May 2020",Prob (F-statistic):,
Time:,17:16:27,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.601e+04,2850.077,19.651,0.000,5.03e+04,6.17e+04
x1,5.601e+04,2850.077,19.651,0.000,5.03e+04,6.17e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,4.91e+16


In [34]:
print(X_opt)

[[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]]
