In [2]:
#import standard libraries

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [4]:
#import the data and load it

In [5]:
data = pd.read_csv("datasets/50_Startups.csv")

In [6]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [8]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [11]:
#preprocessing the data - one hot encoding

In [18]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[3])],remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [20]:
## split and train the data

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
X_train.shape,X_test.shape

((40, 6), (10, 6))

In [58]:
X_train

array([[0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [1.0, 0.0, 0.0, 44069.95, 51283.14, 197029.42],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 27892.92, 84710.77, 164470.71],
       [0.0, 1.0, 0.0, 1315.46, 115816.21, 297114.46],
       [1.0, 0.0, 0.0, 0.0, 135426.92, 0.0],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 77044.01, 99281.34, 140574.81],
       [1.0, 0.0, 0.0, 46426.07, 157693.92, 210797.67],
       [0.0, 0.0, 1.0, 61136.38, 152701.92, 88218.23],
       [0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 22177.74, 154806.14, 28334.72],
       [0.0, 0.0, 1.0, 72107.6, 127864.55, 353183.81],
       

In [25]:
## train the data

In [26]:
mlr = LinearRegression()
mlr.fit(X_train,y_train)

In [28]:
#predict on the data

In [29]:
y_pred = mlr.predict(X_test)

In [60]:
mlr.predict([[0.0, 1.0, 0.0,160000,  130000,300000 ]])

array([183881.85])

In [31]:
y_pred,y_test

(array([126362.87908251,  84608.45383642,  99677.49425154,  46357.46068582,
        128750.48288497,  50912.41741904, 109741.350327  , 100643.24281643,
         97599.27574599, 113097.42524436]),
 array([134307.35,  81005.76,  99937.59,  64926.08, 125370.37,  35673.41,
        105733.54, 107404.34,  97427.84, 122776.86]))

In [32]:
#concatenate the prediction and real value

In [38]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1) , y_test.reshape(len(y_test),1)),1))

[[126362.88 134307.35]
 [ 84608.45  81005.76]
 [ 99677.49  99937.59]
 [ 46357.46  64926.08]
 [128750.48 125370.37]
 [ 50912.42  35673.41]
 [109741.35 105733.54]
 [100643.24 107404.34]
 [ 97599.28  97427.84]
 [113097.43 122776.86]]


In [39]:
## feature selection using backward elimination

In [48]:
X = np.append(arr=np.ones((50,1)).astype(int),values=X,axis=1)
X[:5]

array([[1, 1, 1, 1, 1, 1, 0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1, 1, 1, 1, 1, 1, 1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1, 1, 1, 1, 1, 1, 0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [1, 1, 1, 1, 1, 1, 0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1, 1, 1, 1, 1, 1, 0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42]],
      dtype=object)

In [47]:
X_opt = X[:,[0,1,2,3,4,5]]
X_opt[:5]

array([[1, 1, 1, 1, 1, 0.0],
       [1, 1, 1, 1, 1, 1.0],
       [1, 1, 1, 1, 1, 0.0],
       [1, 1, 1, 1, 1, 0.0],
       [1, 1, 1, 1, 1, 0.0]], dtype=object)

In [49]:
X_opt = X_opt.astype(np.float64)
X_opt[:5]

array([[1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 0.]])

In [50]:
reg_ols = sm.OLS(endog=y,exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.043
Date:,"Tue, 01 Aug 2023",Prob (F-statistic):,0.312
Time:,18:56:13,Log-Likelihood:,-600.12
No. Observations:,50,AIC:,1204.0
Df Residuals:,48,BIC:,1208.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x1,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x2,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x3,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x4,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x5,-1.228e+04,1.2e+04,-1.021,0.312,-3.65e+04,1.19e+04

0,1,2,3
Omnibus:,0.079,Durbin-Watson:,0.073
Prob(Omnibus):,0.961,Jarque-Bera (JB):,0.19
Skew:,0.087,Prob(JB):,0.909
Kurtosis:,2.753,Cond. No.,3.3899999999999997e+47


In [51]:
#2nd iteration
X_opt = X[:, [0, 3, 4, 5]]

X_opt = X_opt.astype(np.float64)

X_opt[:5]

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [52]:
reg_ols = sm.OLS(endog=y,exog=X_opt).fit()
reg_ols.summary()

  return np.sqrt(eigvals[0]/eigvals[-1])


0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Tue, 01 Aug 2023",Prob (F-statistic):,
Time:,21:10:24,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04
x1,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04
x2,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04
x3,2.8e+04,1425.039,19.651,0.000,2.51e+04,3.09e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,inf


In [53]:
#3rd iteration
X_opt = X[:, [0, 3, 5]]

X_opt = X_opt.astype(np.float64)

X_opt[:5]

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [54]:
reg_ols = sm.OLS(endog=y,exog=X_opt).fit()
reg_ols.summary()

  return np.sqrt(eigvals[0]/eigvals[-1])


0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Tue, 01 Aug 2023",Prob (F-statistic):,
Time:,21:12:15,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04
x1,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04
x2,3.734e+04,1900.052,19.651,0.000,3.35e+04,4.12e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,inf


In [55]:
#4th iteration
X_opt = X[:, [0, 3]]

X_opt = X_opt.astype(np.float64)

X_opt[:5]

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [56]:
reg_ols = sm.OLS(endog=y,exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Tue, 01 Aug 2023",Prob (F-statistic):,
Time:,21:14:25,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.601e+04,2850.077,19.651,0.000,5.03e+04,6.17e+04
x1,5.601e+04,2850.077,19.651,0.000,5.03e+04,6.17e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,4.91e+16
