In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [2]:
# Load the dataset. Forward slashes avoid the invalid "\D" / "\M" escape
# sequences of the old literal and keep the relative path usable on any OS.
df = pd.read_csv("./DATA/MultipleLinearData.csv")

# Features are every column except the last one; the target is the last column.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# one-hot encode the categorical column at position 3 (registered as "State");
# all other columns are passed through unchanged
ct = ColumnTransformer([("State", OneHotEncoder(), [3])], remainder='passthrough')
X = ct.fit_transform(X)

# drop the first one-hot column so the dummies are not perfectly collinear
# with an intercept (dummy-variable trap)
X = X[:, 1:]

# split data: 20% held out for testing, fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Data split done!")

Data split done!


In [3]:
# Show the first five training rows to check the encoded feature layout.
X_train[:5]

array([[1.0, 0.0, 55493.95, 103057.49, 214634.81],
       [0.0, 1.0, 46014.02, 85047.44, 205517.64],
       [1.0, 0.0, 75328.87, 144135.98, 134050.07],
       [0.0, 0.0, 46426.07, 157693.92, 210797.67],
       [1.0, 0.0, 91749.16, 114175.79, 294919.57]], dtype=object)

In [4]:
# Show the first five training targets.
y_train[:5]

array([ 96778.92,  96479.51, 105733.54,  96712.8 , 124266.9 ])

In [5]:
# Baseline model: ordinary linear regression on all encoded features.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [6]:
# Predict the target for the held-out test rows with the baseline model.
y_pred = regressor.predict(X_test)
y_pred

array([103015.20159796, 132582.27760815, 132447.73845175,  71976.09851258,
       178537.48221056, 116161.24230166,  67851.69209676,  98791.73374687,
       113969.43533013, 167921.06569551])

In [7]:
# NOTE: for a regressor, score() returns R^2 (coefficient of determination),
# not a classification accuracy, despite the variable being called `acc`.
acc = regressor.score(X_test, y_test)
acc

0.9347068473282446

In [8]:
# Prepend a column of ones to X_train so that sm.OLS, which does not add an
# intercept on its own, can estimate a constant term.
# Guard: only prepend while X_train still has as many columns as X_test, so
# re-running this cell does not stack a second column of ones and shift the
# column indices used by the cells below.
if X_train.shape[1] == X_test.shape[1]:
    X_train = np.append(arr=np.ones((len(X_train), 1)).astype(int), values=X_train, axis=1)
X_train[:5]

array([[1, 1.0, 0.0, 55493.95, 103057.49, 214634.81],
       [1, 0.0, 1.0, 46014.02, 85047.44, 205517.64],
       [1, 1.0, 0.0, 75328.87, 144135.98, 134050.07],
       [1, 0.0, 0.0, 46426.07, 157693.92, 210797.67],
       [1, 1.0, 0.0, 91749.16, 114175.79, 294919.57]], dtype=object)

In [9]:
# Backward elimination, starting point: fit OLS on all six columns
# (the ones column, the two State dummies and the three numeric features).
X_opt = X_train[:, list(range(6))].astype(float)  # object array -> float array

# Fit with statsmodels to get per-coefficient p-values.
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()
# Next: among x1..x5, drop the predictor with the largest p-value above 0.05.

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,129.7
Date:,"Wed, 18 Nov 2020",Prob (F-statistic):,3.91e-21
Time:,15:18:50,Log-Likelihood:,-421.1
No. Observations:,40,AIC:,854.2
Df Residuals:,34,BIC:,864.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.255e+04,8358.538,5.091,0.000,2.56e+04,5.95e+04
x1,-959.2842,4038.108,-0.238,0.814,-9165.706,7247.138
x2,699.3691,3661.563,0.191,0.850,-6741.822,8140.560
x3,0.7735,0.055,14.025,0.000,0.661,0.886
x4,0.0329,0.066,0.495,0.624,-0.102,0.168
x5,0.0366,0.019,1.884,0.068,-0.003,0.076

0,1,2,3
Omnibus:,15.823,Durbin-Watson:,2.468
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.231
Skew:,-1.094,Prob(JB):,9.03e-06
Kurtosis:,6.025,Cond. No.,1490000.0


In [10]:
# Column 2 had the largest p-value in the previous fit (0.850), so refit without it.
X_opt = X_train[:, [0, 1, 3, 4, 5]].astype(float)
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,166.7
Date:,"Wed, 18 Nov 2020",Prob (F-statistic):,2.87e-22
Time:,15:18:50,Log-Likelihood:,-421.12
No. Observations:,40,AIC:,852.2
Df Residuals:,35,BIC:,860.7
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.292e+04,8020.397,5.352,0.000,2.66e+04,5.92e+04
x1,-1272.1608,3639.780,-0.350,0.729,-8661.308,6116.986
x2,0.7754,0.053,14.498,0.000,0.667,0.884
x3,0.0319,0.065,0.488,0.629,-0.101,0.165
x4,0.0363,0.019,1.902,0.065,-0.002,0.075

0,1,2,3
Omnibus:,16.074,Durbin-Watson:,2.467
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.553
Skew:,-1.086,Prob(JB):,4.66e-06
Kurtosis:,6.164,Cond. No.,1430000.0


In [11]:
# Column 1 now has the largest p-value (0.729), so refit without it as well.
X_opt = X_train[:, [0, 3, 4, 5]].astype(float)
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,227.8
Date:,"Wed, 18 Nov 2020",Prob (F-statistic):,1.8499999999999998e-23
Time:,15:18:50,Log-Likelihood:,-421.19
No. Observations:,40,AIC:,850.4
Df Residuals:,36,BIC:,857.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.299e+04,7919.773,5.428,0.000,2.69e+04,5.91e+04
x1,0.7788,0.052,15.003,0.000,0.674,0.884
x2,0.0294,0.064,0.458,0.650,-0.101,0.160
x3,0.0347,0.018,1.896,0.066,-0.002,0.072

0,1,2,3
Omnibus:,15.557,Durbin-Watson:,2.481
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.539
Skew:,-1.081,Prob(JB):,1.28e-05
Kurtosis:,5.974,Cond. No.,1430000.0


In [12]:
# Column 4 now has the largest p-value (0.650), so refit without it.
X_opt = X_train[:, [0, 3, 5]].astype(float)
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.947
Method:,Least Squares,F-statistic:,349.0
Date:,"Wed, 18 Nov 2020",Prob (F-statistic):,9.65e-25
Time:,15:18:50,Log-Likelihood:,-421.3
No. Observations:,40,AIC:,848.6
Df Residuals:,37,BIC:,853.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.635e+04,2971.236,15.598,0.000,4.03e+04,5.24e+04
x1,0.7886,0.047,16.846,0.000,0.694,0.883
x2,0.0326,0.018,1.860,0.071,-0.003,0.068

0,1,2,3
Omnibus:,14.666,Durbin-Watson:,2.518
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.582
Skew:,-1.03,Prob(JB):,3.39e-05
Kurtosis:,5.847,Cond. No.,497000.0


In [13]:
# Column 5 was still slightly above the 0.05 threshold (p = 0.071), so refit without it.
X_opt = X_train[:, [0, 3]].astype(float)
regressor_OLS = sm.OLS(y_train, X_opt).fit()
regressor_OLS.summary()
# Every remaining p-value is below 0.05, so the elimination stops here.
# However, Adj. R-squared went down compared with the previous step
# (0.947 -> 0.944), so dropping that last column was not actually necessary.

0,1,2,3
Dep. Variable:,y,R-squared:,0.945
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,652.4
Date:,"Wed, 18 Nov 2020",Prob (F-statistic):,1.56e-25
Time:,15:18:50,Log-Likelihood:,-423.09
No. Observations:,40,AIC:,850.2
Df Residuals:,38,BIC:,853.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.842e+04,2842.717,17.032,0.000,4.27e+04,5.42e+04
x1,0.8516,0.033,25.542,0.000,0.784,0.919

0,1,2,3
Omnibus:,13.132,Durbin-Watson:,2.325
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.254
Skew:,-0.991,Prob(JB):,0.000295
Kurtosis:,5.413,Cond. No.,157000.0


In [14]:
# Keep only the predictor that survived the elimination. It sits at column 3 of
# X_train but at column 2 of X_test, because only X_train received the leading
# column of ones.
X_train_opt = X_train[:, [3]]
X_test_opt = X_test[:, [2]]

# Refit a plain linear regression on that single feature.
regressor_opt = LinearRegression()
regressor_opt.fit(X=X_train_opt, y=y_train)
# This reduced model scores higher on the test set than the full model
# (0.946 vs 0.935).

LinearRegression()

In [15]:
# Predict the target for the test rows with the single-feature model.
y_pred_opt = regressor_opt.predict(X_test_opt)
y_pred_opt

array([104667.27805998, 134150.83410578, 135207.80019517,  72170.54428856,
       179090.58602508, 109824.77386586,  65644.27773757, 100481.43277139,
       111431.75202432, 169438.14843539])

In [16]:
# Test-set R^2 of the single-feature model (score() returns R^2 for a
# regressor, not an accuracy); compare with `acc` from the full model.
acc_opt = regressor_opt.score(X_test_opt, y_test)
acc_opt

0.9464587607787219