## Multiple linear regression using statsmodels
- auto-mpg dataset

### Previously covered preprocessing steps
- creating dummy variables for categorical features
- log transformation on continuous predictors

In [2]:
import pandas as pd
import numpy as np

In [11]:
# Load the auto-mpg dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/input/auto-mpg.csv')

In [8]:
# Cast horsepower to int (via str) and display the result. The converted
# Series is not assigned anywhere, so `data` is left unchanged by this cell.
data['horsepower'].astype(str).astype(int)

0      130
1      165
2      150
3      150
4      140
5      198
6      220
7      215
8      225
9      190
10     170
11     160
12     150
13     225
14      95
15      95
16      97
17      85
18      88
19      46
20      87
21      90
22      95
23     113
24      90
25     215
26     200
27     210
28     193
29      88
      ... 
367     88
368     88
369     88
370     85
371     84
372     90
373     92
375     74
376     68
377     68
378     63
379     70
380     88
381     75
382     70
383     67
384     67
385     67
386    110
387     85
388     92
389    112
390     96
391     84
392     90
393     86
394     52
395     84
396     79
397     82
Name: horsepower, Length: 392, dtype: int64

In [12]:
# Preprocess the continuous predictors: log-transform displacement,
# horsepower and weight, then rescale every feature.
acc = data["acceleration"]
logdisp = np.log(data["displacement"])
loghorse = np.log(data["horsepower"])
logweight = np.log(data["weight"])

# Use the pandas reductions (.min/.max/.mean/.std) rather than the Python
# builtins: they are vectorised and NaN-aware, and give the same values here.

# Min-max scaling: maps acceleration onto [0, 1].
scaled_acc = (acc - acc.min()) / (acc.max() - acc.min())
# Standardisation with the population standard deviation (ddof=0), i.e. the
# same quantity as np.sqrt(np.var(x)).
scaled_disp = (logdisp - logdisp.mean()) / logdisp.std(ddof=0)
# Mean normalisation: centred on the mean, divided by the range.
scaled_horse = (loghorse - loghorse.mean()) / (loghorse.max() - loghorse.min())
scaled_weight = (logweight - logweight.mean()) / logweight.std(ddof=0)

# Collect the scaled continuous features in a single step.
data_fin = pd.DataFrame({
    "acc": scaled_acc,
    "disp": scaled_disp,
    "horse": scaled_horse,
    "weight": scaled_weight,
})

# One-hot encode the categorical features.
cyl_dummies = pd.get_dummies(data["cylinders"], prefix="cyl")
yr_dummies = pd.get_dummies(data["model year"], prefix="yr")
orig_dummies = pd.get_dummies(data["origin"], prefix="orig")

# Final frame: outcome first, then continuous features, then the dummies.
mpg = data["mpg"]
data_fin = pd.concat([mpg, data_fin, cyl_dummies, yr_dummies, orig_dummies], axis=1)

In [13]:
# Inspect the assembled frame: column names, non-null counts and dtypes.
data_fin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 26 columns):
mpg       392 non-null float64
acc       392 non-null float64
disp      392 non-null float64
horse     392 non-null float64
weight    392 non-null float64
cyl_3     392 non-null uint8
cyl_4     392 non-null uint8
cyl_5     392 non-null uint8
cyl_6     392 non-null uint8
cyl_8     392 non-null uint8
yr_70     392 non-null uint8
yr_71     392 non-null uint8
yr_72     392 non-null uint8
yr_73     392 non-null uint8
yr_74     392 non-null uint8
yr_75     392 non-null uint8
yr_76     392 non-null uint8
yr_77     392 non-null uint8
yr_78     392 non-null uint8
yr_79     392 non-null uint8
yr_80     392 non-null uint8
yr_81     392 non-null uint8
yr_82     392 non-null uint8
orig_1    392 non-null uint8
orig_2    392 non-null uint8
orig_3    392 non-null uint8
dtypes: float64(5), uint8(21)
memory usage: 23.4 KB


In [14]:
# Keep only a subset of the features for the regressions below: the outcome
# (mpg), scaled acceleration, scaled log-weight and the origin dummies.
data_ols = pd.concat([mpg, scaled_acc, scaled_weight, orig_dummies], axis=1)

In [15]:
# Preview the first rows of the modelling frame.
data_ols.head()

Unnamed: 0,mpg,acceleration,weight,orig_1,orig_2,orig_3
0,18.0,0.238095,0.720986,1,0,0
1,15.0,0.208333,0.908047,1,0,0
2,18.0,0.178571,0.651205,1,0,0
3,16.0,0.238095,0.648095,1,0,0
4,17.0,0.14881,0.664652,1,0,0


### statsmodels

In [16]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [17]:
# Hand-written formula listing every predictor column of data_ols, including
# all three origin dummies (orig_1, orig_2, orig_3).
formula = 'mpg~acceleration+weight+orig_1+orig_2+orig_3'
model = ols(formula=formula, data=data_ols).fit()

In [18]:
# Build the same formula programmatically: every column of data_ols other
# than the outcome becomes a predictor term.
outcome = 'mpg'
predictor = data_ols.drop(outcome, axis=1)
predictors = '+'.join(predictor.columns)
formula = '{}~{}'.format(outcome, predictors)

In [19]:
# Fit OLS with the generated formula and display the regression summary.
model = ols(formula=formula, data=data_ols).fit()
model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.726
Model:,OLS,Adj. R-squared:,0.723
Method:,Least Squares,F-statistic:,256.7
Date:,"Mon, 24 Aug 2020",Prob (F-statistic):,1.86e-107
Time:,21:12:00,Log-Likelihood:,-1107.2
No. Observations:,392,AIC:,2224.0
Df Residuals:,387,BIC:,2244.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,16.1041,0.509,31.636,0.000,15.103,17.105
acceleration,5.0494,1.389,3.634,0.000,2.318,7.781
weight,-5.8764,0.282,-20.831,0.000,-6.431,-5.322
orig_1,4.6566,0.363,12.839,0.000,3.944,5.370
orig_2,5.0690,0.454,11.176,0.000,4.177,5.961
orig_3,6.3785,0.430,14.829,0.000,5.533,7.224

0,1,2,3
Omnibus:,37.427,Durbin-Watson:,0.84
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.989
Skew:,0.648,Prob(JB):,6.95e-13
Kurtosis:,4.322,Cond. No.,2180000000000000.0


### statsmodels part 2 - add_constant

In [26]:
import statsmodels.api as sm
# Array-style API: add the constant column explicitly (it appears as `const`
# in the summary), then regress mpg on that design matrix.
predictors_int = sm.add_constant(predictor)
model = sm.OLS(data['mpg'], predictors_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.726
Model:,OLS,Adj. R-squared:,0.723
Method:,Least Squares,F-statistic:,256.7
Date:,"Mon, 24 Aug 2020",Prob (F-statistic):,1.86e-107
Time:,21:20:58,Log-Likelihood:,-1107.2
No. Observations:,392,AIC:,2224.0
Df Residuals:,387,BIC:,2244.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,16.1041,0.509,31.636,0.000,15.103,17.105
acceleration,5.0494,1.389,3.634,0.000,2.318,7.781
weight,-5.8764,0.282,-20.831,0.000,-6.431,-5.322
orig_1,4.6566,0.363,12.839,0.000,3.944,5.370
orig_2,5.0690,0.454,11.176,0.000,4.177,5.961
orig_3,6.3785,0.430,14.829,0.000,5.533,7.224

0,1,2,3
Omnibus:,37.427,Durbin-Watson:,0.84
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.989
Skew:,0.648,Prob(JB):,6.95e-13
Kurtosis:,4.322,Cond. No.,2180000000000000.0


### sklearn

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
# Fit scikit-learn's LinearRegression on the same predictors and outcome.
y = data_ols['mpg']
linreg = LinearRegression()
linreg.fit(predictor, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [30]:
# Fitted slopes, in the column order of `predictor`.
linreg.coef_

array([ 5.04941007, -5.87640551, -0.71140721, -0.29903267,  1.01043987])

In [31]:
# Fitted intercept of the sklearn model.
linreg.intercept_

21.472164286075394

### conclusion: same results

In [None]:
# Although sklearn and statsmodels report different intercepts and
# origin-dummy coefficients, both describe the same fit: the acceleration
# and weight slopes are identical, and intercept + orig_k agrees for every
# origin. Dropping one dummy column, as tried below, gives the same result.

In [32]:
# sklearn: refit after removing one of the three origin dummies (orig_3).
predictors = predictor.drop(['orig_3'], axis='columns')
linreg = LinearRegression().fit(predictors, y)
linreg.coef_

array([ 5.04941007, -5.87640551, -1.72184708, -1.30947254])

In [35]:
# Intercept of the model refit without orig_3.
linreg.intercept_

22.48260416045567

In [36]:
# statsmodels

In [37]:
# statsmodels: fit the same reduced model (orig_3 removed) via the formula API.
# NOTE: the saved output under this cell is a NameError for 'outcomt', a name
# this cell does not use -- that output is stale; re-run the cell to refresh it.
pred_sum = '+'.join(predictors.columns)
formula = '{}~{}'.format(outcome, pred_sum)
model = ols(formula=formula, data=data_ols).fit()
model.summary()

NameError: name 'outcomt' is not defined