In [54]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline

In [55]:
# Read the cars dataset from the local text file and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='cars_multivariate.txt')
df.head(n=3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite


In [56]:
# Parse `horsepower` into a numeric `hp` column. errors='coerce' turns every
# entry that cannot be parsed as a number into NaN, and those rows are dropped.
# assign/dropna build a new frame instead of writing a column onto a filtered
# slice, so this cell can be re-run without a SettingWithCopyWarning.
df = (
    df.assign(hp=pd.to_numeric(df.horsepower, errors='coerce'))
      .dropna(subset=['hp'])
)

# Dependent variable: mpg.
# Independent variables: every remaining column except `origin`, `car_name`,
# `model`, and the original `horsepower` (superseded by the numeric `hp`).
y = df['mpg']
X = df.drop(columns=['mpg', 'origin', 'car_name', 'model', 'horsepower'])

# Add the constant column (`const`) so the model estimates a bias/y-intercept.
X = sm.add_constant(X)

In [57]:
# Peek at the first value of the dependent variable.
y.iloc[:1]

0    18.0
Name: mpg, dtype: float64

In [58]:
# Peek at the first row of the design matrix (constant column included).
X.iloc[:1]

Unnamed: 0,const,cylinders,displacement,weight,acceleration,hp
0,1.0,8,307.0,3504.0,12.0,130.0


In [59]:
# Ordinary least squares with statsmodels: regress y on the design matrix X,
# then display the full regression summary table.
model1 = sm.OLS(endog=y, exog=X)
results = model1.fit()
results.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.708
Model:,OLS,Adj. R-squared:,0.704
Method:,Least Squares,F-statistic:,186.9
Date:,"Tue, 06 Mar 2018",Prob (F-statistic):,9.82e-101
Time:,16:16:35,Log-Likelihood:,-1120.1
No. Observations:,392,AIC:,2252.0
Df Residuals:,386,BIC:,2276.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,46.2643,2.669,17.331,0.000,41.016,51.513
cylinders,-0.3979,0.411,-0.969,0.333,-1.205,0.409
displacement,-8.313e-05,0.009,-0.009,0.993,-0.018,0.018
weight,-0.0052,0.001,-6.351,0.000,-0.007,-0.004
acceleration,-0.0291,0.126,-0.231,0.817,-0.276,0.218
hp,-0.0453,0.017,-2.716,0.007,-0.078,-0.012

0,1,2,3
Omnibus:,38.561,Durbin-Watson:,0.865
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.737
Skew:,0.706,Prob(JB):,3.53e-12
Kurtosis:,4.111,Cond. No.,38700.0


In [60]:
# compute rmse (degrees-of-freedom adjusted, i.e. the residual standard error)
# sqrt( sum of squared residuals / residual degrees of freedom )
# The denominator comes from the fitted model (386 for this fit, matching
# "Df Residuals" in the summary) rather than being hard-coded, so the cell stays
# correct if the rows or predictors change. A plain RMSE would divide by the
# number of observations instead.
y_hat = results.fittedvalues
(((y - y_hat) ** 2).sum() / results.df_resid) ** 0.5

4.247055448506921

In [61]:
# R-squared = 1 - (residual sum of squares / total sum of squares)
ss_residual = ((y - y_hat) ** 2).sum()
ss_total = ((y - y.mean()) ** 2).sum()
1 - ss_residual / ss_total

0.70769263267705163

# Computing statistics for the `cylinders` beta coefficient

In [62]:
# t-statistic for the null hypothesis that the `cylinders` coefficient is 0
# t = (beta - 0) / se
# Both inputs are the rounded values read from the `cylinders` row of the
# regression summary (the unrounded values are available as
# results.params['cylinders'] and results.bse['cylinders']).
beta_cyl = -0.3979  # coef
se_cyl = 0.411      # std err
t_stat = (beta_cyl - 0) / se_cyl
t_stat

-0.9681265206812653

In [63]:
# critical value
# Looked up in a standard normal table: two-sided test at the 5% level
# (97.5th percentile). With 386 residual degrees of freedom the exact
# t-distribution value is only slightly larger, so 1.96 is a close approximation.
critical_value = 1.96
critical_value

1.96

In [64]:
# p-value
# Two-sided p-value for the `cylinders` t-statistic, read from the P>|t| column
# of the regression summary. It is above 0.05, so the null hypothesis of a zero
# coefficient is not rejected at the 5% level.
p_value = 0.333
p_value

0.333

In [65]:
# compute the 95% confidence interval for the `cylinders` coefficient
# beta +/- critical_value * se
# Inputs are the rounded coefficient and standard error from the regression
# summary and the 1.96 normal critical value, so the bounds differ slightly in
# the third decimal from the [0.025, 0.975] columns of the summary.
beta_cyl = -0.3979  # coef
se_cyl = 0.411      # std err
z_crit = 1.96       # two-sided 5% critical value
ci_lower = beta_cyl - (z_crit * se_cyl)
ci_upper = beta_cyl + (z_crit * se_cyl)
ci_lower, ci_upper

(-1.20346, 0.40765999999999997)