In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [2]:
dataset_file = '../datasets/Auto.csv'
data = pd.read_csv(dataset_file, index_col='name')
data = data.loc[data.horsepower != '?']
data.horsepower = data.horsepower.apply(int)
data.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,70,1
buick skylark 320,15.0,8,350.0,165,3693,11.5,70,1
plymouth satellite,18.0,8,318.0,150,3436,11.0,70,1
amc rebel sst,16.0,8,304.0,150,3433,12.0,70,1
ford torino,17.0,8,302.0,140,3449,10.5,70,1


In [3]:
train_data = data.sample(frac=.5, random_state=1)
test_data = data.drop(train_data.index)
train_X = train_data.horsepower.values.reshape(-1, 1)
train_X_2 = np.hstack([train_X, train_X ** 2])
train_X_3 = np.hstack([train_X, train_X ** 2, train_X ** 3])
test_X = test_data.horsepower.values.reshape(-1, 1)
test_X_2 = np.hstack([test_X, test_X ** 2])
test_X_3 = np.hstack([test_X, test_X ** 2, test_X ** 3])
train_y = train_data.mpg
test_y = test_data.mpg

In [4]:
model = LinearRegression().fit(train_X, train_y)
(test_y - model.predict(test_X)).pow(2).mean()

25.01423339941708

In [5]:
model2 = LinearRegression().fit(train_X_2, train_y)
(test_y - model2.predict(test_X_2)).pow(2).mean()

20.942287489569363

In [6]:
model3 = LinearRegression().fit(train_X_3, train_y)
(test_y - model3.predict(test_X_3)).pow(2).mean()

20.87712085084548

In [7]:
train_data = data.sample(frac=.5, random_state=5)
test_data = data.drop(train_data.index)
train_X = train_data.horsepower.values.reshape(-1, 1)
train_X_2 = np.hstack([train_X, train_X ** 2])
train_X_3 = np.hstack([train_X, train_X ** 2, train_X ** 3])
test_X = test_data.horsepower.values.reshape(-1, 1)
test_X_2 = np.hstack([test_X, test_X ** 2])
test_X_3 = np.hstack([test_X, test_X ** 2, test_X ** 3])
train_y = train_data.mpg
test_y = test_data.mpg

In [8]:
model = LinearRegression().fit(train_X, train_y)
print((test_y - model.predict(test_X)).pow(2).mean())
model2 = LinearRegression().fit(train_X_2, train_y)
print((test_y - model2.predict(test_X_2)).pow(2).mean())
model3 = LinearRegression().fit(train_X_3, train_y)
print((test_y - model3.predict(test_X_3)).pow(2).mean())

20.44845821786757
15.436639885482204
15.353884023392903


In [9]:
loocv_error = 0
for i in range(test_X.shape[0]):
    model = LinearRegression().fit(np.delete(train_X, i).reshape(-1, 1), np.delete(train_y.to_list(), i))
    loocv_error += ((np.delete(test_y.to_list(), i) - model.predict(np.delete(test_X, i).reshape(-1, 1))) ** 2).mean()
loocv_error / train_X.shape[0]

15.231088419160079

In [10]:
loocv_error = 0
for i in range(test_X_2.shape[0]):
    model = LinearRegression().fit(np.delete(train_X_2, i, axis=0), np.delete(train_y.to_list(), i))
    loocv_error += ((np.delete(test_y.to_list(), i) - model.predict(np.delete(test_X_2, i, axis=0))) ** 2).mean()
loocv_error / train_X_2.shape[0]

11.499112188721568

In [11]:
loocv_error = 0
for i in range(test_X_3.shape[0]):
    model = LinearRegression().fit(np.delete(train_X_3, i, axis=0), np.delete(train_y.to_list(), i))
    loocv_error += ((np.delete(test_y.to_list(), i) - model.predict(np.delete(test_X_3, i, axis=0))) ** 2).mean()
loocv_error / train_X_3.shape[0]

11.438497535998787

In [12]:
from sklearn.model_selection import KFold

In [13]:
cv = KFold(10, shuffle=True, random_state=1)

In [22]:
cv_error = 0
X = train_X
y = train_y.values
for train, test in cv.split(X):
    model = LinearRegression().fit(X[train], y[train])
    cv_error += ((y[test] - model.predict(X[test])) ** 2).mean()
cv_error / 10

28.58442114404799

In [23]:
cv_error = 0
for train, test in cv.split(X):
    model = LinearRegression().fit(train_X_2[train], y[train])
    cv_error += ((y[test] - model.predict(train_X_2[test])) ** 2).mean()
cv_error / 10

23.99543085741586

In [24]:
cv_error = 0
for train, test in cv.split(X):
    model = LinearRegression().fit(train_X_3[train], y[train])
    cv_error += ((y[test] - model.predict(train_X_3[test])) ** 2).mean()
cv_error / 10

24.531021279265218

In [25]:
from sklearn.utils import resample

In [52]:
X_y = np.concatenate((X, y.reshape(-1, 1)), axis=1)

In [64]:
def get_b(data):
    model = LinearRegression().fit(data[:, 0].reshape(-1, 1), data[:, 1])
    return model.intercept_, model.coef_[0]

In [73]:
vals = []
for i in range(1000):
    vals.append(get_b(resample(X_y)))

b0 = [val[0] for val in vals]
b1 = [val[1] for val in vals]

print(np.mean(b0))
print(np.mean(b1))
print(np.std(b0))
print(np.std(b1))

41.59695249789293
-0.16929898341674215
1.2774482634250124
0.010664599954861695


In [72]:
model = sm.OLS(X_y[:, 1], sm.add_constant(X_y[:, 0])).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.61
Model:,OLS,Adj. R-squared:,0.608
Method:,Least Squares,F-statistic:,304.0
Date:,"Wed, 09 Sep 2020",Prob (F-statistic):,1.42e-41
Time:,17:21:13,Log-Likelihood:,-604.64
No. Observations:,196,AIC:,1213.0
Df Residuals:,194,BIC:,1220.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,41.4653,1.076,38.519,0.000,39.342,43.588
x1,-0.1679,0.010,-17.434,0.000,-0.187,-0.149

0,1,2,3
Omnibus:,5.707,Durbin-Watson:,2.098
Prob(Omnibus):,0.058,Jarque-Bera (JB):,5.592
Skew:,0.413,Prob(JB):,0.0611
Kurtosis:,3.046,Cond. No.,317.0


In [76]:
X = pd.concat([data.horsepower, data.horsepower ** 2], axis=1)
y = data.mpg

In [77]:
def get_b(X, y):
    model = LinearRegression().fit(X, y)
    return model.intercept_, model.coef_[0], model.coef_[1]

In [81]:
vals = []
for i in range(1000):
    idxs = resample(X.index)
    vals.append(get_b(X.loc[idxs], y.loc[idxs]))

b0 = [val[0] for val in vals]
b1 = [val[1] for val in vals]
b2 = [val[2] for val in vals]

print('Means')
print(np.mean(b0))
print(np.mean(b1))
print(np.mean(b2))
print('STDS')
print(np.std(b0))
print(np.std(b1))
print(np.std(b2))

Means
55.99870530989338
-0.46667992093406707
0.0012618840272627671
STDS
1.5143916500513315
0.024333354942184072
8.820324848180505e-05


In [83]:
model = sm.OLS(y, sm.add_constant(X)).fit()
model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.688
Model:,OLS,Adj. R-squared:,0.686
Method:,Least Squares,F-statistic:,428.0
Date:,"Wed, 09 Sep 2020",Prob (F-statistic):,5.4000000000000005e-99
Time:,17:37:47,Log-Likelihood:,-1133.2
No. Observations:,392,AIC:,2272.0
Df Residuals:,389,BIC:,2284.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,56.9001,1.800,31.604,0.000,53.360,60.440
horsepower,-0.4662,0.031,-14.978,0.000,-0.527,-0.405
horsepower,0.0012,0.000,10.080,0.000,0.001,0.001

0,1,2,3
Omnibus:,16.158,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30.662
Skew:,0.218,Prob(JB):,2.2e-07
Kurtosis:,4.299,Cond. No.,129000.0
