In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

In [2]:
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)

In [3]:
# Load the `Auto` dataset through ISLP's `load_data` helper; the rest of the
# notebook reads its columns (`mpg`, `horsepower`, ...) from this object.
Auto = load_data('Auto')

In [4]:
# Display the data to check the available columns before modelling.
Auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
388,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
389,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
390,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


## Simple linear regression in Python
We will now build a simple linear regression model that predicts `mpg` from `horsepower`:

### Step 1: Build design matrix

In [9]:
# Design matrix for the simple regression: a constant column of ones (the
# intercept term) placed next to the single predictor `horsepower`.
n_obs = len(Auto)
X = pd.DataFrame({
    'intercept': np.ones(n_obs),
    'horsepower': Auto['horsepower'],
})
X

Unnamed: 0,intercept,horsepower
0,1.0,130
1,1.0,165
2,1.0,150
3,1.0,150
4,1.0,140
...,...,...
387,1.0,86
388,1.0,52
389,1.0,84
390,1.0,79


### Step 2: Build response variable

In [11]:
# Response vector: the `mpg` column of the data.
y = Auto.loc[:, 'mpg']
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
387    27.0
388    44.0
389    32.0
390    28.0
391    31.0
Name: mpg, Length: 392, dtype: float64

### Step 3: Define and train model

In [12]:
# Specify (but do not yet fit) a statsmodels ordinary least squares model:
# `y` is the response and `X` is the design matrix of predictors.
model = sm.OLS(endog=y, exog=X)

In [18]:
results = model.fit()  # fit the OLS model specified above; `results` holds the
# estimated coefficients and the statistics shown in the summaries below

### Step 4: Analyze the model

In [19]:
# Compact coefficient table from ISLP's `summarize` helper
# (columns: coef, std err, t, P>|t|).
summarize(results)

  results_table = pd.read_html(tab.as_html(),


Unnamed: 0,coef,std err,t,P>|t|
intercept,39.9359,0.717,55.66,0.0
horsepower,-0.1578,0.006,-24.489,0.0


In [15]:
# Full statsmodels report: fit statistics (R-squared, F-statistic, AIC/BIC),
# the coefficient table with 95% confidence intervals, and residual diagnostics.
results.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.605
Method:,Least Squares,F-statistic:,599.7
Date:,"Mon, 06 May 2024",Prob (F-statistic):,7.03e-81
Time:,18:24:57,Log-Likelihood:,-1178.7
No. Observations:,392,AIC:,2361.0
Df Residuals:,390,BIC:,2369.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,39.9359,0.717,55.660,0.000,38.525,41.347
horsepower,-0.1578,0.006,-24.489,0.000,-0.171,-0.145

0,1,2,3
Omnibus:,16.432,Durbin-Watson:,0.92
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.305
Skew:,0.492,Prob(JB):,0.000175
Kurtosis:,3.299,Cond. No.,322.0


**Question**: How should we interpret the estimated `horsepower` coefficient of -0.1578?

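- There is a negative relationship between `mpg` and `horsepower`: cars with more horsepower tend to have lower fuel efficiency.
- On average, a one-unit increase in `horsepower` is associated with a decrease in `mpg` of 0.1578 (roughly 1.6 mpg per additional 10 horsepower).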

## Multiple linear regression in Python

In [21]:
# Use every column as a predictor except the response (`mpg`) and the two
# columns left out of this model (`origin` and `name`).
excluded = ['mpg', 'origin', 'name']
predictors = Auto.columns.drop(excluded)
predictors

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'year'],
      dtype='object')

In [39]:
# Create the design matrix for the multiple regression with ISLP's ModelSpec
# (`MS`), built from the columns listed in `predictors`.
# NOTE: this rebinds `X`, replacing the single-predictor design matrix built
# for the simple regression earlier in the notebook.
design = MS(predictors)
X = design.fit_transform(Auto)
X

  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:


Unnamed: 0,intercept,cylinders,displacement,horsepower,weight,acceleration,year
0,1.0,8,307.0,130,3504,12.0,70
1,1.0,8,350.0,165,3693,11.5,70
2,1.0,8,318.0,150,3436,11.0,70
3,1.0,8,304.0,150,3433,12.0,70
4,1.0,8,302.0,140,3449,10.5,70
...,...,...,...,...,...,...,...
387,1.0,4,140.0,86,2790,15.6,82
388,1.0,4,97.0,52,2130,24.6,82
389,1.0,4,135.0,84,2295,11.6,82
390,1.0,4,120.0,79,2625,18.6,82


In [40]:
# Response vector for the multiple regression: the `mpg` column
# (the same response as in the simple regression).
y = Auto.loc[:, 'mpg']

In [41]:
# Specify the multiple regression of `y` on all columns of `X`, fit it by
# ordinary least squares, and display the full statsmodels summary.
model2 = sm.OLS(endog=y, exog=X)
results2 = model2.fit()
results2.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.809
Model:,OLS,Adj. R-squared:,0.806
Method:,Least Squares,F-statistic:,272.2
Date:,"Mon, 06 May 2024",Prob (F-statistic):,3.79e-135
Time:,19:27:42,Log-Likelihood:,-1036.5
No. Observations:,392,AIC:,2087.0
Df Residuals:,385,BIC:,2115.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-14.5353,4.764,-3.051,0.002,-23.902,-5.169
cylinders,-0.3299,0.332,-0.993,0.321,-0.983,0.323
displacement,0.0077,0.007,1.044,0.297,-0.007,0.022
horsepower,-0.0004,0.014,-0.028,0.977,-0.028,0.027
weight,-0.0068,0.001,-10.141,0.000,-0.008,-0.005
acceleration,0.0853,0.102,0.836,0.404,-0.115,0.286
year,0.7534,0.053,14.318,0.000,0.650,0.857

0,1,2,3
Omnibus:,37.865,Durbin-Watson:,1.232
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60.248
Skew:,0.63,Prob(JB):,8.26e-14
Kurtosis:,4.449,Cond. No.,85300.0
