In [1352]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
%config InlineBackend.figure_format = "retina"

In [1353]:
column_names = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model_year",
    "origin",
    "car_name",
]

df = pd.read_csv("../data/auto-mpg.data", names=column_names, sep=r"\s+", na_values="?")
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [1354]:
df.isna().agg(["sum", "mean"]).T.apply(
    lambda x: x.astype(int) if x.name == "sum" else round(x * 100, 2)
).set_axis(["Null Count", "Null %"], axis=1)

Unnamed: 0,Null Count,Null %
mpg,0,0.0
cylinders,0,0.0
displacement,0,0.0
horsepower,6,1.51
weight,0,0.0
acceleration,0,0.0
model_year,0,0.0
origin,0,0.0
car_name,0,0.0


In [1355]:
df.drop(columns="car_name").corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.0,-0.775396,-0.804203,-0.778427,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.842983,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.897257,0.932824,-0.543684,-0.370164,-0.609409
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171
weight,-0.831741,0.896017,0.932824,0.864538,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.689196,-0.417457,1.0,0.288137,0.205873
model_year,0.579267,-0.348746,-0.370164,-0.416361,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.455171,-0.581024,0.205873,0.180662,1.0


In [1356]:
df["speed_category"] = pd.cut(
    df["acceleration"], 5, labels=["fastest", "fast", "medium", "slow", "slowest"]
)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name,speed_category
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,fast
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,fast
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,fastest
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,fast
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,fastest
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl,medium
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup,slowest
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage,fast
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger,slow


In [1357]:
df.groupby("speed_category").count()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
speed_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
fastest,21,21,21,21,21,21,21,21,21
fast,136,136,136,135,136,136,136,136,136
medium,172,172,172,169,172,172,172,172,172
slow,58,58,58,56,58,58,58,58,58
slowest,11,11,11,11,11,11,11,11,11


In [1358]:
mean_hp = df.groupby("speed_category")["horsepower"].mean().round(1)
mean_hp

speed_category
fastest    183.8
fast       123.7
medium      90.1
slow        80.2
slowest     61.6
Name: horsepower, dtype: float64

In [1359]:
df["horsepower"] = df.apply(
    lambda row: (
        mean_hp[row["speed_category"]]
        if pd.isna(row["horsepower"])
        else row["horsepower"]
    ),
    axis=1,
)

In [1360]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,398.0,23.514573,7.815984,9.0,17.5,23.0,29.0,46.6
cylinders,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
displacement,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
horsepower,398.0,104.287437,38.270143,46.0,76.0,92.0,125.0,230.0
weight,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acceleration,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
model_year,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
origin,398.0,1.572864,0.802055,1.0,1.0,1.0,2.0,3.0


In [1361]:
df.drop(columns=["car_name", "speed_category"]).corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.0,-0.775396,-0.804203,-0.77332,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.840348,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.895224,0.932824,-0.543684,-0.370164,-0.609409
horsepower,-0.77332,0.840348,0.895224,1.0,0.862604,-0.689582,-0.411728,-0.451782
weight,-0.831741,0.896017,0.932824,0.862604,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.689582,-0.417457,1.0,0.288137,0.205873
model_year,0.579267,-0.348746,-0.370164,-0.411728,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.451782,-0.581024,0.205873,0.180662,1.0


In [1362]:
median_mpg_by_year = df.groupby("model_year")["mpg"].median()
px.line(median_mpg_by_year)

In [1363]:
def categorize_year(year):
    if 70 <= year <= 73:
        return 0
    elif 74 <= year <= 79:
        return 1
    elif 80 <= year <= 82:
        return 2
    else:
        return None

df["model_time_period"] = df["model_year"].apply(categorize_year)

In [1364]:
df.groupby(["model_time_period"])["mpg"].mean().round(2)

model_time_period
0    18.53
1    22.84
2    31.91
Name: mpg, dtype: float64

In [1365]:
df.groupby("origin")["mpg"].agg(["mean", "median"])

Unnamed: 0_level_0,mean,median
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
1,20.083534,18.5
2,27.891429,26.5
3,30.450633,31.6


In [1366]:
df.groupby(["origin", "model_time_period"])["mpg"].mean().round(2)

origin  model_time_period
1       0                    16.03
        1                    20.38
        2                    28.21
2       0                    24.71
        1                    26.22
        2                    36.13
3       0                    24.67
        1                    28.86
        2                    34.40
Name: mpg, dtype: float64

In [1367]:
df[["origin", "mpg"]].corr()

Unnamed: 0,origin,mpg
origin,1.0,0.56345
mpg,0.56345,1.0


In [1368]:
df["origin_us"] = df["origin"].apply(lambda x: 1 if x == 1 else 0)

In [1369]:
df[["mpg", "origin", "origin_us"]].corr()

Unnamed: 0,mpg,origin,origin_us
mpg,1.0,0.56345,-0.568192
origin,0.56345,1.0,-0.924486
origin_us,-0.568192,-0.924486,1.0


In [1370]:
df.drop(columns=["model_year", "origin", "car_name", "speed_category"], inplace=True)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_time_period,origin_us
0,18.0,8,307.0,130.0,3504.0,12.0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,0,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,2,1
394,44.0,4,97.0,52.0,2130.0,24.6,2,0
395,32.0,4,135.0,84.0,2295.0,11.6,2,1
396,28.0,4,120.0,79.0,2625.0,18.6,2,1


In [1371]:
df.drop(columns=["model_time_period", "origin_us"]).corr().round(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
mpg,1.0,-0.78,-0.8,-0.77,-0.83,0.42
cylinders,-0.78,1.0,0.95,0.84,0.9,-0.51
displacement,-0.8,0.95,1.0,0.9,0.93,-0.54
horsepower,-0.77,0.84,0.9,1.0,0.86,-0.69
weight,-0.83,0.9,0.93,0.86,1.0,-0.42
acceleration,0.42,-0.51,-0.54,-0.69,-0.42,1.0


In [1372]:
df.drop(columns="mpg")


Output cache limit (currently 1000 entries) hit.
Flushing oldest 200 entries.



Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_time_period,origin_us
0,8,307.0,130.0,3504.0,12.0,0,1
1,8,350.0,165.0,3693.0,11.5,0,1
2,8,318.0,150.0,3436.0,11.0,0,1
3,8,304.0,150.0,3433.0,12.0,0,1
4,8,302.0,140.0,3449.0,10.5,0,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790.0,15.6,2,1
394,4,97.0,52.0,2130.0,24.6,2,0
395,4,135.0,84.0,2295.0,11.6,2,1
396,4,120.0,79.0,2625.0,18.6,2,1


In [1373]:
df[["mpg", "horsepower", "weight"]].corr()

Unnamed: 0,mpg,horsepower,weight
mpg,1.0,-0.77332,-0.831741
horsepower,-0.77332,1.0,0.862604
weight,-0.831741,0.862604,1.0


In [1374]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_time_period,origin_us
0,18.0,8,307.0,130.0,3504.0,12.0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,0,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,2,1
394,44.0,4,97.0,52.0,2130.0,24.6,2,0
395,32.0,4,135.0,84.0,2295.0,11.6,2,1
396,28.0,4,120.0,79.0,2625.0,18.6,2,1


In [1375]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    df.drop(columns=["cylinders", "displacement"]), random_state=42, train_size=0.8
)

In [1376]:
df_train.shape, df_test.shape

((318, 6), (80, 6))

In [1377]:
df_train["horsepower"] = np.log1p(df_train["horsepower"])
df_train["weight"] = np.log1p(df_train["weight"])
df_train["mpg"] = np.log1p(df_train["mpg"])

In [1378]:
df[["mpg", "horsepower", "weight"]].corr()

Unnamed: 0,mpg,horsepower,weight
mpg,1.0,-0.77332,-0.831741
horsepower,-0.77332,1.0,0.862604
weight,-0.831741,0.862604,1.0


In [1379]:
mpg_mean = np.mean(df_train["mpg"])
horsepower_mean = np.mean(df_train["horsepower"])
weight_mean = np.mean(df_train["weight"])
acceleration_mean = np.mean(df_train["acceleration"])

mpg_std = np.std(df_train["mpg"])
horsepower_std = np.std(df_train["horsepower"])
weight_std = np.std(df_train["weight"])
acceleration_std = np.std(df_train["acceleration"])

df_train["horsepower"] = (df_train["horsepower"] - horsepower_mean) / horsepower_std
df_train["weight"] = (df_train["weight"] - weight_mean) / weight_std
df_train["acceleration"] = (df_train["acceleration"] - acceleration_mean) / acceleration_std
df_train["mpg"] = (df_train["mpg"] - mpg_mean) / mpg_std

In [1380]:
df_train

Unnamed: 0,mpg,horsepower,weight,acceleration,model_time_period,origin_us
3,-0.973521,1.293338,0.659813,-1.319334,0,1
18,0.556341,-0.301345,-1.047851,-0.413182,0,0
376,1.492614,-1.069149,-1.228691,0.927922,2,0
248,1.419127,-1.440887,-1.650016,0.275493,1,0
177,0.083729,-0.072956,-0.207478,-0.231952,1,0
...,...,...,...,...,...,...
71,-0.475252,-0.010757,-0.726794,-0.775643,0,0
106,-1.795994,1.839990,1.627401,-1.138103,0,1
270,-0.169135,-0.072956,-0.453455,-0.304444,1,0
348,1.548578,-1.343570,-1.184798,0.601707,2,0


In [1381]:
df_test["horsepower"] = np.log1p(df_test["horsepower"])
df_test["weight"] = np.log1p(df_test["weight"])
# df_test["mpg"] = np.log1p(df_test["mpg"])

In [1382]:
df_test["horsepower"] = (df_test["horsepower"] - horsepower_mean) / horsepower_std
df_test["weight"] = (df_test["weight"] - weight_mean) / weight_std
df_test["acceleration"] = (df_test["acceleration"] - acceleration_mean) / acceleration_std
# df_test["mpg"] = (df_test["mpg"] - mpg_mean) / mpg_std

In [1383]:
df_test

Unnamed: 0,mpg,horsepower,weight,acceleration,model_time_period,origin_us
198,33.0,-1.808574,-1.659965,0.637953,1,0
396,28.0,-0.622939,-0.300304,1.072906,2,1
33,19.0,0.080202,-0.288059,-0.956873,0,1
208,13.0,1.293338,1.152672,-0.884381,1,1
93,14.0,1.293338,1.412712,-0.413182,0,1
...,...,...,...,...,...,...
249,19.9,0.364994,0.588230,-0.050722,1,1
225,17.5,0.364994,0.749357,0.275493,1,1
367,28.0,-0.301345,-0.327667,1.435367,2,1
175,29.0,-0.982955,-1.387623,-0.594413,1,0


In [1384]:
df_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,80.0,23.1425,7.378823,10.0,17.375,23.0,27.25,44.0
horsepower,80.0,0.135068,1.087825,-1.86496,-0.651763,-0.072956,0.864738,2.575791
weight,80.0,0.00231,1.020669,-1.659965,-0.865344,-0.156724,0.934783,1.972852
acceleration,80.0,-0.129557,0.990925,-2.587946,-0.748458,-0.050722,0.357046,3.247669
model_time_period,80.0,0.825,0.775821,0.0,0.0,1.0,1.0,2.0
origin_us,80.0,0.6625,0.47584,0.0,0.0,1.0,1.0,1.0


In [1385]:
y_train = df_train.pop("mpg").to_numpy()
X_train = df_train.to_numpy()

In [1386]:
y_test = df_test.pop("mpg").to_numpy()
X_test = df_test.to_numpy()

In [1387]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((318, 5), (318,), (80, 5), (80,))

In [1388]:
beta = (np.linalg.pinv(X_train.T @ X_train) @ X_train.T) @ y_train

In [1389]:
beta.shape

(5,)

In [1294]:
y_pred = np.expm1(X_test @ beta * mpg_std + mpg_mean)

In [1295]:
round(r2_score(y_test, y_pred), 3)

0.856

In [1296]:
for col in range(X_train.shape[1]):
    fig = px.histogram(data_frame=X_train[:, col])
    fig.show()

In [1303]:
fig = px.scatter(x=y_test, y=y_pred)
fig.update_layout(height=600, width=1000, xaxis=dict(range=(0, 50)), yaxis=dict(range=(0, 50)))
fig.show()

In [1304]:
residuals = y_pred - y_test

In [1311]:
np.diag(residuals ** 2)

array([[2.56628676e+01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.18921678e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.88600599e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        6.88331637e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.11606461e+01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.81695460e-04]], shape=(80, 80))

In [None]:
W = residuals ** 2

# Some Testing

In [None]:
np.linalg.inv(X.T @ X) @ X.T @ y

In [None]:
np.cov(X, rowvar=False)