In [34]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import warnings

# Ignore every FutureWarning raised after this point.
# NOTE(review): this also hides deprecation notices from pandas and other
# libraries used below — confirm that is intended.
warnings.filterwarnings("ignore", category=FutureWarning)
# Set the inline backend's figure format to "retina".
%config InlineBackend.figure_format = "retina"

In [35]:
# Column labels for the raw file, in the order they are handed to read_csv.
column_names = (
    "mpg cylinders displacement horsepower weight "
    "acceleration model_year origin car_name"
).split()

# Whitespace-delimited read with explicit column names; "?" entries become NaN.
df = pd.read_csv(
    "../data/auto-mpg.data",
    sep=r"\s+",
    names=column_names,
    na_values="?",
)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [36]:
# Per-column missing-value report: absolute count and percentage of rows (2 decimals).
pd.DataFrame(
    {
        "Null Count": df.isna().sum().astype(int),
        "Null %": df.isna().mean().mul(100).round(2),
    }
)

Unnamed: 0,Null Count,Null %
mpg,0,0.0
cylinders,0,0.0
displacement,0,0.0
horsepower,6,1.51
weight,0,0.0
acceleration,0,0.0
model_year,0,0.0
origin,0,0.0
car_name,0,0.0


In [37]:
# Pairwise Pearson correlations of every column except the car name.
df.drop("car_name", axis=1).corr(method="pearson")

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.0,-0.775396,-0.804203,-0.778427,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.842983,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.897257,0.932824,-0.543684,-0.370164,-0.609409
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171
weight,-0.831741,0.896017,0.932824,0.864538,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.689196,-0.417457,1.0,0.288137,0.205873
model_year,0.579267,-0.348746,-0.370164,-0.416361,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.455171,-0.581024,0.205873,0.180662,1.0


In [38]:
# Cut the acceleration range into five bins; the lowest bin is labelled
# "fastest" and the highest "slowest".
speed_bins = pd.cut(
    df["acceleration"],
    bins=5,
    labels=["fastest", "fast", "medium", "slow", "slowest"],
)
df["speed_category"] = speed_bins
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name,speed_category
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,fast
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,fast
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,fastest
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,fast
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,fastest
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl,medium
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup,slowest
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage,fast
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger,slow


In [39]:
# Mean horsepower per speed bin, rounded to one decimal.
# observed=False is spelled out so every bin label stays in the result's index
# no matter what the installed pandas version uses as its default for
# categorical groupers (the default is otherwise only signalled by a
# FutureWarning).
mean_hp = (
    df.groupby("speed_category", observed=False)["horsepower"]
    .mean()
    .round(1)
)
mean_hp

speed_category
fastest    183.8
fast       123.7
medium      90.1
slow        80.2
slowest     61.6
Name: horsepower, dtype: float64

In [40]:
# Fill missing horsepower values with the mean horsepower of the row's speed
# bin; rows that already have a value are left unchanged.
# The bin -> mean lookup is done for the whole column at once instead of
# calling a Python lambda on every row.
bin_mean_hp = df["speed_category"].map(mean_hp).astype(float)
df["horsepower"] = df["horsepower"].fillna(bin_mean_hp)

In [41]:
# Summary statistics, one row per numeric column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,398.0,23.514573,7.815984,9.0,17.5,23.0,29.0,46.6
cylinders,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
displacement,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
horsepower,398.0,104.287437,38.270143,46.0,76.0,92.0,125.0,230.0
weight,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acceleration,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
model_year,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
origin,398.0,1.572864,0.802055,1.0,1.0,1.0,2.0,3.0


In [42]:
# Correlation matrix again, now that horsepower has no missing values.
df.drop(["car_name", "speed_category"], axis=1).corr(method="pearson")

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.0,-0.775396,-0.804203,-0.77332,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.840348,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.895224,0.932824,-0.543684,-0.370164,-0.609409
horsepower,-0.77332,0.840348,0.895224,1.0,0.862604,-0.689582,-0.411728,-0.451782
weight,-0.831741,0.896017,0.932824,0.862604,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.689582,-0.417457,1.0,0.288137,0.205873
model_year,0.579267,-0.348746,-0.370164,-0.411728,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.451782,-0.581024,0.205873,0.180662,1.0


In [43]:
# Median mpg for each model year, drawn as a line chart.
median_mpg_by_year = df["mpg"].groupby(df["model_year"]).median()
px.line(median_mpg_by_year)

In [44]:
def categorize_year(year):
    """Map a two-digit model year to a period code.

    Returns 0 for 70-73, 1 for 74-79, 2 for 80-82 (bounds inclusive), and
    None for any value that falls in none of those ranges.
    """
    periods = ((70, 73, 0), (74, 79, 1), (80, 82, 2))
    for first, last, code in periods:
        if first <= year <= last:
            return code
    return None

# Period code (see categorize_year) for every row's model year.
df["model_time_period"] = df["model_year"].map(categorize_year)

In [45]:
# Average mpg within each model-year period, to two decimals.
df.groupby("model_time_period")["mpg"].mean().round(2)

model_time_period
0    18.53
1    22.84
2    31.91
Name: mpg, dtype: float64

In [46]:
# Mean and median mpg for each origin code.
df.groupby("origin").agg(mean=("mpg", "mean"), median=("mpg", "median"))

Unnamed: 0_level_0,mean,median
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
1,20.083534,18.5
2,27.891429,26.5
3,30.450633,31.6


In [47]:
# Average mpg for every (origin, period) combination, to two decimals.
df.groupby(["origin", "model_time_period"])["mpg"].agg("mean").round(2)

origin  model_time_period
1       0                    16.03
        1                    20.38
        2                    28.21
2       0                    24.71
        1                    26.22
        2                    36.13
3       0                    24.67
        1                    28.86
        2                    34.40
Name: mpg, dtype: float64

In [48]:
# Correlation between the raw origin code and mpg.
df.loc[:, ["origin", "mpg"]].corr()

Unnamed: 0,origin,mpg
origin,1.0,0.56345
mpg,0.56345,1.0


In [49]:
# Binary flag: 1 where origin == 1, 0 for every other origin code.
# A vectorized comparison replaces the per-element Python lambda.
df["origin_us"] = df["origin"].eq(1).astype("int64")

In [50]:
# Compare how the raw origin code and the binary origin_us flag correlate with mpg.
df.loc[:, ["mpg", "origin", "origin_us"]].corr()

Unnamed: 0,mpg,origin,origin_us
mpg,1.0,0.56345,-0.568192
origin,0.56345,1.0,-0.924486
origin_us,-0.568192,-0.924486,1.0


In [51]:
# Remove the columns superseded by derived features (model_year -> model_time_period,
# origin -> origin_us), plus car_name and the helper speed_category column.
# Rebinding instead of inplace=True, together with errors="ignore", keeps the
# cell re-runnable: a second execution no longer raises KeyError for columns
# that are already gone.
df = df.drop(
    columns=["model_year", "origin", "car_name", "speed_category"],
    errors="ignore",
)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_time_period,origin_us
0,18.0,8,307.0,130.0,3504.0,12.0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,0,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,2,1
394,44.0,4,97.0,52.0,2130.0,24.6,2,0
395,32.0,4,135.0,84.0,2295.0,11.6,2,1
396,28.0,4,120.0,79.0,2625.0,18.6,2,1


In [52]:
# Correlations among the remaining columns, leaving out the two encoded
# features, rounded to two decimals.
df.drop(["model_time_period", "origin_us"], axis="columns").corr().round(decimals=2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
mpg,1.0,-0.78,-0.8,-0.77,-0.83,0.42
cylinders,-0.78,1.0,0.95,0.84,0.9,-0.51
displacement,-0.8,0.95,1.0,0.9,0.93,-0.54
horsepower,-0.77,0.84,0.9,1.0,0.86,-0.69
weight,-0.83,0.9,0.93,0.86,1.0,-0.42
acceleration,0.42,-0.51,-0.54,-0.69,-0.42,1.0


In [53]:
# Target vector (mpg) and feature matrix (every other column except cylinders
# and displacement) as NumPy arrays.
# Both are selected without mutating df, so the cell can be re-run; the
# previous df.pop("mpg") removed the column and failed on a second execution.
y = df["mpg"].to_numpy()
X = df.drop(columns=["mpg", "cylinders", "displacement"]).to_numpy()

In [54]:
# Sanity check: X and y should have the same number of rows.
(X.shape, y.shape)

((398, 5), (398,))

In [55]:
import numpy as np

In [56]:
# Ordinary least-squares coefficients for y ≈ X @ beta.
# lstsq solves the least-squares problem directly rather than forming
# inv(X.T @ X), which loses accuracy when columns of X are strongly correlated
# and fails outright when X.T @ X is singular.
# NOTE(review): X appears to hold only the feature columns (no column of
# ones), so this fit has no intercept term — confirm that is intended.
beta, *_ = np.linalg.lstsq(X, y, rcond=None)

In [57]:
# Sanity check: beta should have one coefficient per column of X.
(X.shape, beta.shape)

((398, 5), (5,))

In [58]:
from sklearn.metrics import r2_score

# Coefficient of determination of the fitted values against the observed target.
r2_score(y_true=y, y_pred=X @ beta)

0.6507421172572578

In [70]:
# Predicted vs. observed target on a square canvas with identical 0-50 ranges
# on both axes. Axis titles are set explicitly because the inputs are bare
# arrays and would otherwise be labelled just "x" and "y".
fig = px.scatter(x=y, y=X @ beta)
fig.update_layout(
    height=800,
    width=800,
    xaxis=dict(range=(0, 50), title=dict(text="Actual mpg")),
    yaxis=dict(range=(0, 50), title=dict(text="Predicted mpg")),
)