In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import warnings

# Hide FutureWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning)
# Set the inline backend's figure format to "retina".
%config InlineBackend.figure_format = "retina"

In [507]:
# Column labels supplied to the reader; the file itself is parsed without a header row.
column_names = [
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model_year", "origin", "car_name",
]

# Fields are separated by runs of whitespace; "?" is parsed as a missing value.
df = pd.read_csv(
    "../data/auto-mpg.data",
    sep=r"\s+",
    header=None,
    names=column_names,
    na_values="?",
)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [508]:
# Per-column missing-value summary: absolute count and percentage of rows.
null_mask = df.isna()
pd.DataFrame(
    {
        "Null Count": null_mask.sum().astype(int),
        "Null %": (null_mask.mean() * 100).round(2),
    }
)

Unnamed: 0,Null Count,Null %
mpg,0,0.0
cylinders,0,0.0
displacement,0,0.0
horsepower,6,1.51
weight,0,0.0
acceleration,0,0.0
model_year,0,0.0
origin,0,0.0
car_name,0,0.0


In [509]:
# Pairwise correlations of every column except the free-text car name.
df.drop("car_name", axis="columns").corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.0,-0.775396,-0.804203,-0.778427,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.842983,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.897257,0.932824,-0.543684,-0.370164,-0.609409
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171
weight,-0.831741,0.896017,0.932824,0.864538,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.689196,-0.417457,1.0,0.288137,0.205873
model_year,0.579267,-0.348746,-0.370164,-0.416361,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.455171,-0.581024,0.205873,0.180662,1.0


In [510]:
# Bucket acceleration into five bins; the lowest-valued bin is labelled "fastest".
speed_labels = ["fastest", "fast", "medium", "slow", "slowest"]
df["speed_category"] = pd.cut(df["acceleration"], bins=5, labels=speed_labels)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name,speed_category
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,fast
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,fast
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,fastest
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,fast
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,fastest
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl,medium
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup,slowest
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage,fast
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger,slow


In [511]:
# Non-null value count of every column within each speed bucket.
df.groupby(by="speed_category").count()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
speed_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
fastest,21,21,21,21,21,21,21,21,21
fast,136,136,136,135,136,136,136,136,136
medium,172,172,172,169,172,172,172,172,172
slow,58,58,58,56,58,58,58,58,58
slowest,11,11,11,11,11,11,11,11,11


In [None]:
# Average horsepower per speed bucket, rounded to one decimal place.
mean_hp = df.groupby("speed_category")["horsepower"].agg("mean").round(1)
mean_hp

In [None]:
# Fill missing horsepower with the (rounded) mean horsepower of the row's speed bucket.
# Vectorised replacement for a row-wise `df.apply(axis=1)`: look up each row's bucket
# mean once, then let `fillna` touch only the rows where horsepower is missing.
# The bucket labels are cast to plain objects so the lookup yields a float Series.
bucket_mean_hp = df["speed_category"].astype(object).map(mean_hp)
df["horsepower"] = df["horsepower"].fillna(bucket_mean_hp)

In [None]:
# Summary statistics, one row per numeric column.
df.describe().transpose()

In [None]:
# Correlation matrix without the car name and speed bucket columns.
df.drop(["car_name", "speed_category"], axis="columns").corr()

In [None]:
# Median mpg for each model year, drawn as a line chart.
median_mpg_by_year = df.groupby("model_year")["mpg"].agg("median")
px.line(data_frame=median_mpg_by_year)

In [None]:
def categorize_year(year):
    """Map a model year to a period code.

    Returns 0 for 70-73, 1 for 74-79, 2 for 80-82 (bounds inclusive),
    and None for any value outside those ranges.
    """
    period_bounds = ((70, 73, 0), (74, 79, 1), (80, 82, 2))
    for first_year, last_year, period_code in period_bounds:
        if first_year <= year <= last_year:
            return period_code
    return None

# Element-wise period code for every model year.
df["model_time_period"] = df["model_year"].map(categorize_year)

In [None]:
# Mean mpg per time period, rounded to two decimals.
df.groupby("model_time_period")["mpg"].mean().round(2)

In [None]:
# Mean and median mpg for each origin code.
df["mpg"].groupby(df["origin"]).agg(["mean", "median"])

In [None]:
# Mean mpg for every (origin, time period) combination, rounded to two decimals.
mpg_by_origin_period = df.groupby(["origin", "model_time_period"])["mpg"]
mpg_by_origin_period.mean().round(2)

In [None]:
# Correlation between the raw origin code and mpg.
df.loc[:, ["origin", "mpg"]].corr()

In [None]:
# Binary indicator: 1 where origin equals 1, 0 otherwise.
df["origin_us"] = (df["origin"] == 1).astype("int64")

In [None]:
# Compare how the raw origin code and the binary indicator correlate with mpg.
df.loc[:, ["mpg", "origin", "origin_us"]].corr()

In [None]:
# Remove the columns that have been replaced by derived features or are not used further.
# Rebinding (rather than dropping in place) and tolerating already-removed columns keeps
# the cell re-runnable: a second execution is a no-op instead of a KeyError.
df = df.drop(
    columns=["model_year", "origin", "car_name", "speed_category"],
    errors="ignore",
)
df

In [None]:
# Correlations among the remaining columns, excluding the two engineered indicators.
df.drop(["model_time_period", "origin_us"], axis="columns").corr().round(2)

In [None]:
# Preview of the frame without the target column (display only; `df` is not modified).
df.drop("mpg", axis="columns")

In [None]:
# Display the working DataFrame; the preceding `df.drop(columns="mpg")` was not assigned,
# so the `mpg` column is still present here.
df

In [None]:
# Summary statistics before the log transform, one row per column.
df.describe().transpose()

In [None]:
# Correlations on the untransformed scale, for comparison with the log-scale values.
df.loc[:, ["mpg", "horsepower", "weight"]].corr()

In [None]:
# Replace each listed column with log(1 + x).
# Note: the columns are overwritten, so running this cell twice applies the transform twice.
for log_col in ("horsepower", "weight", "mpg"):
    df[log_col] = np.log1p(df[log_col])

In [None]:
# Same correlations after the log transform.
df.loc[:, ["mpg", "horsepower", "weight"]].corr()

In [None]:
# Keep the target's mean and standard deviation so predictions can be mapped back later.
# Note: these are computed from the current `df["mpg"]`, which this cell then overwrites.
mpg_mean = np.mean(df["mpg"])
mpg_std = np.std(df["mpg"])

# Z-score the three continuous features, each with its own mean and standard deviation.
for feature in ("horsepower", "weight", "acceleration"):
    df[feature] = (df[feature] - np.mean(df[feature])) / np.std(df[feature])

df["mpg"] = (df["mpg"] - mpg_mean) / mpg_std

In [None]:
# Summary statistics after standardisation, one row per column.
df.describe().transpose()

In [None]:
# Read the target without mutating `df`, so the cell can be re-run (popping the column
# would raise KeyError on a second execution).
y = df["mpg"].to_numpy()

# Features: every remaining column except the target, cylinders and displacement.
# NOTE(review): no intercept column is added, and model_time_period / origin_us are not
# centred — confirm that a no-intercept fit is intended.
X = df.drop(columns=["mpg", "cylinders", "displacement"]).to_numpy()

In [None]:
# Dimensions of the design matrix and the target vector.
(X.shape, y.shape)

In [None]:
# Least-squares coefficients from the normal equations, using the pseudo-inverse of X^T X.
gram = X.T @ X
hat_projector = np.linalg.pinv(gram) @ X.T
beta = hat_projector @ y

In [None]:
# Dimensions of the design matrix and the coefficient vector.
(X.shape, beta.shape)

In [None]:
# R^2 of the fitted values against the standardised target.
y_hat = X @ beta
r2_score(y_true=y, y_pred=y_hat)

In [None]:
# One histogram per column of the design matrix.
for feature_values in X.T:
    fig = px.histogram(data_frame=feature_values)
    fig.show()

In [None]:
# Map target and predictions back to the original mpg scale: undo the z-scoring first,
# then undo the log transform. The columns were transformed with np.log1p, whose inverse
# is np.expm1; using np.exp here would shift every plotted value by +1 mpg.
actual_mpg = np.expm1(y * mpg_std + mpg_mean)
predicted_mpg = np.expm1((X @ beta) * mpg_std + mpg_mean)

fig = px.scatter(x=actual_mpg, y=predicted_mpg)
fig.update_layout(
    height=800,
    width=800,
    xaxis=dict(range=(5, 50), title="Actual mpg"),
    yaxis=dict(range=(5, 50), title="Predicted mpg"),
)

# Sanity checks

In [None]:
# Same normal-equation solution, computed with the explicit inverse of X^T X.
gram_inverse = np.linalg.inv(X.T @ X)
gram_inverse @ X.T @ y

In [None]:
# Covariance matrix of the features (columns of X are the variables).
np.cov(X.T)