In [114]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import warnings

# Suppress every FutureWarning so deprecation notices do not clutter cell output.
# NOTE(review): this also hides warnings about upcoming behaviour changes in
# the libraries used below.
warnings.filterwarnings("ignore", category=FutureWarning)
# Ask the inline figure backend for "retina" (high-DPI) output.
%config InlineBackend.figure_format = "retina"
# Make DataFrame/Series `.plot` calls render through plotly.
pd.options.plotting.backend = "plotly"

In [115]:
# Field names for the raw file, in file order. They are handed to `read_csv`
# explicitly, so the first line of the file is parsed as data, not as a header.
column_names = [
    "mpg", "cylinders", "displacement",
    "horsepower", "weight", "acceleration",
    "model_year", "origin", "car_name",
]

# Fields are separated by runs of whitespace; a literal "?" is read as missing.
df = pd.read_csv(
    "../data/auto-mpg.data",
    sep=r"\s+",
    na_values="?",
    names=column_names,
)

# Per-column missing-value report: absolute count and percentage of rows.
# A notebook only renders the *last* expression of a cell, and more code
# follows this statement, so the table is named and displayed explicitly
# instead of being computed and thrown away.
is_missing = df.isna()
null_summary = pd.DataFrame(
    {
        "Null Count": is_missing.sum(),
        "Null %": is_missing.mean().mul(100).round(2),
    }
)
display(null_summary)

# Split `acceleration` into 5 equal-width bins; the lowest-valued bin is
# labelled "fastest" and the highest-valued bin "slowest".
df["speed_category"] = pd.cut(
    df["acceleration"], 5, labels=["fastest", "fast", "medium", "slow", "slowest"]
)

# Fill missing horsepower with the mean horsepower of the row's speed bin,
# rounded to one decimal.
# `observed=True` is passed explicitly so the result does not depend on the
# (changing) pandas default for categorical group keys.
hp_by_speed = df.groupby("speed_category", observed=True)["horsepower"]

# Per-bin means, kept under the original name as a small lookup table.
mean_hp = hp_by_speed.mean().round(1)

# `transform` broadcasts each bin's mean back onto its rows, so the fill is a
# single vectorised `fillna` instead of a Python-level pass over every row.
# Rows without a speed bin simply stay NaN rather than raising a KeyError.
df["horsepower"] = df["horsepower"].fillna(hp_by_speed.transform("mean").round(1))

def categorize_year(year):
    """Map a model year onto an ordinal period code.

    Returns 0 for 70-73, 1 for 74-79 and 2 for 80-82 (bounds inclusive).
    Any value that falls in none of those ranges yields ``None``.
    """
    # (first year, last year, code) for each period, checked in order.
    periods = ((70, 73, 0), (74, 79, 1), (80, 82, 2))
    for first, last, code in periods:
        if first <= year <= last:
            return code
    return None

# Ordinal period code derived from the model year via `categorize_year`.
df["model_time_period"] = df["model_year"].apply(categorize_year)
# Binary flag: 1 where `origin` equals 1, otherwise 0 (presumably 1 is the US
# code, given the column name -- confirm against the data dictionary).
# A vectorised comparison replaces the per-element Python lambda.
df["origin_us"] = (df["origin"] == 1).astype("int64")

In [116]:
# Remove the columns that are not used as model inputs: the raw year/origin
# (replaced by `model_time_period` / `origin_us`), the car name, and the
# speed bin that was only needed for imputing horsepower.
# Reassigning instead of `inplace=True`, and tolerating columns that are
# already gone, keeps this cell safe to execute more than once.
df = df.drop(
    columns=["model_year", "origin", "car_name", "speed_category"],
    errors="ignore",
)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_time_period,origin_us
0,18.0,8,307.0,130.0,3504.0,12.0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,0,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,2,1
394,44.0,4,97.0,52.0,2130.0,24.6,2,0
395,32.0,4,135.0,84.0,2295.0,11.6,2,1
396,28.0,4,120.0,79.0,2625.0,18.6,2,1


In [180]:
from sklearn.model_selection import train_test_split

# `cylinders` and `displacement` are left out of the modelling frame.
model_frame = df.drop(columns=["cylinders", "displacement"])

# 80 % of the rows go to training; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(model_frame, train_size=0.8, random_state=42)

In [181]:
# NOTE: this cell overwrites columns of `df_train` in place, so executing it a
# second time would apply log1p and the scaling twice.

# Step 1: log1p-transform horsepower, weight and the target.
for col in ("horsepower", "weight", "mpg"):
    df_train[col] = np.log1p(df_train[col])

# Step 2: location/scale statistics of the training split, taken after the log
# transform (np.std uses ddof=0). They stay module-level names because the
# test split and the prediction step reuse them.
mpg_mean, mpg_std = np.mean(df_train["mpg"]), np.std(df_train["mpg"])
horsepower_mean, horsepower_std = (
    np.mean(df_train["horsepower"]),
    np.std(df_train["horsepower"]),
)
weight_mean, weight_std = np.mean(df_train["weight"]), np.std(df_train["weight"])
acceleration_mean, acceleration_std = (
    np.mean(df_train["acceleration"]),
    np.std(df_train["acceleration"]),
)

# Step 3: standardise each continuous column to zero mean / unit variance.
train_scaling = {
    "horsepower": (horsepower_mean, horsepower_std),
    "weight": (weight_mean, weight_std),
    "acceleration": (acceleration_mean, acceleration_std),
    "mpg": (mpg_mean, mpg_std),
}
for col, (center, scale) in train_scaling.items():
    df_train[col] = (df_train[col] - center) / scale

In [182]:
# Apply the same feature pipeline to the test split, reusing the statistics
# computed on the training split. The test target `mpg` is deliberately left
# on its raw scale here.

# log1p on the two columns that were log-transformed for training.
for col in ("horsepower", "weight"):
    df_test[col] = np.log1p(df_test[col])

# Standardise with the *training* mean / std of each column.
test_scaling = (
    ("horsepower", horsepower_mean, horsepower_std),
    ("weight", weight_mean, weight_std),
    ("acceleration", acceleration_mean, acceleration_std),
)
for col, center, scale in test_scaling:
    df_test[col] = (df_test[col] - center) / scale

In [183]:
# Detach the target from both frames. `pop` removes "mpg" from the frame
# itself, so after this cell `df_train` / `df_test` hold features only
# (and running the cell a second time would fail on the missing column).
y_train, y_test = (frame.pop("mpg").to_numpy() for frame in (df_train, df_test))

# Remaining columns become the design matrices.
X_train, X_test = df_train.to_numpy(), df_test.to_numpy()

# Sanity check of the resulting array shapes.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((318, 5), (318,), (80, 5), (80,))

In [184]:
# Row / column counts of the training design matrix; the unpacking requires
# `X_train` to be exactly two-dimensional.
n_samples, n_features = X_train.shape

In [185]:
# Show the transformed training features ("mpg" was popped off earlier).
df_train

Unnamed: 0,horsepower,weight,acceleration,model_time_period,origin_us
3,1.293338,0.659813,-1.319334,0,1
18,-0.301345,-1.047851,-0.413182,0,0
376,-1.069149,-1.228691,0.927922,2,0
248,-1.440887,-1.650016,0.275493,1,0
177,-0.072956,-0.207478,-0.231952,1,0
...,...,...,...,...,...
71,-0.010757,-0.726794,-0.775643,0,0
106,1.839990,1.627401,-1.138103,0,1
270,-0.072956,-0.453455,-0.304444,1,0
348,-1.343570,-1.184798,0.601707,2,0


In [214]:
# Elastic-net hyperparameters: `lambd` scales the whole penalty and `alpha`
# mixes the L1 part (alpha) against the L2 part (1 - alpha).
lambd = 0.1
alpha = 0.5
learning_rate = 0.01
n_iterations = 100


def elastic_net_loss(residual, coefs):
    """Return the objective whose (sub)gradient is followed in the loop below.

    Parameters
    ----------
    residual : array
        ``y - X @ coefs`` for the same ``coefs`` that are passed in.
    coefs : array
        Current coefficient vector.

    Returns
    -------
    float
        Mean squared residual plus
        ``lambd * (alpha * ||coefs||_1 + (1 - alpha) * ||coefs||_2^2)``.
        The data term is a *mean*, matching the ``2 / n_samples`` factor of
        the gradient's least-squares term.
    """
    penalty = alpha * np.sum(np.abs(coefs)) + (1 - alpha) * np.sum(np.square(coefs))
    return np.mean(np.square(residual)) + lambd * penalty


# NOTE(review): the model has no intercept column, and `model_time_period` /
# `origin_us` are not centred -- confirm that fitting through the origin is
# intended.
beta = np.zeros(n_features)

# Objective value at every visited `beta` (start point, then after each step),
# so convergence can be inspected instead of the value being overwritten.
loss_history = []

for _ in range(n_iterations):

    residual = y_train - X_train @ beta
    # Recorded before the update so residual and penalty use the same `beta`.
    loss_history.append(elastic_net_loss(residual, beta))

    ols_term = - 2 / n_samples * X_train.T @ residual
    # np.sign gives 0 at beta == 0, i.e. a valid subgradient of |beta| there.
    l1_term = lambd * alpha * np.sign(beta)
    l2_term = 2 * lambd * (1 - alpha) * beta

    gradient = ols_term + l1_term + l2_term
    beta -= learning_rate * gradient

# Objective at the final coefficients.
loss = elastic_net_loss(y_train - X_train @ beta, beta)
loss_history.append(loss)

# The model predicts standardised log1p(mpg); undo the scaling and the log so
# the score is computed against the raw-scale `y_test`.
y_pred = np.expm1(X_test @ beta * mpg_std + mpg_mean)
r2_score(y_test, y_pred)

0.8603362709195747