In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import optuna
from optuna import Trial

## Importing Our Data

In [2]:
# Load the raw auto-mpg table; `names` supplies the column labels, in file order.
df = pd.read_fwf(
    filepath_or_buffer="./data/auto-mpg.data",
    names=[
        "mpg",
        "cylinders",
        "displacement",
        "horsepower",
        "weight",
        "acceleration",
        "model_year",
        "origin",
        "car_name",
    ],
)

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [3]:
# Build the cleaned frame in a single chained expression: `df` is never
# mutated, so this cell can be re-run safely and always yields the same result.
cleaned_df = (
    df.assign(
        # errors="coerce" turns every non-numeric entry (including the "?"
        # placeholder) into NaN, so no separate replace step is needed.
        horsepower=lambda frame: pd.to_numeric(frame["horsepower"], errors="coerce"),
        # Strip the double quotes wrapped around each name. regex=False makes
        # the literal (non-pattern) replacement explicit across pandas versions.
        car_name=lambda frame: frame["car_name"].str.replace("\"", "", regex=False),
    )
    # Drop every row that still has a missing value in any column.
    .dropna()
)

cleaned_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


## Try Different Models

In [4]:
# Predictor columns used for the model-selection study (the target is "horsepower").
selected_features = [
    "weight",
    "cylinders",
    "mpg",
    "displacement",
    "acceleration",
]

In [5]:
# Maps the "model_type" choice suggested by Optuna to the estimator class
# (not an instance) that the objective instantiates for each trial.
MODEL_DICT = {
    "ridge": Ridge,
    "lasso": Lasso,
    "ols": LinearRegression,
}

In [6]:
# Feature matrix and target, copied so later edits cannot touch `cleaned_df`.
X = cleaned_df.loc[:, selected_features].copy()
y = cleaned_df.loc[:, "horsepower"].copy()

# Five seeded random splits, each holding out 20% of the rows for testing.
cv_splitter = ShuffleSplit(test_size=0.2, n_splits=5, random_state=42)

def model_selection_objective(trial: Trial):
    """Optuna objective: score one linear-model configuration by cross-validation.

    The trial picks a "model_type" ("ridge", "lasso" or "ols"). For the two
    regularised models it also picks "alpha" (0.5 to 3.0 in steps of 0.5) and
    "tol" (0.0001 to 0.001 in steps of 0.0001). The chosen estimator is placed
    behind a StandardScaler in a Pipeline and evaluated on the module-level
    `X`, `y` with `cv_splitter`.

    Returns:
        The mean of the per-split test scores reported by `cross_validate`
        (no `scoring` is passed, so the estimator's default scorer is used).
    """
    chosen_name = trial.suggest_categorical("model_type", ["ridge", "lasso", "ols"])
    estimator_cls = MODEL_DICT[chosen_name]

    # Only the regularised models take hyperparameters; OLS is built with defaults.
    hyperparams = {}
    if chosen_name in ("ridge", "lasso"):
        hyperparams["alpha"] = trial.suggest_float("alpha", 0.5, 3.0, step=0.5)
        hyperparams["tol"] = trial.suggest_float("tol", 0.0001, 0.001, step=0.0001)
        hyperparams["random_state"] = 42
    estimator = estimator_cls(**hyperparams)

    # Scaling lives inside the pipeline so it is re-fit on each training split.
    pipeline = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", estimator),
    ])

    fold_scores = cross_validate(pipeline, X, y, cv=cv_splitter)["test_score"]
    return fold_scores.mean()
    

In [7]:
# Model-selection study, maximising the mean cross-validated score.
# - storage: uses the same "optuna_studies.db" SQLite file as the feature-set
#   study, so both studies are kept in one database.
# - sampler: seeded so the sequence of suggested parameters is reproducible.
# - load_if_exists=True: a stored study with this name is reused, so
#   re-running this cell adds another 100 trials to it.
study = optuna.create_study(
    study_name="auto_mpg_model_selection",
    direction="maximize",
    storage="sqlite:///optuna_studies.db",
    sampler=optuna.samplers.TPESampler(seed=42),
    load_if_exists=True
)
study.optimize(model_selection_objective, n_trials=100, show_progress_bar=True)

[I 2025-04-01 14:43:49,284] A new study created in RDB with name: auto_mpg_model_selection


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-04-01 14:43:49,539] Trial 0 finished with value: 0.8917420179951939 and parameters: {'model_type': 'lasso', 'alpha': 2.5, 'tol': 0.0001}. Best is trial 0 with value: 0.8917420179951939.
[I 2025-04-01 14:43:49,689] Trial 1 finished with value: 0.8953674695550781 and parameters: {'model_type': 'ols'}. Best is trial 1 with value: 0.8953674695550781.
[I 2025-04-01 14:43:49,836] Trial 2 finished with value: 0.8953674695550781 and parameters: {'model_type': 'ols'}. Best is trial 1 with value: 0.8953674695550781.
[I 2025-04-01 14:43:49,973] Trial 3 finished with value: 0.8953674695550781 and parameters: {'model_type': 'ols'}. Best is trial 1 with value: 0.8953674695550781.
[I 2025-04-01 14:43:50,149] Trial 4 finished with value: 0.8955734971194529 and parameters: {'model_type': 'ridge', 'alpha': 1.5, 'tol': 0.0008}. Best is trial 4 with value: 0.8955734971194529.
[I 2025-04-01 14:43:50,324] Trial 5 finished with value: 0.8896585806009559 and parameters: {'model_type': 'lasso', 'alpha'

In [8]:
# Parameters of the best trial in the model-selection study.
study.best_params

{'model_type': 'ridge', 'alpha': 3.0, 'tol': 0.0004}

In [9]:
# Objective value (mean cross-validated score) of that best trial.
study.best_value

0.8956529576574926

In [10]:
# Slice plot of the objective value for each "model_type" choice.
optuna.visualization.plot_slice(study, params=["model_type"])

## Try Different Feature Sets

In [11]:
# Candidate predictor subsets, keyed by the name Optuna suggests per trial.
FEATURE_SETS = {
    "wide_set": [
        "weight",
        "cylinders",
        "mpg",
        "displacement",
        "acceleration",
    ],
    "narrow_set": [
        "acceleration",
        "mpg",
        "cylinders",
    ],
    "moderate_set": [
        "weight",
        "acceleration",
        "mpg",
        "displacement",
    ],
}

In [12]:
# Five seeded random splits with a 20% test share; same settings as the
# splitter defined for the model-selection objective.
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

def feature_set_selection_objective(trial: Trial):
    """Optuna objective: score a Ridge model on one of the named feature sets.

    The trial picks "alpha" (0.5 to 6.0), "tol" (0.0001 to 0.001) and a
    "feature_set_name" key of `FEATURE_SETS`. A StandardScaler + Ridge
    pipeline is cross-validated with the module-level `cv_splitter`, using
    the chosen columns of `cleaned_df` to predict "horsepower".

    Returns:
        The mean of the per-split test scores reported by `cross_validate`
        (no `scoring` is passed, so the estimator's default scorer is used).
    """
    # Suggest in a fixed order: alpha, tol, then the feature set.
    ridge_alpha = trial.suggest_float("alpha", 0.5, 6.0)
    ridge_tol = trial.suggest_float("tol", 0.0001, 0.001)
    chosen_set = trial.suggest_categorical(
        "feature_set_name", ["wide_set", "narrow_set", "moderate_set"]
    )

    features = cleaned_df[FEATURE_SETS[chosen_set]].copy()
    target = cleaned_df["horsepower"].copy()

    # Scaling lives inside the pipeline so it is re-fit on each training split.
    pipeline = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=ridge_alpha, tol=ridge_tol, random_state=42)),
    ])

    fold_scores = cross_validate(pipeline, features, target, cv=cv_splitter)["test_score"]
    return fold_scores.mean()

In [13]:
# Feature-set study, maximising the mean cross-validated score.
# - sampler: seeded so the sequence of suggested parameters is reproducible.
# - load_if_exists=True: a stored study with this name is reused, so
#   re-running this cell adds another 100 trials to it.
fs_study = optuna.create_study(
    study_name="auto_mpg_feature_set_selection_corrected",
    direction="maximize",
    storage="sqlite:///optuna_studies.db",
    sampler=optuna.samplers.TPESampler(seed=42),
    load_if_exists=True
)
fs_study.optimize(feature_set_selection_objective, n_trials=100, show_progress_bar=True)

[I 2025-04-01 14:47:59,855] A new study created in RDB with name: auto_mpg_feature_set_selection_corrected


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-04-01 14:48:00,094] Trial 0 finished with value: 0.8947175195177899 and parameters: {'alpha': 2.0884233041939373, 'tol': 0.0005115254280492014, 'feature_set_name': 'moderate_set'}. Best is trial 0 with value: 0.8947175195177899.
[I 2025-04-01 14:48:00,251] Trial 1 finished with value: 0.8436891189231014 and parameters: {'alpha': 3.046653743121223, 'tol': 0.0009174603408450994, 'feature_set_name': 'narrow_set'}. Best is trial 0 with value: 0.8947175195177899.
[I 2025-04-01 14:48:00,447] Trial 2 finished with value: 0.8949574056396923 and parameters: {'alpha': 4.958507092561927, 'tol': 0.00028595309051648957, 'feature_set_name': 'moderate_set'}. Best is trial 2 with value: 0.8949574056396923.
[I 2025-04-01 14:48:00,584] Trial 3 finished with value: 0.8956267105184874 and parameters: {'alpha': 5.106956978035022, 'tol': 0.0008957763271965574, 'feature_set_name': 'wide_set'}. Best is trial 3 with value: 0.8956267105184874.
[I 2025-04-01 14:48:00,730] Trial 4 finished with value: 0.8

In [14]:
# Parameters (alpha, tol, feature set name) of the best trial in the feature-set study.
fs_study.best_params

{'alpha': 3.6002090856630726,
 'tol': 0.0002022222430951479,
 'feature_set_name': 'wide_set'}

In [15]:
# Objective value (mean cross-validated score) of that best trial.
fs_study.best_value

0.8956591142694925

In [16]:
# Slice plot of the objective value for each "feature_set_name" choice.
optuna.visualization.plot_slice(study=fs_study, params=["feature_set_name"])

In [17]:
# Estimated importance of each searched parameter for the objective value.
optuna.visualization.plot_param_importances(fs_study)

In [18]:
# Contour plot of the objective value over the "tol" / "alpha" plane.
optuna.visualization.plot_contour(study=fs_study, params=["tol", "alpha"])

## Challenge - Experiment With Different Scalers

Try using no scaler, `MinMaxScaler`, or `StandardScaler` to see whether the choice of scaling has any effect on the model's performance.