In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit

## Importing Our Data

In [3]:
df = pd.read_fwf(
    "./data/auto-mpg.data",
    names=["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year", "origin", "car_name"]
)

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [4]:
cleaned_df = df.copy()

cleaned_df["horsepower"] = cleaned_df["horsepower"].replace("?", np.nan)
cleaned_df["horsepower"] = pd.to_numeric(cleaned_df["horsepower"], errors="coerce")

cleaned_df["car_name"] = cleaned_df["car_name"].str.replace("\"", "")

cleaned_df.dropna(inplace=True)

cleaned_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
numeric_columns = cleaned_df.select_dtypes(include=["number"]).columns

scaler = StandardScaler()

scaled_data = scaler.fit_transform(cleaned_df[numeric_columns])

scaled_df = pd.DataFrame(scaled_data, columns=numeric_columns)

scaled_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,-0.698638,1.483947,1.077290,0.664133,0.620540,-1.285258,-1.625315,-0.716641
1,-1.083498,1.483947,1.488732,1.574594,0.843334,-1.466724,-1.625315,-0.716641
2,-0.698638,1.483947,1.182542,1.184397,0.540382,-1.648189,-1.625315,-0.716641
3,-0.955212,1.483947,1.048584,1.184397,0.536845,-1.285258,-1.625315,-0.716641
4,-0.826925,1.483947,1.029447,0.924265,0.555706,-1.829655,-1.625315,-0.716641
...,...,...,...,...,...,...,...,...
387,0.455941,-0.864014,-0.520637,-0.480448,-0.221125,0.021294,1.636410,-0.716641
388,2.636813,-0.864014,-0.932079,-1.364896,-0.999134,3.287676,1.636410,0.526382
389,1.097374,-0.864014,-0.568479,-0.532474,-0.804632,-1.430430,1.636410,-0.716641
390,0.584228,-0.864014,-0.712005,-0.662540,-0.415627,1.110088,1.636410,-0.716641


In [6]:
cv_splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

## Level 1 - What Even are Hyperparameters?

In [7]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate

In [8]:
selected_features = ["weight", "cylinders"]

In [9]:
X = scaled_df[selected_features].copy()
y = scaled_df["horsepower"].copy()

In [10]:
model = Ridge(alpha=2.0, random_state=42)

cv_results = cross_validate(model, X, y, cv=cv_splitter)

In [11]:
cv_results

{'fit_time': array([0.0069747 , 0.00601363, 0.00754523, 0.00399423, 0.00299954]),
 'score_time': array([0.00299072, 0.0015204 , 0.00401139, 0.00200844, 0.00199294]),
 'test_score': array([0.79225901, 0.78886809, 0.69616869, 0.79157424, 0.84993148])}

In [12]:
print(f"Average test r2 score: {cv_results['test_score'].mean()}")

Average test r2 score: 0.7837603015275304


Hyperparameters are the values that you as the data scientist/programmer explicitly feed into your model (e.g. the `alpha` value), whereas parameters are the values determined by the algorithm you are using (e.g. the coefficients for ridge regression)

## Level 2 - 'Manually' Tuning Hyperparameters
But what if we tried different values of `alpha` to see if the $R^2$ score improves as a result of us changing the vals?

In [13]:
model_1 = Ridge(alpha=1.0, random_state=42)
model_2 = Ridge(alpha=0.5, random_state=42)
model_3 = Ridge(alpha=2.0, random_state=42)

for model in [model_1, model_2, model_3]:

    cv_results = cross_validate(model, X, y, cv=cv_splitter)

    print(f"Average test r2 score: {cv_results['test_score'].mean()}")
    print("--------------")

Average test r2 score: 0.783661984163461
--------------
Average test r2 score: 0.7836103251011781
--------------
Average test r2 score: 0.7837603015275304
--------------


## Level 3 - Searching Parameter Combinations

If only we could easily and systematically go through each of our parameter combinations...

In [14]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import halfnorm

### GridsearchCV

In [15]:
params = {
    "alpha": np.linspace(0.5, 5.0, 20),
    "fit_intercept": [True, False],
    "tol": np.linspace(0.0001, 0.001, 20),
    "solver": ["sag", "lsqr"]
}

grid_search = GridSearchCV(
    Ridge(random_state=42),
    param_grid=params,
    cv=cv_splitter,
)
grid_search.fit(X[selected_features], y)

In [16]:
gs_results_df = pd.DataFrame(grid_search.cv_results_)

gs_results_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_intercept,param_solver,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006415,0.001349,0.003189,0.000745,0.5,True,sag,0.0001,"{'alpha': 0.5, 'fit_intercept': True, 'solver'...",0.792511,0.787768,0.696074,0.792017,0.84983,0.78364,0.049426,1506
1,0.003827,0.000826,0.002191,0.000406,0.5,True,sag,0.000147,"{'alpha': 0.5, 'fit_intercept': True, 'solver'...",0.792511,0.787762,0.696074,0.792017,0.849822,0.783637,0.049424,1508
2,0.003008,1.4e-05,0.001792,0.000746,0.5,True,sag,0.000195,"{'alpha': 0.5, 'fit_intercept': True, 'solver'...",0.792511,0.787762,0.696074,0.792017,0.849825,0.783638,0.049425,1507
3,0.003401,0.001021,0.001399,0.000491,0.5,True,sag,0.000242,"{'alpha': 0.5, 'fit_intercept': True, 'solver'...",0.792511,0.787762,0.696074,0.792011,0.849825,0.783637,0.049425,1515
4,0.002986,0.000634,0.001819,0.000412,0.5,True,sag,0.000289,"{'alpha': 0.5, 'fit_intercept': True, 'solver'...",0.792513,0.787751,0.696074,0.792011,0.849825,0.783635,0.049424,1536


In [17]:
gs_results_df[gs_results_df["rank_test_score"] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_intercept,param_solver,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1561,0.002807,0.000752,0.001915,0.000506,5.0,False,sag,0.000147,"{'alpha': 5.0, 'fit_intercept': False, 'solver...",0.791924,0.794579,0.696672,0.795255,0.85014,0.785714,0.049573,1
1562,0.002705,0.00042,0.001304,0.000403,5.0,False,sag,0.000195,"{'alpha': 5.0, 'fit_intercept': False, 'solver...",0.791924,0.794579,0.696672,0.795255,0.85014,0.785714,0.049573,1
1563,0.002005,1e-05,0.0014,0.000493,5.0,False,sag,0.000242,"{'alpha': 5.0, 'fit_intercept': False, 'solver...",0.791924,0.794579,0.696672,0.795255,0.85014,0.785714,0.049573,1


In [18]:
len(gs_results_df)

1600

### RandomizedSearchCV

In [19]:
params = {
    "alpha": halfnorm(loc=2, scale=1), ##np.linspace(0.5, 5.0, 20),
    "fit_intercept": [True, False],
    "tol": np.linspace(0.0001, 0.01, 20),
    "solver": ["sag", "lsqr"]
}

random_search = RandomizedSearchCV(
    Ridge(random_state=42),
    param_distributions=params,
    n_iter=100,
    cv=cv_splitter,
)
random_search.fit(X[selected_features], y)

In [20]:
rs_results_df = pd.DataFrame(random_search.cv_results_)

rs_results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_intercept,param_solver,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005005,0.000629,0.002598,0.000489,3.540827,False,sag,0.002705,"{'alpha': 3.540826615774715, 'fit_intercept': ...",0.792161,0.793647,0.696384,0.795571,0.850044,0.785561,0.049638,7
1,0.003707,0.001546,0.002702,0.001535,3.769127,False,lsqr,0.001663,"{'alpha': 3.76912735033154, 'fit_intercept': F...",0.792114,0.793763,0.696474,0.795585,0.850066,0.785600,0.049615,5
2,0.004864,0.000707,0.002809,0.001463,2.171758,True,lsqr,0.005832,"{'alpha': 2.1717582509939133, 'fit_intercept':...",0.792231,0.788990,0.696194,0.791525,0.849943,0.783777,0.049411,84
3,0.003104,0.000715,0.001855,0.000667,2.995896,False,sag,0.006874,"{'alpha': 2.995895584203504, 'fit_intercept': ...",0.792151,0.792134,0.696430,0.796438,0.849492,0.785329,0.049470,35
4,0.002959,0.000611,0.001412,0.000489,2.494416,False,lsqr,0.005311,"{'alpha': 2.494415720578576, 'fit_intercept': ...",0.792323,0.792947,0.696291,0.795934,0.849989,0.785497,0.049655,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.002792,0.000407,0.001603,0.000483,2.260173,False,sag,0.007916,"{'alpha': 2.260173489142671, 'fit_intercept': ...",0.792435,0.791526,0.696355,0.796749,0.849292,0.785271,0.049451,42
96,0.002210,0.000400,0.001600,0.000487,2.780913,False,lsqr,0.000100,"{'alpha': 2.7809125389116995, 'fit_intercept':...",0.792276,0.793134,0.696332,0.795855,0.850007,0.785521,0.049646,14
97,0.003405,0.000812,0.002006,0.000011,2.693121,True,sag,0.004268,"{'alpha': 2.6931210147914113, 'fit_intercept':...",0.792200,0.790215,0.696139,0.791258,0.849918,0.783946,0.049444,51
98,0.003259,0.000609,0.001943,0.000208,2.909960,True,sag,0.000100,"{'alpha': 2.9099600275583413, 'fit_intercept':...",0.792110,0.789499,0.696352,0.791316,0.849988,0.783853,0.049368,65


In [21]:
len(rs_results_df)

100

In [22]:
print(random_search.best_score_)

0.7856287016136547


## Level 4 - Using Frameworks Like Optuna

What if instead of searching the entire space, or randomly searching it we could instead follow an optimal path to find an extrema?

In [23]:
import optuna
from optuna import Trial

In [32]:
def objective(trial: Trial):

    model = Ridge(
        alpha = trial.suggest_float("alpha", 0.5, 5.0),
        fit_intercept = False, ##trial.suggest_categorical("fit_intercept", [True, False]),
        tol = trial.suggest_float("tol", 0.0003, 0.0006),
        solver = trial.suggest_categorical("solver", ["lsqr", "sag"]),
        random_state=42,
    )

    cv_results = cross_validate(model, X, y, cv=cv_splitter)

    mean_test_r2 = cv_results["test_score"].mean()

    return mean_test_r2

In [None]:
study = optuna.create_study(
    study_name="auto_mpg_excl_intercept",
    direction="maximize",
    storage="sqlite:///optuna_studies.db",
    load_if_exists=True
)

study.optimize(objective, n_trials=50, show_progress_bar=True)

[I 2025-04-01 14:34:49,491] A new study created in RDB with name: auto_mpg_excl_intercept


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-01 14:34:49,700] Trial 0 finished with value: 0.785616892739054 and parameters: {'alpha': 3.671718436872947, 'tol': 0.00032832964659830325, 'solver': 'sag'}. Best is trial 0 with value: 0.785616892739054.
[I 2025-04-01 14:34:49,825] Trial 1 finished with value: 0.7855203026490687 and parameters: {'alpha': 2.63807918097344, 'tol': 0.00036858714757797347, 'solver': 'sag'}. Best is trial 0 with value: 0.785616892739054.
[I 2025-04-01 14:34:49,965] Trial 2 finished with value: 0.7856061228497758 and parameters: {'alpha': 3.507223290120653, 'tol': 0.0004527201627096871, 'solver': 'sag'}. Best is trial 0 with value: 0.785616892739054.
[I 2025-04-01 14:34:50,115] Trial 3 finished with value: 0.7854812347189968 and parameters: {'alpha': 2.3160729327401075, 'tol': 0.0005756566526585996, 'solver': 'lsqr'}. Best is trial 0 with value: 0.785616892739054.
[I 2025-04-01 14:34:50,251] Trial 4 finished with value: 0.7853448951982098 and parameters: {'alpha': 0.9202201819216169, 'tol': 0.000

In [34]:
study.best_value

0.7856911876826983

In [35]:
study.best_params

{'alpha': 4.991004943137755, 'tol': 0.0004904284243126708, 'solver': 'lsqr'}

In [36]:
optuna.visualization.plot_param_importances(study)

In [37]:
optuna.visualization.plot_slice(study, params=["alpha"])

In [38]:
optuna.visualization.plot_contour(study, params=["alpha","tol"])