In [61]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
import optuna
from sklearn.metrics import (
    r2_score,
    make_scorer
)

## Regression

- Compare 2 algos
- Cross-validation
- Explain choice for solvers, hyperparameters
- Beat 0.84 on the test set

In [39]:
DATA_PATH = Path("../data/")

In [40]:
X_train = np.load(DATA_PATH/"regression/X_train.npy")
y_train = np.load(DATA_PATH/"regression/y_train.npy")
X_test = np.load(DATA_PATH/"regression/X_test.npy")
y_test = np.load(DATA_PATH/"regression/y_test.npy")

In [41]:
r2 = make_scorer(r2_score)

Lasso with alpha optimized on Optuna + random_state=0 => 0.888

ElasticNet with alpha + ratio optimized on Optuna + random_state=0 => 0.882

Ridge with alpha optimized on Optuna => 0.4

In [62]:
def objective(trial):
    alpha= trial.suggest_float("alpha", 0, 1)
    
    reg_pipeline = make_pipeline(
        RobustScaler(),
        Ridge(alpha=alpha, random_state=0)
    )
    
    scores = cross_val_score(reg_pipeline, X=X_train, y=y_train, cv=5, n_jobs=-1, scoring=r2)
    return np.mean(scores)
    
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-06-14 14:16:37,520] A new study created in memory with name: no-name-7b51f5ef-62ba-4b29-8ca2-3c3913629f2a
[I 2023-06-14 14:16:37,587] Trial 0 finished with value: 0.3894990138872698 and parameters: {'alpha': 0.8137729077791601}. Best is trial 0 with value: 0.3894990138872698.
[I 2023-06-14 14:16:37,627] Trial 1 finished with value: 0.3197348468522 and parameters: {'alpha': 0.447876883882666}. Best is trial 0 with value: 0.3894990138872698.
[I 2023-06-14 14:16:37,668] Trial 2 finished with value: 0.16283719942579153 and parameters: {'alpha': 0.06457411893863085}. Best is trial 0 with value: 0.3894990138872698.
[I 2023-06-14 14:16:37,711] Trial 3 finished with value: 0.3525747666013811 and parameters: {'alpha': 0.5938313634014502}. Best is trial 0 with value: 0.3894990138872698.
[I 2023-06-14 14:16:37,752] Trial 4 finished with value: 0.30572201009128247 and parameters: {'alpha': 0.39586472860252286}. Best is trial 0 with value: 0.3894990138872698.
[I 2023-06-14 14:16:37,792] Tri

In [64]:
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [57]:
reg_pipeline = make_pipeline(
        RobustScaler(),
        ElasticNet(alpha=study.best_params["alpha"], l1_ratio=study.best_params["l1_ratio"], random_state=0)
)

In [58]:
reg_pipeline.fit(X_train, y_train)

In [59]:
preds = reg_pipeline.predict(X_test)

In [60]:
r2_score(y_test, preds)

0.8821484707668852