In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
# Step up from the notebook's folder to its parent (presumably the project
# root) so that the `src` package imported below can be found.
# Guarded on the presence of `src` so that re-running this cell is a no-op
# instead of climbing one directory further on every execution.
if not os.path.isdir('src'):
    os.chdir('..')

In [3]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Read the tabular dataset stored as parquet in the transformed-data folder.
tabular_data_path = TRANSFORMED_DATA_DIR / 'tabular_data.parquet'
df = pd.read_parquet(tabular_data_path)

In [4]:
from datetime import datetime
from src.data_split import train_test_split

# Split the tabular data into train and test parts around a cutoff date,
# with the target column returned separately from the features.
split_cutoff = datetime(2023, 10, 1)  # midnight, 1 Oct 2023
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=split_cutoff,
    target_column_name='target_rides_next_hour',
)

# Report the dimensions of each part, one line per object.
for shape_report in (
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
):
    print(shape_report)

X_train.shape=(161650, 674)
y_train.shape=(161650,)
X_test.shape=(16165, 674)
y_test.shape=(16165,)


In [5]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Score one Optuna trial.

    Samples a set of hyper-parameters from the trial, fits a fresh pipeline on
    every fold of a 2-split TimeSeriesSplit over the training data, and
    returns the mean of the per-fold validation MAE values.

    Args:
        trial: Optuna trial used to sample the tunable hyper-parameters.

    Returns:
        Mean absolute error averaged over the validation folds.

    Note:
        Reads `X_train` and `y_train` from the notebook's global namespace.
    """
    # fixed settings, identical for every trial
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
    }
    # tunable settings, sampled by Optuna (sampling order matters for the sampler)
    # NOTE(review): if get_pipeline forwards these to LightGBM, bagging_fraction
    # has no effect unless bagging_freq > 0 is also set — confirm.
    hyperparams["num_leaves"] = trial.suggest_int("num_leaves", 2, 256)
    hyperparams["feature_fraction"] = trial.suggest_float("feature_fraction", 0.2, 1.0)
    hyperparams["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.2, 1.0)
    hyperparams["min_child_samples"] = trial.suggest_int("min_child_samples", 3, 100)

    def _fold_mae(fit_idx, val_idx):
        """Fit a new pipeline on the fit rows and return its MAE on the validation rows."""
        fold_pipeline = get_pipeline(**hyperparams)
        fold_pipeline.fit(X_train.iloc[fit_idx, :], y_train.iloc[fit_idx])
        fold_pred = fold_pipeline.predict(X_train.iloc[val_idx, :])
        return mean_absolute_error(y_train.iloc[val_idx], fold_pred)

    # NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
    # rows of X_train are in chronological order — confirm against the data split.
    splitter = TimeSeriesSplit(n_splits=2)
    fold_scores = [
        _fold_mae(fit_idx, val_idx)
        for fit_idx, val_idx in splitter.split(X_train)
    ]

    # average validation error across folds
    return np.mean(fold_scores)

In [7]:
# Create the study with an explicitly seeded TPE sampler (TPE is Optuna's
# default sampler, so only the seeding changes). Without a seed every
# Restart & Run All explores different hyper-parameters and the reported
# best trial cannot be reproduced.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run a small search; each trial calls `objective` once.
study.optimize(objective, n_trials=5)

[I 2024-02-11 15:18:01,394] A new study created in memory with name: no-name-9bb8fa9a-fda1-4294-a96b-008061eca908




[I 2024-02-11 15:18:10,528] Trial 0 finished with value: 3.6892952323733046 and parameters: {'num_leaves': 14, 'feature_fraction': 0.8183248227189106, 'bagging_fraction': 0.812472300906022, 'min_child_samples': 11}. Best is trial 0 with value: 3.6892952323733046.




[I 2024-02-11 15:19:48,542] Trial 1 finished with value: 3.557603303036331 and parameters: {'num_leaves': 168, 'feature_fraction': 0.29939860862664713, 'bagging_fraction': 0.4674757572979183, 'min_child_samples': 14}. Best is trial 1 with value: 3.557603303036331.




[I 2024-02-11 15:20:10,749] Trial 2 finished with value: 3.6015206671976774 and parameters: {'num_leaves': 72, 'feature_fraction': 0.876910835155098, 'bagging_fraction': 0.22356478408364158, 'min_child_samples': 31}. Best is trial 1 with value: 3.557603303036331.




[I 2024-02-11 15:20:29,545] Trial 3 finished with value: 3.4979287455030565 and parameters: {'num_leaves': 10, 'feature_fraction': 0.4636744141385917, 'bagging_fraction': 0.5732200964986196, 'min_child_samples': 96}. Best is trial 3 with value: 3.4979287455030565.




[I 2024-02-11 15:21:04,920] Trial 4 finished with value: 3.6695839865553075 and parameters: {'num_leaves': 152, 'feature_fraction': 0.5933209188159166, 'bagging_fraction': 0.2862631853845501, 'min_child_samples': 15}. Best is trial 3 with value: 3.4979287455030565.


In [8]:
# Hyper-parameters sampled by the best trial of the study.
best_params = study.best_params
print(f'{best_params=}')

best_params={'num_leaves': 10, 'feature_fraction': 0.4636744141385917, 'bagging_fraction': 0.5732200964986196, 'min_child_samples': 96}


In [9]:
# Refit on the full training set. `best_params` holds only the values Optuna
# sampled, so the fixed settings that `objective` passed to every tuned
# pipeline ("metric" and "verbose") are added back here; the final model is
# then built with the same configuration that was evaluated during tuning.
final_hyperparams = {"metric": 'mae', "verbose": -1, **best_params}
pipeline = get_pipeline(**final_hyperparams)
pipeline.fit(X_train, y_train)



In [10]:
# Score the fitted pipeline on the test set with mean absolute error.
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=2.7514


In [11]:
from src.plot import plot_one_sample

# Visualise test example 2979 together with the model's predictions.
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=2979,
    predictions=pd.Series(predictions),
)

In [12]:
# Visualise test example 3979 together with the model's predictions.
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=3979,
    predictions=pd.Series(predictions),
)