In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'final_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,25.0,29.0,34.0,31.0,32.0,8.0,6.0,4.0,0.0,1.0,...,0.0,6.0,5.0,1.0,4.0,2.0,2.0,2024-01-29,4,0.0
1,1.0,2.0,1.0,0.0,0.0,1.0,0.0,3.0,6.0,3.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2024-01-30,4,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,4.0,5.0,...,1.0,0.0,1.0,4.0,1.0,4.0,6.0,2024-01-31,4,2.0
3,1.0,1.0,2.0,1.0,0.0,1.0,1.0,3.0,2.0,1.0,...,3.0,3.0,3.0,1.0,5.0,3.0,0.0,2024-02-01,4,5.0
4,2.0,4.0,1.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,...,2.0,2.0,2.0,4.0,4.0,12.0,7.0,2024-02-02,4,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11-26,110,0.0
80737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11-27,110,0.0
80738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11-28,110,0.0
80739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11-29,110,0.0


In [3]:
from src.data_split import train_test_split
from datetime import datetime

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2024, 8, 1, 0, 0, 0),
    target_column_name="target_rides_next_hour"
)

In [4]:
print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_test.shape=}")

X_train.shape=(48655, 674)
y_train.shape=(48655,)
X_test.shape=(32086, 674)
y_test.shape=(32086,)


In [6]:
from src.model import get_pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import numpy as np

import optuna
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average validation error based on a TimeSeriesSplit
    """

    # pick hyper-parameters
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    
    # return the mean score
    return np.array(scores).mean()

In [7]:
study = optuna.create_study(direction="minimize") # minimize the mean mae score
study.optimize(objective, n_trials=5)

[I 2025-01-25 15:45:20,169] A new study created in memory with name: no-name-e8d006f0-fb02-474d-856e-7fc3f1157436
[I 2025-01-25 15:45:26,508] Trial 0 finished with value: 3.8896039692475117 and parameters: {'num_leaves': 13, 'feature_fraction': 0.31790628834771123, 'bagging_fraction': 0.8460308791831658, 'min_child_samples': 79}. Best is trial 0 with value: 3.8896039692475117.
[I 2025-01-25 15:45:45,486] Trial 1 finished with value: 3.9423664725363206 and parameters: {'num_leaves': 152, 'feature_fraction': 0.22000396902953467, 'bagging_fraction': 0.543911408980104, 'min_child_samples': 87}. Best is trial 0 with value: 3.8896039692475117.
[I 2025-01-25 15:46:10,423] Trial 2 finished with value: 3.4709888997785097 and parameters: {'num_leaves': 162, 'feature_fraction': 0.747796136261423, 'bagging_fraction': 0.7124241506239087, 'min_child_samples': 64}. Best is trial 2 with value: 3.4709888997785097.
[I 2025-01-25 15:46:48,291] Trial 3 finished with value: 3.433419431984345 and parameters

In [8]:
best_params = study.best_trial.params
print(f"{best_params=}")

best_params={'num_leaves': 214, 'feature_fraction': 0.3862377279974252, 'bagging_fraction': 0.41096457468391273, 'min_child_samples': 42}


#### Once we find the best parameter, re train the model with entire train data from the initial dataset

In [9]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [10]:
predictions = pipeline.predict(X_test)
predictions

array([4.47076000e+00, 1.44640263e+01, 4.08446816e+01, ...,
       2.90857014e-02, 2.90857014e-02, 2.90857014e-02], shape=(32086,))

In [11]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae=:.4f}")

test_mae=3.0785


In [12]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=2977,
    predictions=pd.Series(predictions)
)

In [14]:
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=100,
    predictions=pd.Series(predictions)
)

In [15]:
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=32085,
    predictions=pd.Series(predictions)
)