In [2]:
import sys
from pathlib import Path

# Add project root to sys.path so `src/` can be imported
sys.path.append(str(Path("..").resolve()))


In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from dotenv import load_dotenv
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from src.temporal_features import TemporalFeatureEngineer

In [6]:

# === Load and prepare data ===
df = pd.read_parquet("../data/transformed/tabular_data.parquet")
df = df.rename(columns={"start_hour": "pickup_hour", "start_station_id": "pickup_location_id"})

# === Split train/test ===
X_train, y_train, X_test, y_test = split_time_series_data(
    df=df,
    cutoff_date=datetime(2024, 4, 1),
    target_column="target"
)

# === Define pipeline ===
pipeline = make_pipeline(
    TemporalFeatureEngineer(),
    LGBMRegressor(random_state=42)
)

# === Param grid for tuning ===
param_distributions = {
    "lgbmregressor__num_leaves": [31, 50, 70, 100],
    "lgbmregressor__learning_rate": [0.01, 0.05, 0.1],
    "lgbmregressor__n_estimators": [100, 200, 300],
}

# === Run RandomizedSearchCV ===
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)

random_search.fit(X_train, y_train)

# === Best model eval ===
best_model = random_search.best_estimator_
y_pred = np.round(best_model.predict(X_test)).astype(int)
mae = mean_absolute_error(y_test, y_pred)
print("📉 Final MAE (rounded):", mae)

# === MLflow log ===
load_dotenv()
mlflow = set_mlflow_tracking()

log_model_to_mlflow(
    model=best_model,
    input_data=X_test,
    experiment_name="LGBM_Hyperparam_Tuned",
    metric_name="mae",
    score=mae,
    params=random_search.best_params_
)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005729 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15895
[LightGBM] [Info] Number of data points in the train set: 616, number of used features: 674
[LightGBM] [Info] Start training from score 0.238636
[CV] END lgbmregressor__learning_rate=0.1, lgbmregressor__n_estimators=300, lgbmregressor__num_leaves=100; total time=   0.7s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18963
[LightGBM] [Info] Number of data points in the train set: 616, number of used features: 674
[LightGBM] [Info] Start training from score 0.620130
[CV] END lgbmregressor__learning_rate=0.1, lgbmregressor__n_estimators=300, lgbmregressor__num_leaves=100; total time=   0.5s
[Li

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


📉 Final MAE (rounded): 1.3666666666666667


2025/05/09 21:29:48 INFO mlflow.tracking.fluent: Experiment with name 'LGBM_Hyperparam_Tuned' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBM_Hyperparam_Tuned
INFO:src.experiment_utils:Logged parameters: {'lgbmregressor__num_leaves': 31, 'lgbmregressor__n_estimators': 300, 'lgbmregressor__learning_rate': 0.01}
INFO:src.experiment_utils:Logged mae: 1.3666666666666667
INFO:src.experiment_utils:Model signature inferred.
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 2334.24it/s]
2025/05/09 21:29:54 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'Pipeline'.
2025/05/09 21:31:22 INFO mlflow.store.model_registry.abstract_stor

🏃 View run ambitious-turtle-760 at: https://dagshub.com/vidyuthkrishna03/citibike-ride-prediction.mlflow/#/experiments/3/runs/2273e4f0589b4a809a747f85c4a170f2
🧪 View experiment at: https://dagshub.com/vidyuthkrishna03/citibike-ride-prediction.mlflow/#/experiments/3


<mlflow.models.model.ModelInfo at 0x287fa46ded0>