In [1]:
import sys
from pathlib import Path

# Add project root to sys.path so `src/` can be imported
sys.path.append(str(Path("..").resolve()))


In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from datetime import datetime
from dotenv import load_dotenv
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
import numpy as np

In [3]:
# === 1. Load tabular data ===
df = pd.read_parquet("../data/transformed/tabular_data.parquet")

# Rename cols for compatibility with shared pipeline
df = df.rename(columns={
    "start_hour": "pickup_hour",
    "start_station_id": "pickup_location_id"
})


In [4]:
# === 2. Train/Test split ===
X_train, y_train, X_test, y_test = split_time_series_data(
    df=df,
    cutoff_date=datetime(2024, 4, 1),
    target_column="target"
)


In [5]:
# === 3. Drop non-feature columns ===
drop_cols = ["pickup_hour", "pickup_location_id"]
X_train_model = X_train.drop(columns=drop_cols)
X_test_model = X_test.drop(columns=drop_cols)


In [6]:
# === 4. Train LightGBM Model ===
model = lgb.LGBMRegressor(random_state=42)
model.fit(X_train_model, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19926
[LightGBM] [Info] Number of data points in the train set: 924, number of used features: 672
[LightGBM] [Info] Start training from score 0.490260


In [7]:
# === 5. Predict & Evaluate ===
predictions = model.predict(X_test_model).astype(int)
mae = mean_absolute_error(y_test, predictions)
print(f"📉 MAE (LightGBM, all 28-day lags): {mae:.4f}")

📉 MAE (LightGBM, all 28-day lags): 1.1889


In [33]:
# === 6. MLflow Logging ===
load_dotenv()
mlflow = set_mlflow_tracking()

log_model_to_mlflow(
    model=model,
    input_data=X_test_model,
    experiment_name="LGBM_FullLag_28Days",
    metric_name="mae",
    score=mae,
    params=model.get_params()
)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
INFO:src.experiment_utils:Experiment set to: LGBM_FullLag_28Days
INFO:src.experiment_utils:Logged parameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None, 'random_state': 42, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}
INFO:src.experiment_utils:Logged mae: 1.1888888888888889

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training d

🏃 View run amazing-dove-57 at: https://dagshub.com/vidyuthkrishna03/citibike-ride-prediction.mlflow/#/experiments/1/runs/1a9e262be85a47829c64f38f20259961
🧪 View experiment at: https://dagshub.com/vidyuthkrishna03/citibike-ride-prediction.mlflow/#/experiments/1


<mlflow.models.model.ModelInfo at 0x2461af83e10>

In [24]:
from src.plot_utils import plot_aggregated_time_series
predictions = pd.Series(predictions, index=X_test.index)

# For one row's prediction (e.g., first row of test set)
plot_aggregated_time_series(X_test, y_test, row_id=50, predictions=predictions).show()



In [8]:
import joblib
from pathlib import Path

# Save the model to a pickle file
model_dir = Path("../models/")
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_dir / "lgbm_model_28day.pkl")
print("✅ Model saved to models/lgbm_model_28day.pkl")


✅ Model saved to models/lgbm_model_28day.pkl
