In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Put the parent of the current working directory on the import path so that
# the `src` package imported by later cells can be found.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)


In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the Citi Bike 28-day tabular dataset from the transformed-data
# directory configured in src.config.
tabular_data_path = TRANSFORMED_DATA_DIR / "citibike_tabular_data_28d.parquet"
df = pd.read_parquet(tabular_data_path)


In [4]:
from datetime import datetime
from src.data_utils import split_time_series_data

# Split the table into train/test features and targets around a fixed
# cutoff of 2023-09-01 00:00:00, using the "target" column as the label.
# NOTE(review): which side of the cutoff lands in train vs. test is decided
# inside split_time_series_data — confirm there.
cutoff = datetime(2023, 9, 1)
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=cutoff, target_column="target"
)

# Print the shape of each piece, in the order: X_train, y_train, X_test, y_test.
for _part in (X_train, y_train, X_test, y_test):
    print(_part.shape)


(657, 674)
(657,)
(366, 674)
(366,)


In [5]:
# Keep only the lag-feature columns, i.e. those whose name starts with
# "rides_" (this leaves out pickup_hour and pickup_location_id).
past_ride_columns = list(
    filter(lambda name: name.startswith("rides_"), X_train.columns)
)

# Apply the same column selection to both splits so they stay aligned.
X_train_only_numeric, X_test_only_numeric = (
    frame[past_ride_columns] for frame in (X_train, X_test)
)


In [6]:
import lightgbm as lgb

# Fit a LightGBM regressor with library-default hyperparameters on the
# lag-feature training split.
model = lgb.LGBMRegressor()
model.fit(X=X_train_only_numeric, y=y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22889
[LightGBM] [Info] Number of data points in the train set: 657, number of used features: 672
[LightGBM] [Info] Start training from score 13.375951


In [7]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out lag features, then report the mean absolute error
# against the true targets, rounded to four decimals.
predictions = model.predict(X_test_only_numeric)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae:.4f}")


5.7558


In [8]:
# Recompute and print the test MAE from the existing predictions.
# NOTE(review): this repeats the computation done in the previous cell and
# yields the same value; the cell can likely be removed.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae:.4f}")

5.7558


In [15]:
import getpass
import os
import mlflow
from src.experiment_utils import log_model_to_mlflow

# Point MLflow at the DagsHub-hosted tracking server for this project.
mlflow.set_tracking_uri("https://dagshub.com/ryallavinuthnareddy/citibikeproject.mlflow")

# SECURITY: credentials must never be hardcoded in a notebook. An access token
# was previously committed in this cell; treat it as leaked — revoke it and
# issue a new one.
# The username is not secret (it is part of the tracking URI), so it is kept
# only as a default that an existing environment variable can override.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "ryallavinuthnareddy")

# The token is taken from the environment; if it is absent, prompt for it
# without echoing so it never ends up in the notebook source or its outputs.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass(
        "MLflow tracking token: "
    )

# Log the trained model and its test MAE to the tracking server, registering
# the model under the given name. The test features are passed as input_data.
log_model_to_mlflow(
    model=model,
    input_data=X_test_only_numeric,
    experiment_name="citibikeproject-experiment",
    metric_name="mean_absolute_error",
    model_name="LGBMRegressor-Citibike",
    score=test_mae
)

print("✅ Model successfully logged to MLflow.")


2025/05/10 23:13:50 INFO mlflow.tracking.fluent: Experiment with name 'citibikeproject-experiment' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: citibikeproject-experiment
INFO:src.experiment_utils:Logged mean_absolute_error: 5.755750834863307
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'LGBMRegressor-Citibike'.
2025/05/10 23:13:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor-Citibike, version 1
Created version '1' of model 'LGBMRegressor-Citibike'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor-Citibike


🏃 View run amusing-whale-906 at: https://dagshub.com/ryallavinuthnareddy/citibikeproject.mlflow/#/experiments/4/runs/9a23c2428da34fa3981f6ce5ae383fc3
🧪 View experiment at: https://dagshub.com/ryallavinuthnareddy/citibikeproject.mlflow/#/experiments/4
✅ Model successfully logged to MLflow.
