In [2]:
%load_ext autoreload
%autoreload 2

import sys
import os
from datetime import datetime, timedelta, timezone
import pandas as pd

# Add parent directory to sys.path to access `src` module
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as config
from src.inference import get_feature_store, load_model_from_registry, load_metrics_from_registry
from src.pipeline_utils import get_pipeline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def fetch_days_data(days):
    current_date = pd.to_datetime(datetime.now(timezone.utc))
    fetch_data_from = current_date - timedelta(days=(365 + days)) 
    fetch_data_to = current_date - timedelta(days=365)
    print("⏳ Fetching historical training data from", fetch_data_from, "to", fetch_data_to)

    fs = get_feature_store()
    fg = fs.get_feature_group(name=config.FEATURE_GROUP_NAME, version=1)

    df = fg.select_all().read()
    df = df[(df["pickup_hour"] >= fetch_data_from) & (df["pickup_hour"] <= fetch_data_to)]
    return df.reset_index(drop=True)

# 6-month training window
ts_data = fetch_days_data(180)


⏳ Fetching historical training data from 2023-11-05 04:58:41.239239+00:00 to 2024-05-03 04:58:41.239239+00:00
2025-05-03 00:58:41,240 INFO: Initializing external client
2025-05-03 00:58:41,240 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-03 00:58:41,949 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214683
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.26s) 


In [4]:
from src.data_utils import transform_ts_data_info_features_and_target

features, targets = transform_ts_data_info_features_and_target(
    ts_data, window_size=24 * 28, step_size=23
)


In [5]:
pipeline = get_pipeline()
pipeline.fit(features, targets)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171901
[LightGBM] [Info] Number of data points in the train set: 15379, number of used features: 676
[LightGBM] [Info] Start training from score 17.367514


In [6]:
from sklearn.metrics import mean_absolute_error

predictions = pipeline.predict(features)
test_mae = mean_absolute_error(targets, predictions)
print(f"📉 Test MAE after retraining: {test_mae:.4f}")


📉 Test MAE after retraining: 0.0583


In [7]:
import joblib
joblib.dump(pipeline, config.MODELS_DIR / "lgb_model.pkl")


['/Users/rohanjain/github/sp25_taxi-main/models/lgb_model.pkl']

In [9]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
import hopsworks

project = hopsworks.login()
model_registry = project.get_model_registry()

input_schema = Schema(features)
output_schema = Schema(targets)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

modelv2 = model_registry.sklearn.create_model(
    name=config.MODEL_NAME,
    metrics={"test_mae": test_mae},
    description="LightGBM regressor V2 (retrained)",
    input_example=features.sample(),
    model_schema=model_schema,
)

modelv2.save(str(config.MODELS_DIR / "lgb_model.pkl"))


2025-05-03 01:00:38,248 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-03 01:00:38,250 INFO: Initializing external client
2025-05-03 01:00:38,251 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-03 01:00:38,824 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214683


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/rohanjain/github/sp25_taxi-main/models/lgb_model.pkl: 0.000%|          | 0/321192 elapsed<00:…

Uploading /Users/rohanjain/github/sp25_taxi-main/notebooks/input_example.json: 0.000%|          | 0/2204 elaps…

Uploading /Users/rohanjain/github/sp25_taxi-main/notebooks/model_schema.json: 0.000%|          | 0/48772 elaps…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1214683/models/taxi_demand_predictor_next_hour/6


Model(name: 'taxi_demand_predictor_next_hour', version: 6)

In [10]:
models = model_registry.get_models(name=config.MODEL_NAME)
latest_model = max(models, key=lambda m: m.version)

print(f"✅ Latest Model Version: {latest_model.version}")


✅ Latest Model Version: 6
