In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
%load_ext autoreload
%autoreload 2
import sys
import os
from datetime import datetime, timedelta, timezone
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
import joblib

# Add the parent directory to the Python path so the `src` package resolves.
# The membership check keeps this idempotent: re-running the cell (or running
# another cell that does the same thing) no longer stacks duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

import src.config as config
from src.inference import get_feature_store, get_hopsworks_project
from src.data_utils import transform_ts_data_info_features_and_target

# Fetch a `days`-long window of data, shifted back one year (365 days) from now
def fetch_days_data(days, offset_days=365):
    """Return feature-group rows whose ``pickup_hour`` falls in a shifted window.

    The window is ``[now - (offset_days + days), now - offset_days]`` measured
    from the current UTC time, with both bounds inclusive.

    Args:
        days: Length of the window in days. Must not be negative.
        offset_days: How many days before the current UTC time the window
            ends. Defaults to 365, the shift that was previously hard-coded.

    Returns:
        The rows of the feature group (as read by ``select_all().read()``)
        whose ``pickup_hour`` lies inside the window.

    Raises:
        ValueError: If ``days`` is negative, which would otherwise produce an
            inverted window and a silently empty result.
    """
    if days < 0:
        raise ValueError(f"days must be non-negative, got {days}")

    current_date = pd.to_datetime(datetime.now(timezone.utc))
    fetch_data_from = current_date - timedelta(days=(offset_days + days))
    fetch_data_to = current_date - timedelta(days=offset_days)
    print(f"Fetching data from {fetch_data_from} to {fetch_data_to}")

    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1
    )

    # The whole feature group is read and then filtered client-side.
    query = fg.select_all()
    df = query.read()
    cond = (df["pickup_hour"] >= fetch_data_from) & (df["pickup_hour"] <= fetch_data_to)
    return df[cond]

# Pull the 180-day window of time-series rows from the feature store.
ts_data = fetch_days_data(days=180)

# Turn the fetched time series into a (features, targets) pair using the
# updated transform function.
features, targets = transform_ts_data_info_features_and_target(
    ts_data,
    window_size=28 * 24,
    step_size=23,
)

# Names of the models to retrain. Each string is used twice further down:
# as the key `get_pipeline` dispatches on, and as the `name` the retrained
# model is registered under in the model registry.
models = [
    "baseline_previous_hour",
    "lightgbm_28days_lags",
    "lightgbm_top10_features",
    "gradient_boosting_temporal_features",
    "lightgbm_enhanced_lags_cyclic_temporal_interactions"
]

# Map a model name to a freshly constructed, unfitted pipeline.
def get_pipeline(model_name):
    """Build a single-step scikit-learn ``Pipeline`` for the named model.

    Every pipeline has exactly one step called ``"model"``. The three
    LightGBM names all map to the same estimator configuration
    (``objective="regression"``, ``random_state=42``).

    Args:
        model_name: One of the known model-name strings.

    Returns:
        A new ``Pipeline`` wrapping the estimator for ``model_name``.

    Raises:
        ValueError: If ``model_name`` is not a known model name.
    """
    from sklearn.pipeline import Pipeline
    from lightgbm import LGBMRegressor
    from src.inference import BaselineModelPreviousHour

    def _lightgbm():
        return LGBMRegressor(objective="regression", random_state=42)

    def _gradient_boosting():
        # Imported only when this estimator is actually requested.
        from sklearn.ensemble import GradientBoostingRegressor
        return GradientBoostingRegressor(random_state=42)

    # Ordered (name, factory) pairs; a factory is called only on a match, so
    # each call to get_pipeline constructs at most one estimator.
    estimator_factories = (
        ("baseline_previous_hour", BaselineModelPreviousHour),
        ("lightgbm_28days_lags", _lightgbm),
        ("lightgbm_top10_features", _lightgbm),
        ("gradient_boosting_temporal_features", _gradient_boosting),
        ("lightgbm_enhanced_lags_cyclic_temporal_interactions", _lightgbm),
    )

    for known_name, make_estimator in estimator_factories:
        if model_name == known_name:
            return Pipeline([("model", make_estimator())])

    raise ValueError(f"Unknown model name: {model_name}")

# Retrain each model and save it to the model registry
project = get_hopsworks_project()
model_registry = project.get_model_registry()

from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# The schema depends only on the feature/target frames, not on the model being
# trained, so it is built once here instead of on every loop iteration.
input_schema = Schema(features)
output_schema = Schema(targets)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

for model_name in models:
    print(f"\nRetraining model: {model_name}")

    # Build a fresh pipeline for this model and fit it on the recent data
    pipeline = get_pipeline(model_name)
    pipeline.fit(features, targets)

    # MAE on the same data the pipeline was fitted on (an in-sample figure,
    # hence the "train_mae" metric name below)
    predictions = pipeline.predict(features)
    train_mae = mean_absolute_error(targets, predictions)
    print(f"Training MAE for {model_name}: {train_mae:.4f}")

    # Save the pipeline locally; this file is what gets uploaded below
    model_path = config.MODELS_DIR / f"{model_name}.pkl"
    joblib.dump(pipeline, model_path)

    # No explicit `version` is passed: a hard-coded version collides with the
    # existing entry as soon as this cell is run a second time. Leaving it
    # out lets the registry assign the next free version for this model name.
    model = model_registry.sklearn.create_model(
        name=model_name,
        metrics={"train_mae": train_mae},
        description=f"Retrained {model_name} model on recent 180 days of data",
        input_example=features.sample(),
        model_schema=model_schema,
    )

    # Report the version the registry actually assigned rather than a
    # hard-coded number.
    saved_model = model.save(str(model_path))
    print(f"Saved {model_name} to model registry with version {saved_model.version}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


AttributeError: module 'hsfs' has no attribute 'hopsworks_udf'

In [8]:
# Diagnostic cell: report the installed hopsworks and hsfs package versions.
# Presumably added to investigate the "module 'hsfs' has no attribute
# 'hopsworks_udf'" AttributeError recorded in this notebook's output — confirm.
import hopsworks
import hsfs
print(f"hopsworks version: {hopsworks.__version__}")
print(f"hsfs version: {hsfs.__version__}")

[autoreload of hopsworks.client failed: Traceback (most recent call last):
  File "c:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\CDA500PF2\Lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "c:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\CDA500PF2\Lib\site-packages\IPython\extensions\autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "C:\Users\singh\anaconda3\Lib\importlib\__init__.py", line 121, in reload
    raise ImportError(f"parent {parent_name!r} not in sys.modules",
ImportError: parent 'hopsworks' not in sys.modules
]
[autoreload of hopsworks.constants failed: Traceback (most recent call last):
  File "c:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\CDA500PF2\Lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "c:\Users\singh

AttributeError: module 'hsfs' has no attribute 'hopsworks_udf'

In [2]:
import sys
import os

# Add the parent directory to the Python path so the `src` package resolves.
# The membership check keeps this idempotent: re-running the cell (or running
# another cell that does the same thing) no longer stacks duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)
import src.config as config

In [3]:
from src.inference import get_feature_store, fetch_predictions
import pandas as pd
from datetime import datetime, timedelta, timezone


def fetch_days_data(days, offset_days=365):
    """Return feature-group rows whose ``pickup_hour`` falls in a shifted window.

    The window is ``[now - (offset_days + days), now - offset_days]`` measured
    from the current UTC time, with both bounds inclusive. The two bounds are
    printed before the data is read.

    Args:
        days: Length of the window in days. Must not be negative.
        offset_days: How many days before the current UTC time the window
            ends. Defaults to 365, the shift that was previously hard-coded.

    Returns:
        The rows of the feature group (as read by ``select_all().read()``)
        whose ``pickup_hour`` lies inside the window.

    Raises:
        ValueError: If ``days`` is negative, which would otherwise produce an
            inverted window and a silently empty result.
    """
    if days < 0:
        raise ValueError(f"days must be non-negative, got {days}")

    current_date = pd.to_datetime(datetime.now(timezone.utc))
    fetch_data_from = current_date - timedelta(days=(offset_days + days))
    fetch_data_to = current_date - timedelta(days=offset_days)
    print(fetch_data_from, fetch_data_to)

    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1
    )

    # The whole feature group is read and then filtered client-side; no
    # query-level filter is applied before the read.
    query = fg.select_all()
    df = query.read()
    cond = (df["pickup_hour"] >= fetch_data_from) & (df["pickup_hour"] <= fetch_data_to)
    return df[cond]

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Pull the 180-day window of time-series rows from the feature store.
ts_data = fetch_days_data(days=180)

2023-09-07 18:41:40.400565+00:00 2024-03-05 18:41:40.400565+00:00
2025-03-05 13:41:40,403 INFO: Initializing external client
2025-03-05 13:41:40,404 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 13:41:41,503 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (75.99s) 


In [5]:
from src.data_utils import transform_ts_data_info_features_and_target

# Build the (features, targets) pair from the fetched time series.
features, targets = transform_ts_data_info_features_and_target(
    ts_data,
    window_size=28 * 24,
    step_size=23,
)

In [6]:
from src.pipeline_utils import get_pipeline
# Build the pipeline provided by src.pipeline_utils and fit it on the windowed
# data. The recorded LightGBM log for this run reports 23660 training rows and
# 675 used features.
pipeline = get_pipeline()
pipeline.fit(features, targets)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171646
[LightGBM] [Info] Number of data points in the train set: 23660, number of used features: 675
[LightGBM] [Info] Start training from score 16.898225


In [7]:
from sklearn.metrics import mean_absolute_error
# Predict on the same `features` frame the pipeline was just fitted on
# (in-sample predictions).
predictions = pipeline.predict(features)

In [8]:
# Mean absolute error between the targets and the in-sample predictions.
# NOTE(review): despite the name, `test_mae` is computed on the data the
# pipeline was fitted on, so it is a training error rather than a held-out
# test error — confirm whether a separate evaluation split was intended.
test_mae = mean_absolute_error(targets, predictions)
print(f"{test_mae:.4f}")

2.8822


In [9]:
from src.inference import load_metrics_from_registry 

# Load the metrics stored in the model registry; in the recorded run this
# returned {'test_mae': 1.935...}. Presumably these belong to the currently
# registered model version — verify against src.inference.
metric = load_metrics_from_registry()

2025-03-05 13:43:17,031 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-05 13:43:17,040 INFO: Initializing external client
2025-03-05 13:43:17,041 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 13:43:17,659 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689


In [10]:
# Display the metrics dictionary loaded from the registry as the cell output.
metric

{'test_mae': 1.935489834251996}

In [11]:
from src.inference import load_model_from_registry
# Download the registered model artifact via src.inference (the recorded
# output shows a single file being downloaded).
# NOTE(review): `model` is not referenced by any later cell visible in this
# notebook — confirm whether this cell is still needed.
model = load_model_from_registry()

2025-03-05 13:43:19,404 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-05 13:43:19,412 INFO: Initializing external client
2025-03-05 13:43:19,412 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 13:43:20,036 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689
Downloading model artifact (0 dirs, 1 files)... DONE

In [12]:
import joblib  
import src.config
# Persist the fitted pipeline to disk under config.MODELS_DIR; joblib.dump
# returns the list of written file names, which is shown as the cell output.
# `config` here is the alias bound by the earlier `import src.config as config`
# cell; the bare `import src.config` above only binds the name `src`.
joblib.dump(pipeline, config.MODELS_DIR / "lgb_model.pkl")

['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500P1\\models\\lgb_model.pkl']

In [13]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Build the registry schema objects from the training frames: one Schema from
# `features` (model input), one from `targets` (model output), combined into
# the ModelSchema that is passed to create_model in a later cell.
input_schema = Schema(features)
output_schema = Schema(targets)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [14]:
from src.inference import get_hopsworks_project

# Connect to the Hopsworks project and obtain its model registry handle; the
# bare `model_registry` on the last line displays the handle as the cell output.
project = get_hopsworks_project()
model_registry = project.get_model_registry()
model_registry

2025-03-05 13:43:22,569 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-05 13:43:22,573 INFO: Initializing external client
2025-03-05 13:43:22,573 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 13:43:23,289 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689


ModelRegistry(project: 'CDA500P1')

In [15]:
# Create a new model entry in the Hopsworks registry. No `version` is passed;
# in the recorded run the subsequent save produced version 7.
# NOTE(review): `test_mae` was computed on the same `features` the pipeline was
# fitted on, so the value stored under the "test_mae" key is an in-sample
# (training) error — confirm whether a held-out metric was intended before
# comparing it with metrics of earlier versions.
modelv2 = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor V2",
    input_example=features.sample(),
    model_schema=model_schema,
)

In [17]:
# Upload the pipeline file that the earlier joblib.dump cell wrote to
# config.MODELS_DIR. Building the path from config instead of hard-coding an
# absolute, user-specific Windows path keeps this cell runnable on any
# machine or checkout of the project.
modelv2.save(str(config.MODELS_DIR / "lgb_model.pkl"))

Uploading: 100.000%|██████████| 316486/316486 elapsed<00:02 remaining<00:00  2.07s/it]
Uploading: 100.000%|██████████| 2050/2050 elapsed<00:04 remaining<00:000:08,  2.17s/it]
Uploading: 100.000%|██████████| 51418/51418 elapsed<00:01 remaining<00:00
Model export complete: 100%|██████████| 6/6 [00:16<00:00,  2.82s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/1214689/models/taxi_demand_predictor_next_hour/7





Model(name: 'taxi_demand_predictor_next_hour', version: 7)

In [18]:
# List every registered version of the model named by config.MODEL_NAME.
# NOTE(review): this rebinds `models`, which the retraining cell at the top of
# the notebook uses for a list of model-name strings — running the cells out
# of order will mix the two meanings.
models = model_registry.get_models(name=config.MODEL_NAME)

In [19]:
# Display the registered versions; the recorded output shows that the list is
# not ordered by version number.
models

[Model(name: 'taxi_demand_predictor_next_hour', version: 5),
 Model(name: 'taxi_demand_predictor_next_hour', version: 1),
 Model(name: 'taxi_demand_predictor_next_hour', version: 2),
 Model(name: 'taxi_demand_predictor_next_hour', version: 6),
 Model(name: 'taxi_demand_predictor_next_hour', version: 3),
 Model(name: 'taxi_demand_predictor_next_hour', version: 4),
 Model(name: 'taxi_demand_predictor_next_hour', version: 7)]

In [20]:
# Pick the registry entry with the highest version number (the returned list
# is unordered, so the last element is not necessarily the newest).
max(models, key=lambda model: model.version)


Model(name: 'taxi_demand_predictor_next_hour', version: 7)

In [21]:
# Re-read the metrics from the registry after the upload; in the recorded run
# this now returns the newly saved value ({'test_mae': 2.882...}).
load_metrics_from_registry()

2025-03-05 13:44:13,991 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-05 13:44:13,998 INFO: Initializing external client
2025-03-05 13:44:14,001 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 13:44:14,718 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689


{'test_mae': 2.8821792503718733}