In [3]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import lightgbm as lgb
import hopsworks
import joblib
import mlflow
from sklearn.metrics import mean_absolute_error
import ipywidgets as widgets
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add parent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR, HOPSWORKS_PROJECT_NAME, HOPSWORKS_API_KEY, FEATURE_GROUP_NAME, FEATURE_GROUP_VERSION
from src.data_utils import transform_ts_data_info_features_and_target, split_time_series_data
from src.plot_utils import plot_aggregated_time_series



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import sys
import os
%load_ext autoreload
%autoreload 2
import hopsworks
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error
import joblib
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as config
from src.data_utils import transform_ts_data_info_features_and_target
from src.data_utils import split_time_series_data

# Connect to Hopsworks
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# Connect to the feature store
feature_store = project.get_feature_store()

# Retrieve the feature group with historical data
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

# Create or retrieve the feature view
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
    print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) created successfully.")
except Exception as e:
    print(f"Error creating feature view: {e}")

feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)
print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) retrieved successfully.")

# Fetch time series data from the feature view
ts_data, _ = feature_view.training_data(
    description="Time-series hourly Citi Bike rides"
)
ts_data = ts_data.sort_values(["start_station_name", "pickup_hour"]).reset_index(drop=True)
ts_data.head()
ts_data["pickup_hour"].min()
ts_data["pickup_hour"].max()

# Remove timezone information if present
ts_data["pickup_hour"] = pd.to_datetime(ts_data["pickup_hour"], errors="coerce").dt.tz_localize(None)
ts_data.info()

# Filter data for the training period: Jan 2024 to Jan 2025
ts_data = ts_data[
    (ts_data["pickup_hour"] >= "2024-01-01") &
    (ts_data["pickup_hour"] < "2025-02-01")
].reset_index(drop=True)
ts_data.shape
ts_data.info()

# Transform time series data into features and targets with a 28-day window
features, targets = transform_ts_data_info_features_and_target(
    ts_data, window_size=24*28, step_size=23
)
features.sort_values(["start_station_name", "pickup_hour"]).head(5)
features_targets = features.copy()
features_targets["target"] = targets
features_targets.shape

# Define the cutoff date (28 days before the end of the training period)
cutoff_date = pd.Timestamp("2025-01-04")  # 28 days before 2025-02-01
X_train, y_train, X_test, y_test = split_time_series_data(
    features_targets,
    cutoff_date=cutoff_date,
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Train and save all five models
# Model 1: Baseline Model (Previous Hour)
class BaselineModelPreviousHour:
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        return X_test["rides_t-1"]

model_1 = BaselineModelPreviousHour()
predictions_1 = model_1.predict(X_test)
test_mae_1 = mean_absolute_error(y_test, predictions_1)
print(f"BaselineModelPreviousHour MAE: {test_mae_1:.4f}")

# Model 2: LightGBM with All Lag Features for Past 28 Days
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
lag_columns_28_days = [f"rides_t-{i}" for i in range(1, 28*24 + 1)]
lag_columns_28_days = [col for col in lag_columns_28_days if col in X_train.columns]
X_train_28_days = X_train[lag_columns_28_days]
X_test_28_days = X_test[lag_columns_28_days]

model_2 = lgb.LGBMRegressor()
model_2.fit(X_train_28_days, y_train)
predictions_2 = model_2.predict(X_test_28_days)
test_mae_2 = mean_absolute_error(y_test, predictions_2)
print(f"LightGBM_28DaysLags MAE: {test_mae_2:.4f}")

# Model 3: LightGBM with Feature Reduction (Top 10 Features)
selector_model = lgb.LGBMRegressor()
selector_model.fit(X_train[past_ride_columns], y_train)

selector = SelectKBest(score_func=f_regression, k=10)
X_train_reduced = selector.fit_transform(X_train[past_ride_columns], y_train)
X_test_reduced = selector.transform(X_test[past_ride_columns])

# Corrected: Convert past_ride_columns to numpy array for boolean indexing
selected_features = np.array(past_ride_columns)[selector.get_support()].tolist()
print(f"Selected Features: {selected_features}")

model_3 = lgb.LGBMRegressor()
model_3.fit(X_train_reduced, y_train)
predictions_3 = model_3.predict(X_test_reduced)
test_mae_3 = mean_absolute_error(y_test, predictions_3)
print(f"LightGBM_Top10Features MAE: {test_mae_3:.4f}")

# Model 4: Gradient Boosting with Temporal Features
X_train_temp = X_train.copy()
X_test_temp = X_test.copy()
X_train_temp["hour"] = X_train_temp["pickup_hour"].dt.hour
X_train_temp["day_of_week"] = X_train_temp["pickup_hour"].dt.dayofweek
X_train_temp["month"] = X_train_temp["pickup_hour"].dt.month
X_test_temp["hour"] = X_test_temp["pickup_hour"].dt.hour
X_test_temp["day_of_week"] = X_test_temp["pickup_hour"].dt.dayofweek
X_test_temp["month"] = X_test_temp["pickup_hour"].dt.month

features_to_use = past_ride_columns + ["hour", "day_of_week", "month"]
X_train_temp = X_train_temp[features_to_use]
X_test_temp = X_test_temp[features_to_use]

model_4 = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=42)
model_4.fit(X_train_temp, y_train)
predictions_4 = model_4.predict(X_test_temp)
test_mae_4 = mean_absolute_error(y_test, predictions_4)
print(f"GradientBoosting_TemporalFeatures MAE: {test_mae_4:.4f}")

# Model 5: LightGBM with Broader Lagged Features, Cyclic Temporal Features, Interactions, and Tuning
lag_columns_7_days = [f"rides_t-{i}" for i in range(1, 7*24 + 1)]
lag_columns_7_days = [col for col in lag_columns_7_days if col in X_train[past_ride_columns].columns]
X_train_7_days = X_train[past_ride_columns][lag_columns_7_days]
X_test_7_days = X_test[past_ride_columns][lag_columns_7_days]

X_train_enhanced = X_train_7_days.copy()
X_test_enhanced = X_test_7_days.copy()

X_train_enhanced["hour"] = X_train["pickup_hour"].dt.hour
X_train_enhanced["day_of_week"] = X_train["pickup_hour"].dt.dayofweek
X_train_enhanced["month"] = X_train["pickup_hour"].dt.month
X_test_enhanced["hour"] = X_test["pickup_hour"].dt.hour
X_test_enhanced["day_of_week"] = X_test["pickup_hour"].dt.dayofweek
X_test_enhanced["month"] = X_test["pickup_hour"].dt.month

X_train_enhanced["hour_sin"] = np.sin(2 * np.pi * X_train_enhanced["hour"] / 24)
X_train_enhanced["hour_cos"] = np.cos(2 * np.pi * X_train_enhanced["hour"] / 24)
X_train_enhanced["day_of_week_sin"] = np.sin(2 * np.pi * X_train_enhanced["day_of_week"] / 7)
X_train_enhanced["day_of_week_cos"] = np.cos(2 * np.pi * X_train_enhanced["day_of_week"] / 7)
X_train_enhanced["month_sin"] = np.sin(2 * np.pi * X_train_enhanced["month"] / 12)
X_train_enhanced["month_cos"] = np.cos(2 * np.pi * X_train_enhanced["month"] / 12)
X_test_enhanced["hour_sin"] = np.sin(2 * np.pi * X_test_enhanced["hour"] / 24)
X_test_enhanced["hour_cos"] = np.cos(2 * np.pi * X_test_enhanced["hour"] / 24)
X_test_enhanced["day_of_week_sin"] = np.sin(2 * np.pi * X_test_enhanced["day_of_week"] / 7)
X_test_enhanced["day_of_week_cos"] = np.cos(2 * np.pi * X_test_enhanced["day_of_week"] / 7)
X_test_enhanced["month_sin"] = np.sin(2 * np.pi * X_test_enhanced["month"] / 12)
X_test_enhanced["month_cos"] = np.cos(2 * np.pi * X_test_enhanced["month"] / 12)

X_train_enhanced["is_winter"] = X_train["pickup_hour"].dt.month.isin([12, 1, 2]).astype(int)
X_test_enhanced["is_winter"] = X_test["pickup_hour"].dt.month.isin([12, 1, 2]).astype(int)

X_train_enhanced["is_weekend"] = X_train["pickup_hour"].dt.dayofweek.isin([5, 6]).astype(int)
X_test_enhanced["is_weekend"] = X_test["pickup_hour"].dt.dayofweek.isin([5, 6]).astype(int)

holiday_dates = [(12, 25), (12, 31), (1, 1)]
X_train_enhanced["is_holiday"] = X_train["pickup_hour"].apply(
    lambda x: 1 if (x.month, x.day) in holiday_dates else 0
)
X_test_enhanced["is_holiday"] = X_test["pickup_hour"].apply(
    lambda x: 1 if (x.month, x.day) in holiday_dates else 0
)

trend_lags = [f"rides_t-{i}" for i in range(1, 7*24 + 1)]
trend_lags = [col for col in lag_columns_7_days if col in X_train[past_ride_columns].columns]
X_train_enhanced["trend_7d"] = X_train[past_ride_columns][trend_lags].mean(axis=1)
X_test_enhanced["trend_7d"] = X_test[past_ride_columns][trend_lags].mean(axis=1)

X_train_enhanced["rides_t1_hour"] = X_train_enhanced["rides_t-1"] * X_train_enhanced["hour"]
X_test_enhanced["rides_t1_hour"] = X_test_enhanced["rides_t-1"] * X_test_enhanced["hour"]
X_train_enhanced["rides_t24_day_of_week"] = X_train_enhanced["rides_t-24"] * X_train_enhanced["day_of_week"]
X_test_enhanced["rides_t24_day_of_week"] = X_test_enhanced["rides_t-24"] * X_test_enhanced["day_of_week"]

param_grid = {
    "num_leaves": [31, 50],
    "max_depth": [10, 15],
    "learning_rate": [0.01, 0.05],
    "n_estimators": [200, 500]
}
model_5 = lgb.LGBMRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=model_5,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train_enhanced, y_train)

model_5 = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

predictions_5 = model_5.predict(X_test_enhanced)
test_mae_5 = mean_absolute_error(y_test, predictions_5)
print(f"LightGBM_EnhancedLags_CyclicTemporal_Interactions MAE: {test_mae_5:.4f}")

# Save all models to Hopsworks Model Registry
model_registry = project.get_model_registry()

# Model 1: Baseline Model (Previous Hour)
input_schema_1 = Schema(X_train)
output_schema_1 = Schema(targets)
model_schema_1 = ModelSchema(input_schema=input_schema_1, output_schema=output_schema_1)

model_1_registry = model_registry.sklearn.create_model(
    name="baseline_previous_hour",
    metrics={"test_mae": test_mae_1},
    description="Baseline model using the previous hour's ride count",
    input_example=X_train.sample(),
    model_schema=model_schema_1
)
model_1_path = config.MODELS_DIR / "baseline_previous_hour.pkl"
model_1_registry.save(str(model_1_path))

# Model 2: LightGBM with 28 Days Lags
input_schema_2 = Schema(X_train_28_days)
output_schema_2 = Schema(targets)
model_schema_2 = ModelSchema(input_schema=input_schema_2, output_schema=output_schema_2)

model_2_registry = model_registry.sklearn.create_model(
    name="lightgbm_28days_lags",
    metrics={"test_mae": test_mae_2},
    description="LightGBM model using all lag features for the past 28 days",
    input_example=X_train_28_days.sample(),
    model_schema=model_schema_2
)
model_2_path = config.MODELS_DIR / "lightgbm_28days_lags.pkl"
model_2_registry.save(str(model_2_path))

# Model 3: LightGBM with Feature Reduction (Top 10 Features)
input_schema_3 = Schema(X_train_reduced)
output_schema_3 = Schema(targets)
model_schema_3 = ModelSchema(input_schema=input_schema_3, output_schema=output_schema_3)

model_3_registry = model_registry.sklearn.create_model(
    name="lightgbm_top10_features",
    metrics={"test_mae": test_mae_3},
    description="LightGBM model with feature reduction to top 10 features",
    input_example=pd.DataFrame(X_train_reduced, columns=[f"feature_{i}" for i in range(X_train_reduced.shape[1])]).sample(),
    model_schema=model_schema_3
)
model_3_path = config.MODELS_DIR / "lightgbm_top10_features.pkl"
model_3_registry.save(str(model_3_path))

# Model 4: Gradient Boosting with Temporal Features
input_schema_4 = Schema(X_train_temp)
output_schema_4 = Schema(targets)
model_schema_4 = ModelSchema(input_schema=input_schema_4, output_schema=output_schema_4)

model_4_registry = model_registry.sklearn.create_model(
    name="gradient_boosting_temporal_features",
    metrics={"test_mae": test_mae_4},
    description="Gradient Boosting model with temporal features",
    input_example=X_train_temp.sample(),
    model_schema=model_schema_4
)
model_4_path = config.MODELS_DIR / "gradient_boosting_temporal_features.pkl"
model_4_registry.save(str(model_4_path))

# Model 5: LightGBM with Enhanced Lagged Features, Cyclic Temporal Features, and Interactions
input_schema_5 = Schema(X_train_enhanced)
output_schema_5 = Schema(targets)
model_schema_5 = ModelSchema(input_schema=input_schema_5, output_schema=output_schema_5)

model_5_registry = model_registry.sklearn.create_model(
    name="lightgbm_enhanced_lags_cyclic_temporal_interactions",
    metrics={"test_mae": test_mae_5},
    description="LightGBM model with enhanced lagged features, cyclic temporal features, and interactions",
    input_example=X_train_enhanced.sample(),
    model_schema=model_schema_5
)
model_5_path = config.MODELS_DIR / "lightgbm_enhanced_lags_cyclic_temporal_interactions.pkl"
model_5_registry.save(str(model_5_path))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2025-05-10 04:48:20,586 INFO: Initializing external client
2025-05-10 04:48:20,587 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:48:21,708 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Error creating feature view: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1225907/featurestores/1212511/featureview). Server response: 
HTTP code: 400, HTTP reason: Bad Request, body: b'{"errorCode":270179,"usrMsg":"Feature view: citi_bike_hourly_feature_view, version: 1","errorMsg":"The provided feature view name and version already exists"}', error code: 270179, error msg: The provided feature view name and version already exists, user msg: Feature view: citi_bike_hourly_feature_view, version: 1
Feature view 'citi_bike_hourly_feature_view' (version 1) retrieved successfully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.15s) 




Unnamed: 0,pickup_hour,start_station_name,rides
0,2025-04-11 08:00:00+00:00,11 Ave & W 41 St,16
1,2025-04-11 09:00:00+00:00,11 Ave & W 41 St,16
2,2025-04-11 10:00:00+00:00,11 Ave & W 41 St,14
3,2025-04-11 11:00:00+00:00,11 Ave & W 41 St,7
4,2025-04-11 12:00:00+00:00,11 Ave & W 41 St,14


'2024-01-01 00:00:00+00:00'

'2025-05-10 07:00:00+00:00'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34920 entries, 0 to 34919
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   pickup_hour         34920 non-null  datetime64[ns]
 1   start_station_name  34920 non-null  object        
 2   rides               34920 non-null  int32         
dtypes: datetime64[ns](1), int32(1), object(1)
memory usage: 682.2+ KB


(28584, 3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28584 entries, 0 to 28583
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   pickup_hour         28584 non-null  datetime64[ns]
 1   start_station_name  28584 non-null  object        
 2   rides               28584 non-null  int32         
dtypes: datetime64[ns](1), int32(1), object(1)
memory usage: 558.4+ KB


Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,start_station_name
0,4,9,2,0,2,0,0,0,1,2,...,7,4,6,8,10,7,2,0,2024-01-29 00:00:00,8 Ave & W 31 St
1,4,1,2,0,1,0,6,27,32,14,...,8,10,22,18,18,17,6,8,2024-01-29 23:00:00,8 Ave & W 31 St
2,3,1,0,1,0,0,1,6,19,45,...,6,17,17,27,19,17,7,13,2024-01-30 22:00:00,8 Ave & W 31 St
3,6,9,12,0,0,0,0,0,10,25,...,6,10,10,21,16,27,16,11,2024-01-31 21:00:00,8 Ave & W 31 St
4,7,14,8,2,1,0,0,0,1,5,...,5,11,11,18,25,34,34,14,2024-02-01 20:00:00,8 Ave & W 31 St


(1158, 675)

(1068, 674)
(1068,)
(90, 674)
(90,)
BaselineModelPreviousHour MAE: 3.5000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41880
[LightGBM] [Info] Number of data points in the train set: 1068, number of used features: 672
[LightGBM] [Info] Start training from score 17.041199


LightGBM_28DaysLags MAE: 2.8845
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41880
[LightGBM] [Info] Number of data points in the train set: 1068, number of used features: 672
[LightGBM] [Info] Start training from score 17.041199


Selected Features: ['rides_t-672', 'rides_t-504', 'rides_t-360', 'rides_t-336', 'rides_t-312', 'rides_t-192', 'rides_t-168', 'rides_t-144', 'rides_t-24', 'rides_t-1']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 622
[LightGBM] [Info] Number of data points in the train set: 1068, number of used features: 10
[LightGBM] [Info] Start training from score 17.041199


LightGBM_Top10Features MAE: 2.6830


GradientBoosting_TemporalFeatures MAE: 2.8426
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001838 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11210
[LightGBM] [Info] Number of data points in the train set: 1068, number of used features: 182
[LightGBM] [Info] Start training from score 17.041199


Best Parameters: {'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 200, 'num_leaves': 31}
LightGBM_EnhancedLags_CyclicTemporal_Interactions MAE: 2.8811


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/56 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/2309 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/51326 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1225907/models/baseline_previous_hour/4


Model(name: 'baseline_previous_hour', version: 4)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/306711 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/2323 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/51157 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1225907/models/lightgbm_28days_lags/4


Model(name: 'lightgbm_28days_lags', version: 4)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/278432 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/31 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/245 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1225907/models/lightgbm_top10_features/3


Model(name: 'lightgbm_top10_features', version: 3)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1039580 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/2444 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/51372 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1225907/models/gradient_boosting_temporal_features/3


Model(name: 'gradient_boosting_temporal_features', version: 3)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/572575 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/1121 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/13997 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1225907/models/lightgbm_enhanced_lags_cyclic_temporal_interactions/3


Model(name: 'lightgbm_enhanced_lags_cyclic_temporal_interactions', version: 3)

In [None]:
# Connect to Hopsworks
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

# Connect to the feature store
feature_store = project.get_feature_store()

# Retrieve the feature group
feature_group = feature_store.get_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION
)



In [None]:
# Create a feature view if it doesn't already exist
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
    print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) created successfully.")
except Exception as e:
    print(f"Error creating feature view: {e}")

# Retrieve the feature view
try:
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
    )
    print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) retrieved successfully.")
except Exception as e:
    print(f"Error retrieving feature view: {e}")

In [None]:
ts_data, _ = feature_view.training_data(
    description="Time-series hourly taxi rides"
)
ts_data = ts_data.sort_values(["pickup_location_id", "pickup_hour"]).reset_index(drop=True)
ts_data.head()
ts_data["pickup_hour"].min()
ts_data["pickup_hour"].max()
ts_data_copy = ts_data.copy()
ts_data["pickup_hour"] = pd.to_datetime(ts_data["pickup_hour"], errors="coerce")
ts_data.info()
ts_data["pickup_hour"] = ts_data["pickup_hour"].dt.tz_localize(None)
ts_data.info()
ts_data["year_month"] = ts_data["pickup_hour"].dt.to_period("M")  # Year-Month format
ts_data

In [None]:
# Group by year_month and count
hour_counts = ts_data.groupby("year_month").size()

# Plot the data
ax = hour_counts.plot(kind="bar", figsize=(10, 6), color="skyblue", edgecolor="black")  # Use 'ax' to store the plot object
ax.set_title("Number of Hours by Year/Month", fontsize=16)
ax.set_xlabel("Year-Month", fontsize=12)
ax.set_ylabel("Count of Hours", fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.grid(axis="y", linestyle="--", alpha=0.7)

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()

In [None]:
ts_data.head()
gte = ts_data["year_month"] >= pd.Period("2023-01", freq="M")
lte = ts_data["year_month"] <= pd.Period("2023-12", freq="M")
cond = gte & lte
filtered_data = ts_data[cond].reset_index(drop=True)
filtered_data.drop(columns=["year_month"], inplace=True) 
filtered_data.shape
ts_data = filtered_data
ts_data.info()
ts_data.head()["pickup_hour"].values

In [None]:
features, targets = transform_ts_data_info_features_and_target(ts_data, window_size=24*28, step_size=23)
features.sort_values(["pickup_location_id", "pickup_hour"]).head(5)
features[features["pickup_location_id"] == 2].head(5)
features_copy = features.copy()
features_targets = features.copy()
features_targets["target"] = targets
features_targets.shape

In [None]:
# Define the cutoff date as 28 days before today  
cutoff_date = pd.Timestamp(datetime.now() - timedelta(days=28))  
cutoff_date

In [None]:
X_train, y_train, X_test, y_test = split_time_series_data(
    features_targets,
    cutoff_date=cutoff_date,
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
X_train.info()
pipeline = get_pipeline()
pipeline.fit(features, targets)
predictions = pipeline.predict(features)
predictions
test_mae = mean_absolute_error(targets, predictions)
print(f"{test_mae:.4f}")

joblib.dump(pipeline, config.MODELS_DIR / "lgb_model.pkl")

In [None]:
input_schema = Schema(features)
output_schema = Schema(targets)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [None]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor",
    input_example=features.sample(),
    model_schema=model_schema,
    #include_files=["src/", "requirements.txt"]  # Include directories and files
)
# https://community.hopsworks.ai/t/attributeerror-windowspath-object-has-no-attribute-startswith/1003
# model.save(config.MODELS_DIR / 'lgb_model.pkl')
model.save(r'C:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500P1\models\lgb_model.pkl')

In [None]:
model = load_model_from_registry()
preds = get_model_predictions(model, features_copy)
preds.head(5)
test_mae = mean_absolute_error(targets, preds["predicted_demand"])
print(f"{test_mae:.4f}")