In [1]:
import sys
from pathlib import Path

# Add project root to sys.path so `src/` can be imported
sys.path.append(str(Path("..").resolve()))


In [2]:
import hopsworks
from datetime import datetime, timedelta
import pandas as pd
from src import config
from src.data_utils import transform_ts_data_info_features_and_target, split_time_series_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

feature_store = project.get_feature_store()
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)


2025-05-10 00:48:28,282 INFO: Initializing external client
2025-05-10 00:48:28,283 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 00:48:29,200 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1228959


In [4]:
# === 2. Create feature view if it doesn't exist ===
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
    print(f"✅ Created feature view: {config.FEATURE_VIEW_NAME} v{config.FEATURE_VIEW_VERSION}")
except Exception as e:
    print(f"⚠️ Feature view already exists or failed to create: {e}")


Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1228959/fs/1213525/fv/citibike_hourly_features/version/1
✅ Created feature view: citibike_hourly_features v1


In [21]:
# === 3. Retrieve feature view and training data ===
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

ts_data, _ = feature_view.training_data(
    description="Time-series hourly Citi Bike rides"
)


2025-05-10 01:01:11,506 ERROR: Peer CitiBikeRideForecast__vidyuthk is not known. Please register client certificates first.. Detail: Python exception: FlyingDuckException. gRPC client debug context: UNKNOWN:Error received from peer ipv4:51.79.26.27:5005 {created_time:"2025-05-10T05:01:11.4645448+00:00", grpc_status:2, grpc_message:"Peer CitiBikeRideForecast__vidyuthk is not known. Please register client certificates first.. Detail: Python exception: FlyingDuckException"}. Client context: IOError: Server never sent a data message. Detail: Internal
Traceback (most recent call last):
  File "c:\Users\vidyu\AppData\Local\Programs\Python\Python311\Lib\site-packages\hsfs\core\arrow_flight_client.py", line 394, in afs_error_handler_wrapper
    return func(instance, *args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vidyu\AppData\Local\Programs\Python\Python311\Lib\site-packages\hsfs\core\arrow_flight_client.py", line 459, in read_query
    return self._get_dataset(
         



In [22]:
ts_data = ts_data.rename(columns={
    "start_hour": "pickup_hour",
    "start_station_id": "pickup_location_id"
})
ts_data = ts_data.sort_values(["pickup_location_id", "pickup_hour"]).reset_index(drop=True)

# === 4. Transform to tabular features/targets ===
features, targets = transform_ts_data_info_features_and_target(
    ts_data, window_size=24 * 28, step_size=24
)

In [23]:
# === 4. Transform to tabular features/targets ===
features, targets = transform_ts_data_info_features_and_target(
    ts_data, window_size=24 * 28, step_size=24
)


In [24]:
features_df, targets = transform_ts_data_info_features_and_target(ts_data)
features_df["target"] = targets

features_df["pickup_hour"] = pd.to_datetime(features_df["pickup_hour"])

cutoff_date = pd.Timestamp(datetime.now(), tz="UTC") - pd.Timedelta(days=28)

X_train, y_train, X_test, y_test = split_time_series_data(
    features_df,
    cutoff_date=cutoff_date,
    target_column="target"
)


In [25]:
# === 6. Preview shapes ===
print("Shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

Shapes:
X_train: (1014, 674)
y_train: (1014,)
X_test: (0, 674)
y_test: (0,)


In [26]:
lag_cols = [col for col in X_train.columns if col.startswith("rides_t-")]
X_train[lag_cols] = X_train[lag_cols].apply(pd.to_numeric, errors="coerce")


In [28]:
y_train = pd.to_numeric(y_train, errors="coerce")


In [29]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import joblib
from pathlib import Path
import numpy as np

# Train the model
model = LGBMRegressor(random_state=42)
model.fit(X_train.drop(columns=["pickup_hour", "pickup_location_id"]), y_train)

# Predict (on train set only, since test is empty for now)
train_preds = np.round(model.predict(X_train.drop(columns=["pickup_hour", "pickup_location_id"]))).astype(int)
train_mae = mean_absolute_error(y_train, train_preds)
print(f"📉 MAE on training set: {train_mae:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22808
[LightGBM] [Info] Number of data points in the train set: 1014, number of used features: 672
[LightGBM] [Info] Start training from score 0.565089
📉 MAE on training set: 0.0355


In [30]:
# Save the trained model
Path("models").mkdir(exist_ok=True)
joblib.dump(model, "models/lgbm_model_28day.pkl")
print("✅ Model saved to models/lgbm_model_28day.pkl")


✅ Model saved to models/lgbm_model_28day.pkl


In [32]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from hsml.model import Model
from datetime import datetime

In [35]:
import os
# === 1. Login ===
project = hopsworks.login(
    project=os.getenv("HOPSWORKS_PROJECT_NAME"),
    api_key_value=os.getenv("HOPSWORKS_API_KEY")
)

model_registry = project.get_model_registry()

2025-05-10 01:06:16,800 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 01:06:16,805 INFO: Initializing external client
2025-05-10 01:06:16,805 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 01:06:17,545 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1228959


In [36]:
# === 3. Define input/output schema ===
input_schema = Schema(X_train.drop(columns=["pickup_hour", "pickup_location_id"]))
output_schema = Schema(pd.DataFrame(y_train))

model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [37]:
# === 4. Register ===
model_name = "citibike_hourly_predictor"

registered_model = model_registry.python.create_model(
    name=model_name,
    metrics={"mae": float(train_mae)},
    model_schema=model_schema,
    description="LGBM trained on 28 days of hourly Citi Bike data"
)

registered_model.save("models/lgbm_model_28day.pkl")
print(f"✅ Registered model '{model_name}' to Hopsworks")

Uploading: 100.000%|██████████| 312112/312112 elapsed<00:02 remaining<00:00  1.43it/s]
Uploading: 100.000%|██████████| 51157/51157 elapsed<00:01 remaining<00:0007,  1.99s/it]
Model export complete: 100%|██████████| 6/6 [00:12<00:00,  2.02s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/1228959/models/citibike_hourly_predictor/1
✅ Registered model 'citibike_hourly_predictor' to Hopsworks



