In [58]:
%reload_ext autoreload
%autoreload 2

In [59]:
import src.config as config

In [60]:
import hopsworks

# Log in to the Hopsworks project. The project name and the API key are read
# from `src.config`, so no credential is written into the notebook source.
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)
# Bare last expression: shows the returned project handle as the cell output.
project









In [61]:
# Get a handle to the feature store of the project we logged in to above
feature_store = project.get_feature_store()
# Bare last expression: shows the feature store handle as the cell output.
feature_store



In [62]:
# Look up the feature group by the name and version configured in `src.config`.
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)
# Bare last expression: shows the feature group handle as the cell output.
feature_group



In [63]:
# Create the feature view over every column of the feature group.
# Creation is best-effort: the expected reason for a failure is that the view
# already exists from an earlier run, and the notebook then carries on.
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except Exception as err:
    # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, and echoing the error keeps a genuine failure
    # (bad credentials, network, ...) from being mistaken for "already exists".
    print(
        "Feature view not created (most likely it already exists). Skip creation. "
        f"Details: {type(err).__name__}: {err}"
    )



In [64]:
# Load the feature view identified by the configured name and version.
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)
# Bare last expression: shows the feature view handle as the cell output.
feature_view



In [65]:
# Fetch the time-series data from the feature store through the feature view.
# `training_data` returns a pair; only the first element is kept here and the
# second one is discarded via `_`.
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides'
)





In [66]:
# Show the data exactly as fetched, before any column is dropped or rows are sorted
ts_data



In [67]:
# Drop the `pickup_ts` column and order the rows by `pickup_location_id`,
# then by `pickup_hour`.
# Written as a chain without `inplace=True`, and with `errors='ignore'`, so the
# cell can be re-run: the in-place version raised KeyError on a second run
# because `pickup_ts` had already been removed from the frame.
ts_data = (
    ts_data
    .drop(columns=['pickup_ts'], errors='ignore')
    .sort_values(by=['pickup_location_id', 'pickup_hour'])
)
ts_data



In [68]:
# from src.plot import plot_ts
from typing import Optional, List
import pandas as pd
import plotly.express as px 

def plot_ts(
    ts_data: pd.DataFrame,
    locations: Optional[List[int]] = None
    ):
    """
    Draw one `rides` line per pickup location over `pickup_hour`.

    Args:
        ts_data: frame with `pickup_hour`, `rides` and `pickup_location_id` columns.
        locations: pickup location ids to keep; when omitted (or empty) every
            location found in `ts_data` is drawn.

    Returns:
        None. The figure is displayed as a side effect.
    """
    if locations:
        # keep only the rows that belong to the requested locations
        location_mask = ts_data.pickup_location_id.isin(locations)
        rides_to_plot = ts_data[location_mask]
    else:
        rides_to_plot = ts_data

    line_options = dict(
        x="pickup_hour",
        y="rides",
        color='pickup_location_id',
        template='none',
    )
    px.line(rides_to_plot, **line_options).show()

# Visual check of the series for a single pickup location (id 43)
plot_ts(ts_data, locations=[43])



### Converting time-series data into features and targets

In [69]:
from src.data import transform_ts_data_into_features_and_target

# Build the model inputs from the raw series.
# input_seq_len is 28 days of hourly values (the "one month" window).
features, targets = transform_ts_data_into_features_and_target(
    ts_data, input_seq_len=28 * 24, step_size=23
)

# One frame holding the features (copied, `features` itself is left untouched)
# with the target attached as the `target_rides_next_hour` column.
features_and_target = features.assign(target_rides_next_hour=targets)

print(f'{features_and_target.shape=}')







In [70]:
# Inspect the combined features + target frame
features_and_target



In [71]:
# Check the column dtypes (the next cell converts `pickup_hour` with pd.to_datetime)
features_and_target.dtypes



In [72]:
from datetime import datetime, timedelta

# Convert `pickup_hour` to pandas datetimes, overwriting the column in place.
# NOTE(review): presumably required so the column can be compared with the
# timestamp cutoff used for the train/test split further down — confirm.
features_and_target['pickup_hour'] = pd.to_datetime(features_and_target['pickup_hour'])

In [73]:
from datetime import date, timedelta, datetime
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# training data -> from January 2022 up until 4 months ago
# testing data -> last 3 months

# cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*6), utc=True)



# Train/test boundary: midnight UTC, 4 x 28 days before today.
# Built directly from a timezone-aware "now": `datetime.utcnow()` is deprecated
# since Python 3.12 and returns a naive value that had to be truncated to
# midnight and localized by hand. `normalize()` sets the time to 00:00:00 and
# keeps the UTC timezone, so the resulting timestamp is the same as before.
cutoff_date = pd.Timestamp.now(tz='UTC').normalize() - pd.Timedelta(days=28*4)
print(f'{cutoff_date=}')

# real_current_date = datetime.utcnow()

# simulated_current_date = real_current_date - timedelta(days=365)
# simulated_current_date = pd.to_datetime(simulated_current_date).floor('H')
# print(f"Simulated current date: {simulated_current_date}")

# # Ensure simulated_current_date has the same timezone as pickup_hour
# simulated_current_date = simulated_current_date.tz_localize('UTC')

# # Filter out any data after the simulated current date
# features_and_target = features_and_target[features_and_target['pickup_hour'] < simulated_current_date]

# # Calculate the training/testing cutoff (4 months before simulated date)
# cutoff_date = simulated_current_date - timedelta(days=28*4)
# cutoff_date = pd.to_datetime(cutoff_date.replace(hour=0, minute=0, second=0, microsecond=0))
# print(f"Training/testing cutoff date: {cutoff_date}")





In [74]:
# Split the frame at `cutoff_date` with the project helper, which hands back
# the feature and target parts of the train and test sets.
X_train, y_train, X_test, y_test = train_test_split(
    features_and_target, cutoff_date, target_column_name='target_rides_next_hour'
)

# Report the size of each part, one per line
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)



### Training LightGBM model

In [75]:
from src.model import get_pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import numpy as np
import optuna

def objective(trial: optuna.trial.Trial) -> float:
    """
    Optuna objective: mean validation MAE of one hyper-parameter candidate.

    The candidate is sampled from `trial` and scored with a 4-fold
    `TimeSeriesSplit` over the notebook-level `X_train` / `y_train`, after
    ordering both of them by `pickup_hour`. Neither global is modified.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean of the per-fold mean absolute errors (lower is better).
    """

    # pick hyper-parameters
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    # Order the rows chronologically so that each TimeSeriesSplit fold validates
    # on rows that come after the ones it trains on.
    # The order is computed as row positions and applied to BOTH X and y:
    # sorting only X_train (and in place) left y_train in its previous order, so
    # the positional `.iloc` lookups below paired features with the wrong
    # targets, and the reordered X_train also leaked into the later cells that
    # fit on (X_train, y_train).
    chronological_order = (
        X_train['pickup_hour']
        .reset_index(drop=True)      # index == row position
        .sort_values(kind='stable')  # stable: rows sharing an hour keep their relative order
        .index
        .to_numpy()
    )
    X_sorted = X_train.iloc[chronological_order]
    y_sorted = y_train.iloc[chronological_order]

    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    for train_index, val_index in tss.split(X_sorted):

        # split data for training and validation
        X_train_, X_val_ = X_sorted.iloc[train_index, :], X_sorted.iloc[val_index, :]
        y_train_, y_val_ = y_sorted.iloc[train_index], y_sorted.iloc[val_index]

        # train a fresh pipeline for this fold
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # evaluate it on the held-out part of the fold
        y_pred = pipeline.predict(X_val_)
        scores.append(mean_absolute_error(y_val_, y_pred))

    # return the mean score across folds
    return float(np.mean(scores))

In [76]:
# Seed the sampler so that re-running the notebook proposes the same
# hyper-parameters. TPESampler is the sampler Optuna already uses by default
# for a single-objective study, so only the seeding changes.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# NOTE(review): with n_trials=1 only a single sampled candidate is evaluated;
# raise it for an actual search.
study.optimize(objective, n_trials=1)





In [77]:
# Hyper-parameters of the best trial in the study (direction is "minimize",
# so this is the trial with the lowest objective value)
best_params = study.best_trial.params
print(f'{best_params=}')



In [78]:
# Build a fresh pipeline from the best hyper-parameters and fit it on the
# whole training set.
pipeline = get_pipeline(**best_params)
# NOTE(review): presumably `fit` pairs X_train and y_train row by row, so both
# must be in the same row order at this point — confirm.
pipeline.fit(X_train, y_train)



In [79]:
# Score the fitted pipeline on the held-out test set
predictions = pipeline.predict(X_test)
# Mean absolute error between the true test targets and the predictions
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')



In [80]:
import joblib
from src.paths import MODELS_DIR

# Persist the fitted pipeline to disk as `model.pkl` inside MODELS_DIR
joblib.dump(pipeline, MODELS_DIR / 'model.pkl')



### Hopsworks model schema - format of the input and output data
Here we provide examples of the input and output schemas, using `X_train` and `y_train`.

In [81]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Input schema is built from the training features, output schema from the
# training target.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
# Bundle both into the model schema object used when creating the registry entry
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

### Hopsworks Model Registry

In [82]:
# Get a handle to the project's model registry
model_registry = project.get_model_registry()

# Describe the model: name, test metric, a one-row input example drawn at
# random from X_train, and the input/output schema.
model = model_registry.sklearn.create_model(
    name="taxi_demand_forecaster_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

# Save the model entry from the `model.pkl` file in MODELS_DIR
model.save(str(MODELS_DIR / 'model.pkl'))











