## Model Building with K-Means Clustered Time Series

In [None]:
import sagemaker
import numpy as np
import pandas as pd
import random

In [None]:
# Seed Python's and NumPy's global random number generators with the same
# fixed value so that repeated runs of the notebook are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# SageMaker session shared by the rest of the notebook: it supplies the AWS
# region below and is passed to both estimators.
sagemaker_session = sagemaker.Session()

In [None]:
# S3 bucket and key prefix under which the DeepAR data and model output paths
# are built.
# NOTE(review): the bucket name is hard-coded and appears to embed an AWS
# account id - consider moving it to configuration before sharing; confirm.
s3_bucket = "sand-test-central-481423469601-us-east-1"
s3_prefix = "deepar_model"

In [None]:
# Execution role handed to the estimators and models created in this notebook.
role = sagemaker.get_execution_role()

In [None]:
# AWS region of the session; used to look up the DeepAR container image.
region = sagemaker_session.boto_region_name

In [None]:
# URI of the built-in DeepAR forecasting container for the session's region.
image_name = sagemaker.image_uris.retrieve(
    framework="forecasting-deepar",
    region=region,
)

### Trip Start

In [None]:
# Input-data and model-output locations for the start-station model.
# Exactly one of the two variants below should be active; the suffix selects
# which clustering granularity the paths point at.

# 1 day clusters
#s3_data_path_start = f"s3://{s3_bucket}/{s3_prefix}/data_start_euc21"
#s3_output_path_start = f"s3://{s3_bucket}/{s3_prefix}/output_start_euc21"

# 15-min clusters
s3_data_path_start = f"s3://{s3_bucket}/{s3_prefix}/data_start_euc21_15min"
s3_output_path_start = f"s3://{s3_bucket}/{s3_prefix}/output_start_euc21_15min"

**Training**

For clusters at 1-day intervals: training takes about 1 hour (54 mins) and produces the following metrics on the test set:

- RMSE: 1.2711722886593542
- mean_absolute_QuantileLoss: 59961.946376903164
- mean_wQuantileLoss: 1.0011678751236084
- wQuantileLoss[0.1]: 0.20084385777026378
- wQuantileLoss[0.2]: 0.40159530110872704
- wQuantileLoss[0.3]: 0.6020017034606088
- wQuantileLoss[0.4]: 0.8020904218030295
- wQuantileLoss[0.5]: 1.001947622389905
- wQuantileLoss[0.6]: 1.201635075207404
- wQuantileLoss[0.7]: 1.4011514511140977
- wQuantileLoss[0.8]: 1.600361343000989
- wQuantileLoss[0.9]: 1.798884100257451

For clusters at 15-minute intervals: training takes just over 1 hour (70 mins) and produces the following metrics on the test set:

- RMSE: 1.2073422933655307
- mean_absolute_QuantileLoss: 59905.82156096457
- mean_wQuantileLoss: 1.0002307747439487
- wQuantileLoss[0.1]: 0.2007753558905601
- wQuantileLoss[0.2]: 0.40090675049026275
- wQuantileLoss[0.3]: 0.6009025786623517
- wQuantileLoss[0.4]: 0.8008060014892137
- wQuantileLoss[0.5]: 1.0006416589221518
- wQuantileLoss[0.6]: 1.2004270502086751
- wQuantileLoss[0.7]: 1.400073054592201
- wQuantileLoss[0.8]: 1.5994305009123087
- wQuantileLoss[0.9]: 1.7981140215278137

In [None]:
# Estimator for the start-station DeepAR training job: one ml.c5.2xlarge
# instance, with model artifacts written under s3_output_path_start.
start_estimator = sagemaker.estimator.Estimator(
    role=role,
    image_uri=image_name,
    sagemaker_session=sagemaker_session,
    base_job_name="deepar-euc21-start",
    output_path=s3_output_path_start,
    instance_type="ml.c5.2xlarge",
    instance_count=1,
)

In [None]:
# Series are sampled every 15 minutes, so 3 days * 24 hours * 4 steps per hour
# = 288 steps for both the context window and the forecast horizon.
freq = "15min"
context_length = prediction_length = 3 * 24 * 4

In [None]:
# Hyperparameters for the start-station training job. Every value is passed as
# a string; the window lengths come from the variables defined above.
start_hyperparameters = dict(
    time_freq=freq,
    epochs="400",
    early_stopping_patience="40",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length=str(context_length),
    prediction_length=str(prediction_length),
)

In [None]:
# Attach the hyperparameters defined above to the start-station estimator.
start_estimator.set_hyperparameters(**start_hyperparameters)

In [None]:
%%time
start_data_channels = {"train": "{}/train_start/".format(s3_data_path_start), "test": "{}/test_start/".format(s3_data_path_start)}

#start_estimator.fit(inputs = start_data_channels, wait = True)

**Prediction**

In [None]:
import matplotlib.pyplot as plt
from sagemaker.serverless import ServerlessInferenceConfig

from deepar_model_utils import DeepARPredictor
from deepar_model_utils import get_station_data
from deepar_model_utils import prep_station_data

Reference for following code: [stackoverflow](https://stackoverflow.com/questions/56255154/how-to-use-a-pretrained-model-from-s3-to-predict-some-data), [Model docs](https://sagemaker.readthedocs.io/en/stable/api/inference/model.html)

In [None]:
# Raw start-station trip records, read straight from the S3 bucket.
start_file = f"s3://{s3_bucket}/model_trips_start_station_20208029_20220831.csv"

# NOTE(review): parse_dates=True only asks pandas to parse the index, and no
# index_col is given here - confirm whether the date columns get parsed.
trips_start = pd.read_csv(
    start_file,
    parse_dates=True,
)
trips_start.shape

In [None]:
# Prepare the start-station trips with the project helper, then print the sum
# of the resulting "size" column as a quick sanity check.
trips_start_all_group = prep_station_data(
    trips_start,
    "start station id",
    "starttime",
)
total_start_size = sum(trips_start_all_group["size"])
print(total_start_size)

In [None]:
# Station-to-cluster assignments for start stations; keep exactly one of the
# two variants active, matching the S3 paths chosen earlier.

# 1 day clusters
#start_clusters = pd.read_csv(f"s3://{s3_bucket}/start_clusters_euc21.csv")

# 15-min clusters
start_clusters = pd.read_csv(f"s3://{s3_bucket}/start_clusters_euc21_15min.csv")

In [None]:
# Wrap the artifact of an already finished training job in a SageMaker Model
# so it can be deployed without retraining.
trip_start_model = sagemaker.model.Model(
    name="deepar-euc21-startmodel",
    role=role,
    image_uri=image_name,
    predictor_cls=DeepARPredictor,
    # 1 day clusters
    #model_data=f"{s3_output_path_start}/deepar-euc21-start-2022-10-30-15-24-15-866/output/model.tar.gz",
    # 15-min clusters
    model_data=f"{s3_output_path_start}/deepar-euc21-start-2022-10-30-18-31-32-130/output/model.tar.gz",
)

# Serverless inference configuration, used by the deploy call below.
serverless_config = ServerlessInferenceConfig()

# The deploy call is left commented out. `start_predictor` is only defined
# once it is enabled, and the prediction cells further down depend on it.
#start_predictor = trip_start_model.deploy(initial_instance_count=1,
#                                          instance_type="ml.m5.large",
#                                          endpoint_name="deepar-euc21-startendpoint",
#                                          serverless_inference_config=serverless_config)

In [None]:
# Show the model name on the first line and the endpoint name on the second.
print(
    trip_start_model.name,
    trip_start_model.endpoint_name,
    sep="\n",
)

In [None]:
# Station to inspect and the cluster value(s) assigned to it.
station = 68
station_cluster = start_clusters.loc[
    start_clusters["station id"] == station, "cluster"
].tolist()

# Trips are grouped and summed in increments of this size.
freq = "15min"

# Cut-off timestamps, so that all series end at the same time.
train_max_date = "2022-08-28 23:45:00"  # last timestamp of the training window
test_start = "2022-08-29 00:00:00"      # first timestamp of the test window
max_date = "2022-08-31 23:45:00"        # last timestamp of the full series

In [None]:
# stations to try: [177, 436, 572, 67]

# Observed counts: the series up to max_date, sliced to the test window.
observed_start = get_station_data(
    trips_start_all_group, "start station id", "starttime", station, freq, max_date
).loc[test_start:]["size"]

# Forecast quantiles, predicted from the series that ends at train_max_date.
predicted_start = start_predictor.predict(
    ts=get_station_data(
        trips_start_all_group, "start station id", "starttime", station, freq, train_max_date
    )["size"],
    cat=station_cluster,
    quantiles=[0.025, 0.9, 0.975],
)

# Lower and upper bounds of the shaded band.
p2_5 = predicted_start["0.025"]
p97_5 = predicted_start["0.975"]

# Draw order is kept: observed line, shaded band, then the 0.9 quantile line.
plt.figure(figsize=(12, 6))
observed_start.plot(label="observed")
plt.fill_between(
    p2_5.index,
    p2_5,
    p97_5,
    color="#808080",
    alpha=0.5,
    label="95% CI",
)
predicted_start["0.9"].plot(label="predicted")

plt.title(str(station))
plt.xlabel("date")
plt.ylabel("trip count")

plt.legend()
plt.show()

**Calculate RMSE**

In [None]:
from sklearn.metrics import mean_squared_error

- Quantile = 0.85; RMSE = 1.24
- Quantile = 0.90; RMSE = 1.24
- Quantile = 0.95; RMSE = 1.23
- Quantile = 0.99; RMSE = 2.66

In [None]:
# Pool the observed test-window counts and the forecasts at one quantile across
# every start station, then compute a single RMSE over the pooled values.
quantile = 0.99
y_true = []
y_pred = []

# Note: this loop rebinds the notebook-level `station` variable.
# NOTE(review): the y_true call passes `cluster = True, min_date = test_start`
# while the history passed to predict() does not - confirm this is intended.
for station in trips_start_all_group["start station id"].unique():
    y_true += get_station_data(trips_start_all_group, "start station id", "starttime", station, freq, max_date, cluster = True, min_date = test_start).loc[test_start:]["size"].tolist()
    y_pred += start_predictor.predict(ts = get_station_data(trips_start_all_group, "start station id", "starttime", station, freq, train_max_date)["size"], 
                                      cat = start_clusters[start_clusters["station id"] == station]["cluster"].tolist(),
                                      quantiles = [quantile])[str(quantile)].tolist()

# `squared = False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
# take the square root of the MSE explicitly; this works on every version.
np.sqrt(mean_squared_error(y_true, y_pred))

In [None]:
# Tear down the deployed resources so they stop incurring cost.
# NOTE(review): keep this order - the SDK appears to look up the model through
# the endpoint's configuration, so the model is deleted first; confirm.
start_predictor.delete_model()
start_predictor.delete_endpoint()

### Trip Stop

**Note: did not fit a cluster model for trip stop station because the model from start station was not better than the POC.**

In [None]:
# Input-data and model-output locations for the stop-station model.
s3_data_path_stop = f"s3://{s3_bucket}/{s3_prefix}/data_stop_euc21"
s3_output_path_stop = f"s3://{s3_bucket}/{s3_prefix}/output_stop_euc21"

**Training**

Takes about X hours to train (Y mins) and to produce the below metrics on the test set

In [None]:
# Estimator for the stop-station DeepAR training job: one ml.c5.2xlarge
# instance, with model artifacts written under s3_output_path_stop.
stop_estimator = sagemaker.estimator.Estimator(
    role=role,
    image_uri=image_name,
    sagemaker_session=sagemaker_session,
    base_job_name="deepar-euc21-stop",
    output_path=s3_output_path_stop,
    instance_type="ml.c5.2xlarge",
    instance_count=1,
)

In [None]:
# Series are sampled every 15 minutes, so 3 days * 24 hours * 4 steps per hour
# = 288 steps for both the context window and the forecast horizon.
freq = "15min"
context_length = prediction_length = 3 * 24 * 4

In [None]:
# Hyperparameters for the stop-station training job. Every value is passed as
# a string; the window lengths come from the variables defined above.
stop_hyperparameters = dict(
    time_freq=freq,
    epochs="400",
    early_stopping_patience="40",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length=str(context_length),
    prediction_length=str(prediction_length),
)

In [None]:
# Attach the hyperparameters defined above to the stop-station estimator.
stop_estimator.set_hyperparameters(**stop_hyperparameters)

In [None]:
%%time
stop_data_channels = {"train": "{}/train_stop/".format(s3_data_path_stop), "test": "{}/test_stop/".format(s3_data_path_stop)}

#stop_estimator.fit(inputs = stop_data_channels, wait = True)

**Prediction**

In [None]:
import matplotlib.pyplot as plt

from deepar_model_utils import DeepARPredictor
from deepar_model_utils import get_station_data
from deepar_model_utils import prep_station_data

Reference for the following code: [stackoverflow](https://stackoverflow.com/questions/56255154/how-to-use-a-pretrained-model-from-s3-to-predict-some-data), [sagemaker.Model docs](https://sagemaker.readthedocs.io/en/stable/api/inference/model.html)

In [None]:
# Raw stop-station trip records, read straight from the S3 bucket.
stop_file = f"s3://{s3_bucket}/model_trips_stop_station_20208029_20220831.csv"

# NOTE(review): parse_dates=True only asks pandas to parse the index, and no
# index_col is given here - confirm whether the date columns get parsed.
trips_stop = pd.read_csv(
    stop_file,
    parse_dates=True,
)
trips_stop.shape

In [None]:
# Prepare the stop-station trips with the project helper, then print the sum
# of the resulting "size" column as a quick sanity check.
trips_stop_all_group = prep_station_data(
    trips_stop,
    "end station id",
    "stoptime",
)
total_stop_size = sum(trips_stop_all_group["size"])
print(total_stop_size)

In [None]:
# Station-to-cluster assignments for stop stations.
# NOTE(review): this reads a relative local path, whereas the start-station
# clusters are read from the S3 bucket - confirm where this file lives.
stop_clusters = pd.read_csv("stop_clusters_euc21.csv")

In [None]:
# Model wrapper for the stop-station artifact.
# NOTE(review): the artifact path contains an empty segment ("//") where the
# start-station path has a training-job name - presumably a placeholder to
# fill in once a stop-station job has been trained; confirm.
trip_stop_model = sagemaker.model.Model(
    name="deepar-euc21-stopmodel",
    role=role,
    image_uri=image_name,
    predictor_cls=DeepARPredictor,
    model_data=f"{s3_output_path_stop}//output/model.tar.gz",
)

# The deploy call is left commented out. `stop_predictor` is only defined once
# it is enabled, and the prediction cells further down depend on it.
#stop_predictor = trip_stop_model.deploy(initial_instance_count=1,
#                                        instance_type="ml.m5.large",
#                                        endpoint_name="deepar-euc21-stopendpoint")

In [None]:
# Disabled: would print the stop model's name and endpoint name.
#print(trip_stop_model.name) # model name
#print(trip_stop_model.endpoint_name) # endpoint name

In [None]:
# Station to inspect and the cluster value(s) assigned to it.
station = 67
station_cluster = stop_clusters.loc[
    stop_clusters["station id"] == station, "cluster"
].tolist()

# Trips are grouped and summed in increments of this size.
freq = "15min"

# Cut-off timestamps, so that all series end at the same time.
train_max_date = "2022-08-28 23:45:00"  # last timestamp of the training window
test_start = "2022-08-29 00:00:00"      # first timestamp of the test window
max_date = "2022-08-31 23:45:00"        # last timestamp of the full series

In [None]:
# stations to try: [177, 436, 572, 67]
# note: station 572 results in error because there is no data for that station prior to 8/29/2022

# Observed counts: the series up to max_date, sliced to the test window.
observed_stop = get_station_data(
    trips_stop_all_group, "end station id", "stoptime", station, freq, max_date
).loc[test_start:]["size"]

# Forecast quantiles, predicted from the series that ends at train_max_date.
predicted_stop = stop_predictor.predict(
    ts=get_station_data(
        trips_stop_all_group, "end station id", "stoptime", station, freq, train_max_date
    )["size"],
    cat=station_cluster,
    quantiles=[0.025, 0.9, 0.975],
)

# Lower and upper bounds of the shaded band.
p2_5 = predicted_stop["0.025"]
p97_5 = predicted_stop["0.975"]

# Draw order is kept: observed line, shaded band, then the 0.9 quantile line.
plt.figure(figsize=(12, 6))
observed_stop.plot(label="observed")
plt.fill_between(
    p2_5.index,
    p2_5,
    p97_5,
    color="#808080",
    alpha=0.5,
    label="95% CI",
)
predicted_stop["0.9"].plot(label="predicted")

plt.title(str(station))
plt.xlabel("date")
plt.ylabel("trip count")

plt.legend()
plt.show()

In [None]:
# Tear down the deployed resources so they stop incurring cost.
# NOTE(review): keep this order - the SDK appears to look up the model through
# the endpoint's configuration, so the model is deleted first; confirm.
stop_predictor.delete_model()
stop_predictor.delete_endpoint()