## Model Building

In [None]:
import sagemaker
import numpy as np
import pandas as pd
import random

In [None]:
# Seed Python's built-in RNG and NumPy's global RNG with the same value so that
# any randomness drawn locally in this notebook is repeatable after a kernel restart.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# SageMaker session object: read below for the region name and passed to the estimator.
sagemaker_session = sagemaker.Session()

In [None]:
# Bucket and key prefix from which every S3 data/output path below is built.
# NOTE(review): s3_bucket is an empty string here — presumably a redacted
# placeholder; fill it in before running, otherwise the paths become "s3:///...".
s3_bucket = ""
s3_prefix = "deepar_model"

# IAM role handed to the estimator and the model further down.
role = sagemaker.get_execution_role()
# Manual override kept for reference (presumably a role ARN when the execution
# role cannot be looked up automatically — confirm).
#role = ""

In [None]:
# Region of the active session; needed to look up the algorithm image.
region = sagemaker_session.boto_region_name

# Trip-start model: where its data lives and where training jobs write output.
s3_data_path_start = f"s3://{s3_bucket}/{s3_prefix}/data_start_poc"
s3_output_path_start = f"s3://{s3_bucket}/{s3_prefix}/output_start_poc"

# Trip-stop model: same layout under its own sub-prefixes.
s3_data_path_stop = f"s3://{s3_bucket}/{s3_prefix}/data_stop_poc"
s3_output_path_stop = f"s3://{s3_bucket}/{s3_prefix}/output_stop_poc"

In [None]:
# Look up the container image URI for the "forecasting-deepar" algorithm in this region;
# the same image is used for both the training estimator and the hosted model.
image_name = sagemaker.image_uris.retrieve("forecasting-deepar", region)

### Trip Start

**Training**

- Training takes about 1 hour and produces the following metrics on the test set:
- RMSE: 1.2466856959864956
- mean_absolute_QuantileLoss: 59924.505764324516
- mean_wQuantileLoss: 1.000542739670148
- wQuantileLoss[0.1]: 0.20091203062302534
- wQuantileLoss[0.2]: 0.40111006030543794
- wQuantileLoss[0.3]: 0.6011151921433796
- wQuantileLoss[0.4]: 0.8009969021696943
- wQuantileLoss[0.5]: 1.0008156214438477
- wQuantileLoss[0.6]: 1.2006446410813532
- wQuantileLoss[0.7]: 1.4003967124746162
- wQuantileLoss[0.8]: 1.599928056774745
- wQuantileLoss[0.9]: 1.7989654400152346

In [None]:
# Estimator for the trip-start training job: one ml.c5.2xlarge instance running
# the DeepAR image, with job output written under s3_output_path_start.
start_estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    base_job_name="deepar-poc-start",
    output_path=s3_output_path_start,
)

In [None]:
# Series are bucketed in 15-minute steps: 4 steps per hour, 24 hours per day.
freq = "15min"
horizon_days = 3
# The model both looks back and forecasts over the same number of steps.
context_length = prediction_length = 4 * 24 * horizon_days

In [None]:
# Hyperparameters for the trip-start training job; every value is passed as a string.
start_hyperparameters = dict(
    time_freq = freq,
    epochs = "400",
    early_stopping_patience = "40",
    mini_batch_size = "64",
    learning_rate = "5E-4",
    context_length = str(context_length),
    prediction_length = str(prediction_length),
)

In [None]:
# Attach the hyperparameter dict to the trip-start estimator before fitting.
start_estimator.set_hyperparameters(**start_hyperparameters)

In [None]:
%%time
start_data_channels = {"train": "{}/train_start/".format(s3_data_path_start), "test": "{}/test_start/".format(s3_data_path_start)}

#start_estimator.fit(inputs = start_data_channels, wait = True)

**Prediction**

In [None]:
import matplotlib.pyplot as plt

from deepar_model_utils import DeepARPredictor
from deepar_model_utils import get_station_data
from deepar_model_utils import prep_station_data

References for the following code: [Stack Overflow](https://stackoverflow.com/questions/56255154/how-to-use-a-pretrained-model-from-s3-to-predict-some-data), [Model docs](https://sagemaker.readthedocs.io/en/stable/api/inference/model.html)

In [None]:
# Trip-start records are read directly from the project bucket.
# NOTE(review): "20208029" in the file name looks like a transposed date
# (20200829?); it is left untouched because it must match the real object key.
start_file = f"s3://{s3_bucket}/model_trips_start_station_20208029_20220831.csv"

# NOTE(review): parse_dates=True only asks pandas to parse the index; with no
# index_col given, presumably no column is converted to datetime here — confirm
# that prep_station_data takes care of the "starttime" column.
trips_start = pd.read_csv(start_file, parse_dates=True)
trips_start.shape

In [None]:
# Prepare the raw trips via the project helper, keyed on start station and start time.
trips_start_all_group = prep_station_data(trips_start, "start station id", "starttime")

# Sanity check: print the sum of the "size" column of the prepared data.
total_size = sum(trips_start_all_group["size"])
print(total_size)

In [None]:
# Wrap the artifact of the finished trip-start training job as a deployable model.
# The job was configured with output_path = s3_output_path_start, so the artifact
# lives under that prefix (the previous code referenced an undefined
# `s3_output_path`, which raised NameError).
trip_start_model = sagemaker.model.Model(
    model_data = "{}/deepar-poc-start-2022-10-19-16-11-42-997/output/model.tar.gz".format(s3_output_path_start),
    image_uri = image_name,
    role = role,
    predictor_cls = DeepARPredictor)

# NOTE(review): `start_predictor` is only created by the deploy call below, which
# is commented out; the prediction and clean-up cells further down use it, so this
# line must be run first. It is left disabled here because it provisions a hosted
# endpoint.
#start_predictor = trip_start_model.deploy(initial_instance_count = 1, instance_type = "ml.m5.large")

In [None]:
#trip_start_model.name # model name
#trip_start_model.endpoint_name # endpoint name

In [None]:
# Station to forecast and the bucket size used when grouping and summing its trips.
station = 572
freq = "15min"

# Date boundaries, in chronological order. They are consumed by the plotting cell:
# train_max_date and max_date are passed to get_station_data as the point where a
# series should end (so all series end at the same time), and the observed series
# is sliced from train_start onward.
train_max_date = "2022-08-28 23:45:00"
train_start = "2022-08-29 00:00:00"
max_date = "2022-08-31 23:45:00"

In [None]:
# stations to try: [177, 436, 572, 67]

# Observed counts: the station's series up to max_date, sliced from train_start onward.
observed_full = get_station_data(trips_start_all_group, "start station id", "starttime", station, freq, max_date)
observed_start = observed_full.loc[train_start:]["size"]

# Predictor input: the same station's series, but ending at train_max_date.
history_start = get_station_data(trips_start_all_group, "start station id", "starttime", station, freq, train_max_date)["size"]
predicted_start = start_predictor.predict(
    ts = history_start,
    quantiles = [0.025, 0.9, 0.975],
)

plt.figure(figsize = (12, 6))
observed_start.plot(label = "observed")

# Shade the band between the 2.5% and 97.5% quantile forecasts.
p2_5, p97_5 = predicted_start["0.025"], predicted_start["0.975"]
plt.fill_between(p2_5.index, p2_5, p97_5, color = "#808080", alpha = 0.5, label = "95% CI")

# The line drawn as "predicted" is the 0.9 quantile forecast.
predicted_start["0.9"].plot(label = "predicted")

plt.xlabel("date")
plt.ylabel("trip count")
plt.title(str(station))

plt.legend()
plt.show()

In [None]:
# Tear down the hosted resources behind the trip-start predictor.
# NOTE(review): keep delete_model() ahead of delete_endpoint() — presumably the
# SDK resolves the model through the still-existing endpoint; confirm against the
# SageMaker Predictor documentation before reordering.
start_predictor.delete_model()
start_predictor.delete_endpoint()

### Trip Stop

**Training**