## Model Building

In [2]:
import random

import numpy as np
import pandas as pd
import sagemaker

In [3]:
# Seed the stdlib and NumPy global RNGs with the same fixed value so that
# random draws made in this kernel repeat from run to run.
random.seed(42)
np.random.seed(42)

In [4]:
# One shared SageMaker session: later cells read the region from it and
# pass it to the training estimator.
sagemaker_session = sagemaker.Session()

In [5]:
# S3 bucket and key prefix from which the training-data and model-output
# URIs are built.
s3_bucket = "sand-test-central-481423469601-us-east-1"
s3_prefix = "deepar_model"

# IAM role handed to the estimator and to the deployed model.
role = sagemaker.get_execution_role()
# NOTE(review): presumably a manual override for environments where no
# execution role can be resolved — confirm whether it is still needed.
#role = ""

### Trip Start

**Training**

In [6]:
# Region of the active session; used to look up the algorithm image.
region = sagemaker_session.boto_region_name

# Input data and training output for the trip-start model both live under
# s3://<bucket>/<prefix>/.
s3_data_path = f"s3://{s3_bucket}/{s3_prefix}/data_start_poc"
s3_output_path = f"s3://{s3_bucket}/{s3_prefix}/output_start_poc"

In [7]:
# Look up the container image URI of the built-in DeepAR algorithm for this region.
image_name = sagemaker.image_uris.retrieve(framework="forecasting-deepar", region=region)

In [11]:
# Generic Estimator around the DeepAR image: one training instance, with
# the job's output written under s3_output_path.
estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    output_path=s3_output_path,
    base_job_name="deepar-poc-start",
)

In [12]:
# Series are sampled every 15 minutes, i.e. 4 * 24 = 96 steps per day.
freq = "15min"
# Both windows span three days' worth of 15-minute steps (288).
context_length = 3 * (24 * 4)
prediction_length = 3 * (24 * 4)

In [13]:
# DeepAR training hyperparameters. Every value is supplied as a string;
# the two window lengths are converted with str() to match.
hyperparameters = dict(
    time_freq=freq,
    epochs="400",
    early_stopping_patience="40",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length=str(context_length),
    prediction_length=str(prediction_length),
)

In [14]:
# Attach the hyperparameters to the estimator before fit() is called.
estimator.set_hyperparameters(**hyperparameters)

In [15]:
%%time
data_channels = {"train": "{}/train_start/".format(s3_data_path), "test": "{}/test_start/".format(s3_data_path)}

estimator.fit(inputs = data_channels, wait = True)

2022-10-19 16:11:43 Starting - Starting the training job...
2022-10-19 16:12:06 Starting - Preparing the instances for trainingProfilerReport-1666195902: InProgress
.........
2022-10-19 16:13:28 Downloading - Downloading input data...
2022-10-19 16:14:07 Training - Downloading the training image..............[34mArguments: train[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  from collections import Mapping, MutableMapping, Sequence[0m
[34m[10/19/2022 16:16:25 INFO 139946820503360] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 'student-t', 'mini_batch_size': '128', 'num_cells': '40', 'num_dynamic_feat': 'auto', 'num_eval_samples': '100', 'num_layers': '2', 'test

**Prediction**

In [29]:
from deepar_model_utils import DeepARPredictor
from deepar_model_utils import get_station_data
from deepar_model_utils import prep_station_data

Reference for the following code: https://stackoverflow.com/questions/56255154/how-to-use-a-pretrained-model-from-s3-to-predict-some-data

TO DO:
- upload data to s3 bucket
- test to make sure series to predict is correct
- predict + plot predictions

In [26]:
# Rebuild the trained trip-start model from the artifact written by the
# 2022-10-19 training job, so prediction does not require retraining.
# predictor_cls is given to the Model constructor: Model.deploy() only
# returns a predictor object when the model was created with one.
# NOTE(review): assumes DeepARPredictor can be constructed as
# DeepARPredictor(endpoint_name, sagemaker_session), like
# sagemaker.predictor.Predictor — confirm in deepar_model_utils.
trip_start_model = sagemaker.model.Model(
    model_data = "{}/deepar-poc-start-2022-10-19-16-11-42-997/output/model.tar.gz".format(s3_output_path),
    image_uri = image_name,
    role = role,
    predictor_cls = DeepARPredictor,
    sagemaker_session = sagemaker_session,
)

# Deploy a real-time endpoint and keep the returned predictor: the prediction
# and clean-up cells use `predictor`, which was otherwise never defined.
# NOTE: the endpoint exists until the final clean-up cell deletes it.
predictor = trip_start_model.deploy(initial_instance_count = 1, instance_type = "ml.m5.large")

In [None]:
# Trip CSV for the start-station model, read directly from the S3 bucket.
# NOTE(review): "20208029" in the object key is not a valid yyyymmdd date
# (possibly meant 20200829) — confirm against the actual S3 object name.
start_file = "s3://{}/model_trips_start_station_20208029_20220831.csv".format(s3_bucket)

# parse_dates=True only asks pandas to parse the index, and no index column
# is set here, so it parsed nothing. Name the timestamp column instead so
# "starttime" is loaded as datetimes.
# NOTE(review): assumes the CSV has a "starttime" column, as the later
# prep_station_data call implies — confirm.
trips_start = pd.read_csv(start_file, parse_dates = ["starttime"])
trips_start.shape

In [None]:
# Station whose series is sent for prediction.
station = 177
# Trips are grouped and summed in increments of this size.
freq = "15min"
# Common end timestamp, so that every series stops at the same time.
max_date = "2022-08-28 23:45:00"

In [None]:
# Prepare the trips with the project helper, passing the station-id and
# start-time column names.
# NOTE(review): presumably groups trips per station/time and returns a
# "size" count column — confirm in deepar_model_utils.
trips_start_all_group = prep_station_data(trips_start, "start station id", "starttime")
# Sanity check: print the total of the "size" column.
print(sum(trips_start_all_group["size"]))

In [None]:
# Build the series for the chosen station, request the 0.10 / 0.5 / 0.90
# quantile forecasts from the predictor, and preview the first rows.
station_ts = get_station_data(trips_start_all_group, "start station id", "starttime", station, freq, max_date)
station_forecast = predictor.predict(ts = station_ts, quantiles = [0.10, 0.5, 0.90])
station_forecast.head()

In [None]:
# Tear down what the deployment created: the model first, then the endpoint.
predictor.delete_model()
predictor.delete_endpoint()