## Part VII.2 - Train DeepAR model and publish to model store

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

In [1]:
# Run the shared environment-setup notebook before anything else in this notebook.
# NOTE(review): names used further down without being defined here (sagemaker, sess, role,
# region, sm, pd, s3_deepar_gold_dataset_path, deepar_*) presumably come from this setup
# run -- confirm against 0-Environment_Setup.ipynb.
%run 0-Environment_Setup.ipynb

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


In [18]:
import json
from time import gmtime, strftime

In [19]:
# Look up the DeepAR container image for the current region, and place the
# training output under an "output" prefix next to the gold dataset.
image_name = sagemaker.image_uris.retrieve(framework="forecasting-deepar", region=region)
s3_output_path = f"{s3_deepar_gold_dataset_path}/output"

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


### Run training job with vanilla hyperparameters

In [20]:
# Initialize the estimator for the baseline ("vanilla") DeepAR training run.
# SageMaker Python SDK v2 renamed `train_instance_count` / `train_instance_type` to
# `instance_count` / `instance_type`; the new names are used here so the cell no longer
# triggers the "renamed in sagemaker>=2" deprecation warning.
estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    sagemaker_session=sess,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    base_job_name="store-sales-forecasting-deepar",
    output_path=s3_output_path,
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [21]:
# Baseline ("vanilla") hyperparameters for the first training run.
# Every numeric value is supplied as a string; the frequency, context length and
# prediction length are taken from variables defined outside this cell.
hyperparameters = {
    # series description
    "time_freq": deepar_freq,
    # optimisation settings
    "epochs": "400",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    # window sizes, converted to strings
    "context_length": f"{deepar_context_length}",
    "prediction_length": f"{deepar_prediction_length}",
}

In [22]:
# Attach the baseline hyperparameters to the estimator; the dict is unpacked so each
# key is passed as its own keyword argument.
estimator.set_hyperparameters(**hyperparameters)

In [23]:
# Train the baseline model: the "train" channel reads the train/ prefix and the
# "test" channel reads the val/ prefix of the gold dataset.
data_channels = {
    "train": f"{s3_deepar_gold_dataset_path}/train/",
    "test": f"{s3_deepar_gold_dataset_path}/val/",
}
# wait=True blocks this cell until the training job finishes.
# NOTE(review): Estimator.fit() is not documented to return a value, so `history`
# is presumably None -- confirm before relying on it in later cells.
history = estimator.fit(inputs=data_channels, wait=True)

INFO:sagemaker:Creating training-job with name: store-sales-forecasting-deepar-2024-10-01-04-02-13-037


2024-10-01 04:02:15 Starting - Starting the training job...
2024-10-01 04:02:30 Starting - Preparing the instances for training...
2024-10-01 04:03:09 Downloading - Downloading input data...
2024-10-01 04:03:29 Downloading - Downloading the training image........................
2024-10-01 04:07:32 Training - Training image download completed. Training in progress..Docker entrypoint called with argument(s): train
Running default environment configuration script
Running custom environment configuration script
  if num_device is 1 and 'dist' not in kvstore:
[10/01/2024 04:07:44 INFO 140492683822912] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 'student-t', 'mini_batch_size': 

### Hyperparameter Tuning

In [29]:
from sagemaker.tuner import HyperparameterTuner
from sagemaker.tuner import ContinuousParameter, IntegerParameter

In [38]:
# Initialize the estimator used as the template for hyperparameter tuning.
# Only the hyperparameters that stay fixed across tuning jobs (frequency and prediction
# length) are set here; the rest are supplied through the tuner's search ranges.
# SageMaker Python SDK v2 renamed `train_instance_count` / `train_instance_type` to
# `instance_count` / `instance_type`; the new names are used here so the cell no longer
# triggers the "renamed in sagemaker>=2" deprecation warning.
tuned_estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    sagemaker_session=sess,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    base_job_name="store-sales-forecasting-deepar",
    output_path=s3_output_path,
    hyperparameters={"time_freq": deepar_freq, "prediction_length": str(deepar_prediction_length)},
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [40]:
# Search space explored by the tuner: integer ranges for the epoch count, context length,
# mini-batch size and cell count, plus a log-scaled continuous range for the learning rate.
hyperparameter_ranges = {
    "epochs": IntegerParameter(min_value=50, max_value=1000),
    "context_length": IntegerParameter(min_value=1, max_value=200),
    "mini_batch_size": IntegerParameter(min_value=32, max_value=1028),
    "learning_rate": ContinuousParameter(min_value=1e-5, max_value=0.1, scaling_type="Logarithmic"),
    "num_cells": IntegerParameter(min_value=30, max_value=200),
}

In [41]:
# Launch the hyperparameter search: minimise test:RMSE over at most 10 training jobs,
# running 2 at a time, using the ranges and template estimator defined earlier.
training_job_name = "deepar-hyperparameter-tuning-job"  # used as the base name of the tuning job
tuner = HyperparameterTuner(
    estimator=tuned_estimator,
    objective_metric_name="test:RMSE",
    hyperparameter_ranges=hyperparameter_ranges,
    # regex used to pull the objective value out of the training logs
    metric_definitions=[{"Name": "test:RMSE", "Regex": "test:RMSE: ([0-9\\.]+)"}],
    objective_type="Minimize",
    max_jobs=10,
    max_parallel_jobs=2,
    base_tuning_job_name=training_job_name,
)
tuner.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating hyperparameter tuning job with name: deepar-hyperparamete-241001-0503


........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [8]:
# Name of the tuning job to monitor, pinned as a literal so the monitoring cells
# can look the job up without re-running the tuner cell.
tuning_job_name = "deepar-hyperparamete-241001-0503"

In [12]:
# Fetch the current description of the tuning job and summarise its progress.
from pprint import pprint

tuning_job_result = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)

# Warn when the job is still running (or ended in any state other than "Completed").
status = tuning_job_result["HyperParameterTuningJobStatus"]
if not status == "Completed":
    print("Reminder: the tuning job has not been completed.")

# Number of training jobs that have finished so far.
job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print(f"{job_count:d} training jobs have completed")

# Record the objective's direction and metric name; any type other than "Maximize"
# is treated as a minimisation objective.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
objective_name = objective["MetricName"]
is_minimize = not (objective["Type"] == "Maximize")

# Dump the complete job description for reference.
pprint(tuning_job_result)

Reminder: the tuning job has not been completed.
7 training jobs have completed
{'BestTrainingJob': {'CreationTime': datetime.datetime(2024, 10, 1, 5, 3, 39, tzinfo=tzlocal()),
                     'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'test:RMSE',
                                                                     'Value': 4681.43701171875},
                     'ObjectiveStatus': 'Succeeded',
                     'TrainingEndTime': datetime.datetime(2024, 10, 1, 5, 30, 50, tzinfo=tzlocal()),
                     'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:053585949834:training-job/deepar-hyperparamete-241001-0503-001-918206fd',
                     'TrainingJobName': 'deepar-hyperparamete-241001-0503-001-918206fd',
                     'TrainingJobStatus': 'Completed',
                     'TrainingStartTime': datetime.datetime(2024, 10, 1, 5, 4, 24, tzinfo=tzlocal()),
                     'TunedHyperParameters': {'context_length': '3',
                   

In [6]:
# Report the best training job recorded in the tuning-job description, if there is one.
if not tuning_job_result.get("BestTrainingJob"):
    print("No training jobs have reported results yet.")
else:
    print("Best model found so far:")
    pprint(tuning_job_result["BestTrainingJob"])

Best model found so far:
{'CreationTime': datetime.datetime(2024, 10, 1, 5, 3, 39, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'test:RMSE',
                                                 'Value': 4681.43701171875},
 'ObjectiveStatus': 'Succeeded',
 'TrainingEndTime': datetime.datetime(2024, 10, 1, 5, 30, 50, tzinfo=tzlocal()),
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:053585949834:training-job/deepar-hyperparamete-241001-0503-001-918206fd',
 'TrainingJobName': 'deepar-hyperparamete-241001-0503-001-918206fd',
 'TrainingJobStatus': 'Completed',
 'TrainingStartTime': datetime.datetime(2024, 10, 1, 5, 4, 24, tzinfo=tzlocal()),
 'TunedHyperParameters': {'context_length': '3',
                          'epochs': '101',
                          'learning_rate': '0.0006294407061415784',
                          'mini_batch_size': '1024',
                          'num_cells': '97'}}


In [9]:
# Load per-training-job results of the tuning job into a DataFrame and rank them by
# objective value (best first, according to `is_minimize`).
tuner = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

full_df = tuner.dataframe()

# Default to the unfiltered frame so `df` is always defined: previously the final
# `df` display raised NameError when the tuning job had no training jobs yet.
df = full_df

if len(full_df) > 0:
    # Keep only jobs whose objective is a real value (NaN and -inf fail this comparison).
    df = full_df[full_df["FinalObjectiveValue"] > -float("inf")]
    if len(df) > 0:
        df = df.sort_values("FinalObjectiveValue", ascending=is_minimize)
        print("Number of training jobs with valid objective: %d" % len(df))
        print({"lowest": min(df["FinalObjectiveValue"]), "highest": max(df["FinalObjectiveValue"])})
        pd.set_option("display.max_colwidth", None)  # Don't truncate TrainingJobName
    else:
        print("No training jobs have reported valid results yet.")
else:
    print("No training jobs found for this tuning job yet.")

df

Number of training jobs with valid objective: 7
{'lowest': 4681.43701171875, 'highest': 9720.05859375}


Unnamed: 0,context_length,epochs,learning_rate,mini_batch_size,num_cells,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
9,3.0,101.0,0.000629,1024.0,97.0,deepar-hyperparamete-241001-0503-001-918206fd,Completed,4681.437012,2024-10-01 05:04:24+00:00,2024-10-01 05:30:50+00:00,1586.0
6,2.0,223.0,0.006197,819.0,146.0,deepar-hyperparamete-241001-0503-004-d1d4c01b,Completed,4798.756836,2024-10-01 05:43:53+00:00,2024-10-01 06:24:11+00:00,2418.0
7,1.0,819.0,0.001059,380.0,182.0,deepar-hyperparamete-241001-0503-003-215f7d5a,Completed,4963.909668,2024-10-01 05:33:24+00:00,2024-10-01 06:49:46+00:00,4582.0
8,3.0,542.0,0.010141,284.0,102.0,deepar-hyperparamete-241001-0503-002-7c75891b,Completed,5310.428223,2024-10-01 05:04:29+00:00,2024-10-01 05:43:34+00:00,2345.0
3,6.0,759.0,0.000242,1028.0,34.0,deepar-hyperparamete-241001-0503-007-0525f32c,Completed,5316.124023,2024-10-01 06:50:07+00:00,2024-10-01 09:07:54+00:00,8267.0
5,1.0,426.0,0.001845,184.0,52.0,deepar-hyperparamete-241001-0503-005-4e1dbda4,Completed,5653.986816,2024-10-01 06:26:11+00:00,2024-10-01 06:41:45+00:00,934.0
1,1.0,154.0,1e-05,929.0,78.0,deepar-hyperparamete-241001-0503-009-274cf983,Completed,9720.058594,2024-10-01 09:11:21+00:00,2024-10-01 09:38:16+00:00,1615.0
