In [1]:
import boto3
import botocore.exceptions
import dateutil.parser
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sagemaker
import sys
import time

from datetime import datetime
from sagemaker import get_execution_role

%matplotlib inline
plt.style.use('Solarize_Light2')
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

In [2]:
BUCKET = sagemaker.Session().default_bucket()
PREFIX = 'Hanoi_retail'
region = 'eu-west-1'

session = boto3.Session(region_name=region) 
forecast = session.client(service_name='forecast') 
forecastquery = session.client(service_name='forecastquery')
role = get_execution_role()

In [3]:
# Paths to local data:
DATA = 'data'
PROCESSED_DATA = 'data/processed'

os.makedirs(PROCESSED_DATA, exist_ok=True)

In [32]:
hourly_df = pd.read_csv(os.path.join(DATA, 'forecast.csv'), header=None, parse_dates=[0])
hourly_df.columns = ['timestamp','item_id','qty']
hourly_df.head()

Unnamed: 0,timestamp,item_id,qty
0,2019-01-01,271554,2
1,2019-01-01,495366,1
2,2019-01-02,495360,1
3,2019-01-03,525764,165
4,2019-01-05,517242,1


In [33]:
print(np.min(hourly_df['timestamp']))
print(np.max(hourly_df['timestamp']))

2019-01-01 00:00:00
2019-12-31 00:00:00


In [41]:
training_start = pd.to_datetime('2019-01-01')
training_end   = pd.to_datetime('2019-07-28')
testing_start  = pd.to_datetime('2019-07-29')
testing_end    = pd.to_datetime('2019-12-31')

In [42]:
hourly_df

Unnamed: 0,timestamp,item_id,qty
0,2019-01-01,271554,2
1,2019-01-01,495366,1
2,2019-01-02,495360,1
3,2019-01-03,525764,165
4,2019-01-05,517242,1
...,...,...,...
15013,2019-12-15,271554,1
15014,2019-12-17,395581,1
15015,2019-12-24,271554,1
15016,2019-12-25,269765,1


In [43]:
training_df = hourly_df[(hourly_df['timestamp'] >=training_start) & (hourly_df['timestamp'] < training_end)]
testing_df= hourly_df[(hourly_df['timestamp'] >=testing_start) & (hourly_df['timestamp'] < testing_end)]

In [44]:
training_df.to_csv(f"{PROCESSED_DATA}/train.csv", header=False, index=False)
testing_df.to_csv(f"{PROCESSED_DATA}/test.csv", header=False, index=False)

In [45]:

!ls -lh $PROCESSED_DATA/

total 296K
-rw-rw-r-- 1 ec2-user ec2-user  54K May 12 14:25 test.csv
-rw-rw-r-- 1 ec2-user ec2-user 239K May 12 14:25 train.csv


In [46]:
KEY = f'{PREFIX}/train.csv'
boto3.Session().resource('s3').Bucket(BUCKET).Object(KEY).upload_file(f'{PROCESSED_DATA}/train.csv')

In [47]:
DATASET_FREQUENCY = "D" 
TIMESTAMP_FORMAT = "yyyy-MM-dd"

In [48]:
project = 'Hanoi_retail_forecast'
datasetName = project + '_ds'
datasetGroupName = project + '_dsg'
s3DataPath = "s3://" + BUCKET + "/" + KEY

In [49]:
%%time

# Let's try to create a dataset group:
try:
    create_dataset_group_response = forecast.create_dataset_group(
        DatasetGroupName=datasetGroupName,
        Domain="CUSTOM",
    )
    datasetGroupArn = create_dataset_group_response['DatasetGroupArn']
    
except Exception as e:
    error_code = e.response['Error']['Code']
    
    # If the dataset group already exists, we get its ARN:
    if (error_code == 'ResourceAlreadyExistsException'):
        print('A dataset group with this name already exists, you can use it to create and ingest new datasets')
        
        # List all the existing dataset groups:
        forecast_dsg_list = forecast.list_dataset_groups()

        # Loop through all the Forecast dataset groups:
        for dsg in forecast_dsg_list['DatasetGroups']:
            # Get the project name (the string after the first delimiter in the ARN)
            dsg_name = dsg['DatasetGroupArn'].split('/')[1]

            # Once we find it, we store the ARN and break out of the loop:
            if (dsg_name == datasetGroupName):
                datasetGroupArn = dsg['DatasetGroupArn']
                break
                
    else:
        raise
        
print(f'- Dataset group name: {datasetGroupName}')
print(f'- Dataset group ARN: {datasetGroupArn}')

- Dataset group name: Hanoi_retail_forecast_dsg
- Dataset group ARN: arn:aws:forecast:eu-west-1:800285456211:dataset-group/Hanoi_retail_forecast_dsg
CPU times: user 13.2 ms, sys: 0 ns, total: 13.2 ms
Wall time: 80.6 ms


In [50]:
forecast.describe_dataset_group(DatasetGroupArn=datasetGroupArn)

{'DatasetGroupName': 'Hanoi_retail_forecast_dsg',
 'DatasetGroupArn': 'arn:aws:forecast:eu-west-1:800285456211:dataset-group/Hanoi_retail_forecast_dsg',
 'DatasetArns': [],
 'Domain': 'CUSTOM',
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2021, 5, 12, 14, 27, 14, 921000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2021, 5, 12, 14, 27, 14, 921000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'eec3ae50-6806-464c-b352-4e7f62d4facd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 12 May 2021 14:27:20 GMT',
   'x-amzn-requestid': 'eec3ae50-6806-464c-b352-4e7f62d4facd',
   'content-length': '273',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [51]:
schema = {
   "Attributes": [
      { "AttributeName": "timestamp",    "AttributeType": "timestamp" },
      { "AttributeName": "item_id",      "AttributeType": "string"    },
        { "AttributeName": "target_value", "AttributeType": "float"     }
   ]
}

In [52]:
training_df.head()

Unnamed: 0,timestamp,item_id,qty
0,2019-01-01,271554,2
1,2019-01-01,495366,1
2,2019-01-02,495360,1
3,2019-01-03,525764,165
4,2019-01-05,517242,1


In [53]:
%%time 

try:
    response = forecast.create_dataset(
        Domain='CUSTOM',
        DatasetType='TARGET_TIME_SERIES',
        DatasetName=datasetName,
        DataFrequency=DATASET_FREQUENCY, 
        Schema=schema
    )
    datasetArn = response['DatasetArn']
    
except Exception as e:
    error_code = e.response['Error']['Code']

    # If the dataset group already exists, we get its ARN:
    if (error_code == 'ResourceAlreadyExistsException'):
        print('A dataset with this name already exists, you can use it to ingest new data into it:')
        
        # List all the existing datasets:
        forecast_ds_list = forecast.list_datasets()

        # Loop through all the Forecast datasets:
        for ds in forecast_ds_list['Datasets']:
            # Get the project name (the string after the first delimiter in the ARN)
            ds_name = ds['DatasetArn'].split('/')[1]

            # Once we find it, we store the ARN and break out of the loop:
            if (ds_name == datasetName):
                datasetArn = ds['DatasetArn']
                break
                
    else:
        raise
        
print(f'- Dataset name: {datasetName}')
print(f'- Dataset ARN: {datasetArn}')

- Dataset name: Hanoi_retail_forecast_ds
- Dataset ARN: arn:aws:forecast:eu-west-1:800285456211:dataset/Hanoi_retail_forecast_ds
CPU times: user 4.9 ms, sys: 0 ns, total: 4.9 ms
Wall time: 58.6 ms


In [54]:
forecast.describe_dataset(DatasetArn=datasetArn)

{'DatasetArn': 'arn:aws:forecast:eu-west-1:800285456211:dataset/Hanoi_retail_forecast_ds',
 'DatasetName': 'Hanoi_retail_forecast_ds',
 'Domain': 'CUSTOM',
 'DatasetType': 'TARGET_TIME_SERIES',
 'DataFrequency': 'D',
 'Schema': {'Attributes': [{'AttributeName': 'timestamp',
    'AttributeType': 'timestamp'},
   {'AttributeName': 'item_id', 'AttributeType': 'string'},
   {'AttributeName': 'target_value', 'AttributeType': 'float'}]},
 'EncryptionConfig': {},
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2021, 5, 12, 14, 28, 1, 132000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2021, 5, 12, 14, 28, 1, 132000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '2f594310-3a01-4e47-ae21-c856c0775d55',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 12 May 2021 14:28:29 GMT',
   'x-amzn-requestid': '2f594310-3a01-4e47-ae21-c856c0775d55',
   'content-length': '509',
   'connection': 'keep-alive'},
  'Retr

In [55]:
forecast.update_dataset_group(DatasetGroupArn=datasetGroupArn, DatasetArns=[datasetArn])

{'ResponseMetadata': {'RequestId': '1d12445d-b671-405a-a557-5fc48e7d6d97',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 12 May 2021 14:28:36 GMT',
   'x-amzn-requestid': '1d12445d-b671-405a-a557-5fc48e7d6d97',
   'content-length': '2',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [56]:
def get_or_create_iam_role(role_name):
    iam = boto3.client("iam")

    assume_role_policy_document = {
        "Version": "2012-10-17",
        "Statement": [
            {
              "Effect": "Allow",
              "Principal": {
                "Service": "forecast.amazonaws.com"
              },
              "Action": "sts:AssumeRole"
            }
        ]
    }

    try:
        create_role_response = iam.create_role(
            RoleName = role_name,
            AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
        )
        role_arn = create_role_response["Role"]["Arn"]
        print("Created", role_arn)
        
    except iam.exceptions.EntityAlreadyExistsException:
        print("The role " + role_name + " exists, ignore to create it")
        role_arn = boto3.resource('iam').Role(role_name).arn

    print("Attaching policies")

    iam.attach_role_policy(
        RoleName = role_name,
        PolicyArn = "arn:aws:iam::aws:policy/AmazonForecastFullAccess"
    )

    iam.attach_role_policy(
        RoleName=role_name,
        PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
    )

    print("Waiting for a minute to allow IAM role policy attachment to propagate")
    time.sleep(60)

    print("Done.")
    return role_arn

In [57]:
# Create the role to provide to Amazon Forecast.
role_name = "ForecastNotebook-RetailPrediction"
role_arn = get_or_create_iam_role(role_name=role_name)

The role ForecastNotebook-RetailPrediction exists, ignore to create it
Attaching policies
Waiting for a minute to allow IAM role policy attachment to propagate
Done.


In [58]:
datasetImportJobName = 'Hanoi_retail_dataset_import_job'
ds_import_job_response = forecast.create_dataset_import_job(
    DatasetImportJobName=datasetImportJobName,
    DatasetArn=datasetArn,
    DataSource= {
        "S3Config" : {
            "Path": s3DataPath,
            "RoleArn": role_arn
        }
    },
    TimestampFormat=TIMESTAMP_FORMAT
)

In [59]:
ds_import_job_arn=ds_import_job_response['DatasetImportJobArn']
print(ds_import_job_arn)

arn:aws:forecast:eu-west-1:800285456211:dataset-import-job/Hanoi_retail_forecast_ds/Hanoi_retail_dataset_import_job


In [60]:
while True:
    status = forecast.describe_dataset_import_job(DatasetImportJobArn=ds_import_job_arn)['Status']
    print(str(pd.to_datetime(datetime.now()))[:19], "| Data ingestion:", status)
    
    if status in ('ACTIVE', 'CREATE_FAILED'): break
    time.sleep(60)

print(status)

2021-05-12 14:30:44 | Data ingestion: CREATE_PENDING
2021-05-12 14:31:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:32:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:33:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:34:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:35:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:36:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:37:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:38:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:39:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:40:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:41:45 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:42:46 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:43:46 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:44:46 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:45:46 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:46:46 | Data ingestion: CREATE_IN_PROGRESS
2021-05-12 14:47:46 | Data ingestio

In [61]:
forecast.describe_dataset_import_job(DatasetImportJobArn=ds_import_job_arn)

{'DatasetImportJobName': 'Hanoi_retail_dataset_import_job',
 'DatasetImportJobArn': 'arn:aws:forecast:eu-west-1:800285456211:dataset-import-job/Hanoi_retail_forecast_ds/Hanoi_retail_dataset_import_job',
 'DatasetArn': 'arn:aws:forecast:eu-west-1:800285456211:dataset/Hanoi_retail_forecast_ds',
 'TimestampFormat': 'yyyy-MM-dd',
 'UseGeolocationForTimeZone': False,
 'DataSource': {'S3Config': {'Path': 's3://sagemaker-eu-west-1-800285456211/Hanoi_retail/train.csv',
   'RoleArn': 'arn:aws:iam::800285456211:role/ForecastNotebook-RetailPrediction'}},
 'FieldStatistics': {'item_id': {'Count': 12212,
   'CountDistinct': 15,
   'CountNull': 0},
  'target_value': {'Count': 12212,
   'CountDistinct': 44,
   'CountNull': 0,
   'CountNan': 0,
   'Min': '1.0',
   'Max': '1390.0',
   'Avg': 1.756550933508025,
   'Stddev': 17.986180251417164},
  'timestamp': {'Count': 12212,
   'CountDistinct': 206,
   'CountNull': 0,
   'Min': '2019-01-01T00:00:00Z',
   'Max': '2019-07-27T00:00:00Z'}},
 'DataSize': 0.

In [62]:
len(training_df['item_id'].unique())

15

In [64]:
predictorName= project + '_arima'
forecastHorizon = 22
algorithmArn = 'arn:aws:forecast:::algorithm/ARIMA'

In [65]:
try:
    create_predictor_response = forecast.create_predictor(
        PredictorName = predictorName, 
        AlgorithmArn = algorithmArn,
        ForecastHorizon = forecastHorizon,
        PerformAutoML = False,
        PerformHPO = False,
        EvaluationParameters= {
            "NumberOfBacktestWindows": 1, 
            "BackTestWindowOffset": 24
        }, 
        InputDataConfig = {"DatasetGroupArn": datasetGroupArn},
        FeaturizationConfig = {
            "ForecastFrequency": "W", 
            "Featurizations": [{
                "AttributeName": "target_value", 
                "FeaturizationPipeline": [{
                    "FeaturizationMethodName": "filling", 
                    "FeaturizationMethodParameters": {
                        "frontfill": "none", 
                        "middlefill": "zero", 
                        "backfill": "zero"
                    }
                }]
            }]
        }
    )

    predictor_arn = create_predictor_response['PredictorArn']
    
except Exception as e:
    error_code = e.response['Error']['Code']

    # If the predictor already exists, we get its ARN:
    if (error_code == 'ResourceAlreadyExistsException'):
        print('A predictor with this name already exists, you can query it to check its status or request a forecast:')
        
        # List all the existing predictors:
        forecast_pred_list = forecast.list_predictors()

        # Loop through all the Forecast predictors:
        for predictor in forecast_pred_list['Predictors']:
            # Get the project name (the string after the first delimiter in the ARN)
            predictor_name = predictor['PredictorArn'].split('/')[1]

            # Once we find it, we store the ARN and break out of the loop:
            if (predictor_name == predictorName):
                predictor_arn = predictor['PredictorArn']
                break
                   
    else:
        raise
        
print(f'- Predictor name: {predictorName}')
print(f'- Predictor ARN: {predictor_arn}')

- Predictor name: Hanoi_retail_forecast_arima
- Predictor ARN: arn:aws:forecast:eu-west-1:800285456211:predictor/Hanoi_retail_forecast_arima


In [66]:
while True:
    status = forecast.describe_predictor(PredictorArn=predictor_arn)['Status']
    print(str(pd.to_datetime(datetime.now()))[:19], "| Model training:", status)
    
    if status in ('ACTIVE', 'CREATE_FAILED'): break
    time.sleep(60)

print(status)

2021-05-12 14:53:03 | Model training: CREATE_PENDING
2021-05-12 14:54:03 | Model training: CREATE_IN_PROGRESS
2021-05-12 14:55:03 | Model training: CREATE_IN_PROGRESS
2021-05-12 14:56:03 | Model training: CREATE_IN_PROGRESS
2021-05-12 14:57:03 | Model training: CREATE_IN_PROGRESS
2021-05-12 14:58:04 | Model training: CREATE_IN_PROGRESS
2021-05-12 14:59:04 | Model training: CREATE_IN_PROGRESS
2021-05-12 15:00:04 | Model training: CREATE_IN_PROGRESS
2021-05-12 15:01:04 | Model training: CREATE_IN_PROGRESS
2021-05-12 15:02:04 | Model training: CREATE_IN_PROGRESS
2021-05-12 15:03:04 | Model training: CREATE_IN_PROGRESS
2021-05-12 15:04:04 | Model training: CREATE_IN_PROGRESS
2021-05-12 15:05:04 | Model training: ACTIVE
ACTIVE


In [67]:
forecast.get_accuracy_metrics(PredictorArn=predictor_arn)

{'PredictorEvaluationResults': [{'AlgorithmArn': 'arn:aws:forecast:::algorithm/ARIMA',
   'TestWindows': [{'EvaluationType': 'SUMMARY',
     'Metrics': {'RMSE': 133.1655870299217,
      'WeightedQuantileLosses': [{'Quantile': 0.9,
        'LossValue': 0.7514790056270116},
       {'Quantile': 0.5, 'LossValue': 1.5957938525538167},
       {'Quantile': 0.1, 'LossValue': 0.33914930485405254}],
      'ErrorMetrics': [{'ForecastType': 'mean',
        'WAPE': 1.5957938525538167,
        'RMSE': 133.1655870299217}]}},
    {'TestWindowStart': datetime.datetime(2019, 2, 16, 0, 0, tzinfo=tzlocal()),
     'TestWindowEnd': datetime.datetime(2019, 7, 20, 0, 0, tzinfo=tzlocal()),
     'ItemCount': 15,
     'EvaluationType': 'COMPUTED',
     'Metrics': {'RMSE': 133.1655870299217,
      'WeightedQuantileLosses': [{'Quantile': 0.9,
        'LossValue': 0.7514790056270115},
       {'Quantile': 0.5, 'LossValue': 1.5957938525538164},
       {'Quantile': 0.1, 'LossValue': 0.33914930485405254}],
      'Error

In [68]:
forecastName = project + '_arima_forecast'

In [69]:
try:
    create_forecast_response = forecast.create_forecast(
        ForecastName=forecastName,
        PredictorArn=predictor_arn
    )
    forecast_arn = create_forecast_response['ForecastArn']
    
except Exception as e:
    error_code = e.response['Error']['Code']

    # If the predictor already exists, we get its ARN:
    if (error_code == 'ResourceAlreadyExistsException'):
        print('A forecast with this name already exists, you can use it to obtain a prediction:')
        
        # List all the existing predictors:
        forecast_fc_list = forecast.list_forecasts()

        # Loop through all the Forecast forecasts:
        for fc in forecast_fc_list['Forecasts']:
            # Get the project name (the string after the first delimiter in the ARN)
            fc_name = fc['ForecastArn'].split('/')[1]

            # Once we find it, we store the ARN and break out of the loop:
            if (fc_name == forecastName):
                forecast_arn = fc['ForecastArn']
                break
                
    else:
        raise
        
print(f'- Forecast name: {forecastName}')
print(f'- Forecast ARN: {forecast_arn}')

- Forecast name: Hanoi_retail_forecast_arima_forecast
- Forecast ARN: arn:aws:forecast:eu-west-1:800285456211:forecast/Hanoi_retail_forecast_arima_forecast


In [70]:
while True:
    status = forecast.describe_forecast(ForecastArn=forecast_arn)['Status']
    print(str(pd.to_datetime(datetime.now()))[:19], "| Forecast generation:", status)
    
    if status in ('ACTIVE', 'CREATE_FAILED'): break
    time.sleep(60)

print(status)

2021-05-12 15:06:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:07:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:08:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:09:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:10:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:11:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:12:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:13:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:14:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:15:38 | Forecast generation: CREATE_IN_PROGRESS
2021-05-12 15:16:38 | Forecast generation: ACTIVE
ACTIVE


In [71]:
shop_id = '495366'

print(forecast_arn)
print()
forecastResponse = forecastquery.query_forecast(
    ForecastArn=forecast_arn,
    Filters={'item_id': shop_id}
)
print(forecastResponse)

arn:aws:forecast:eu-west-1:800285456211:forecast/Hanoi_retail_forecast_arima_forecast

{'Forecast': {'Predictions': {'p10': [{'Timestamp': '2019-07-29T00:00:00', 'Value': 49.127620697021484}, {'Timestamp': '2019-08-05T00:00:00', 'Value': 48.49508285522461}, {'Timestamp': '2019-08-12T00:00:00', 'Value': 47.87256622314453}, {'Timestamp': '2019-08-19T00:00:00', 'Value': 47.25960922241211}, {'Timestamp': '2019-08-26T00:00:00', 'Value': 46.65578079223633}, {'Timestamp': '2019-09-02T00:00:00', 'Value': 46.06068801879883}, {'Timestamp': '2019-09-09T00:00:00', 'Value': 45.47396469116211}, {'Timestamp': '2019-09-16T00:00:00', 'Value': 44.895259857177734}, {'Timestamp': '2019-09-23T00:00:00', 'Value': 44.32426071166992}, {'Timestamp': '2019-09-30T00:00:00', 'Value': 43.76066589355469}, {'Timestamp': '2019-10-07T00:00:00', 'Value': 43.204193115234375}, {'Timestamp': '2019-10-14T00:00:00', 'Value': 42.654579162597656}, {'Timestamp': '2019-10-21T00:00:00', 'Value': 42.111576080322266}, {'Timestamp'