#  First let us set up Amazon Forecast<a class="anchor" id="setup"></a>

This section sets up the permissions and relevant endpoints.

In [None]:
# Enable IPython's autoreload extension (mode 2) so imported modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# NOTE(review): `plt` and `pd` are used throughout this notebook but are not
# imported explicitly here; presumably they come from this star import — confirm.
from util.fcst_utils import *
from util.hint import hint
import warnings, boto3, s3fs, json

# Default size for every matplotlib figure created afterwards.
plt.rcParams['figure.figsize'] = (15.0, 5.0)
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

Now we will create a boto3 session. Boto3 allows us to interact with all of the AWS services via Python.
The region for the session defaults to the region configured for your account. You can choose any of the 6 regions where the Forecast service is available.

Once we have the boto3 session, we can create the clients for forecast and forecast query.

In [None]:
# Create the boto3 session from the default credential/region configuration.
# A region can be pinned explicitly, e.g. boto3.Session(region_name='us-west-2').
session = boto3.Session()

# Clients for the Forecast API and the Forecast query API.
forecast = session.client(service_name='forecast')
forecast_query = session.client(service_name='forecastquery')

We also need the role arn when creating various forecast entities, the unique s3 bucket where all our forecast data will be stored and the project prefix for all the forecast entities we will be creating.

In [None]:
# IAM role ARN that is handed to Forecast for the S3 import and export jobs.
role_arn = get_or_create_role_arn()
# Account id of the current caller, looked up through STS.
account_id = (
    session.client('sts')
    .get_caller_identity()
    .get('Account')
)
# Prefix for every Forecast resource created below (no digits, symbols, etc.).
project = "forecastKE"

# Replace the placeholder with your own bucket, e.g. bucket_name="forecast-kysim".
bucket_name = "<your-bucket-name>"
s3_data_path = f"s3://{bucket_name}/data"

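Finally, we need the parameters that we will pass to the Forecast service that determine how to process the time series data. This includes the data frequency, the forecast horizon (number of time steps to predict), and the timestamp format.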

In [None]:
timeseries_frequency = "D" # daily frequency data
forecast_horizon = 10  # forecast 10 time steps (days at "D" frequency) into the future
timestamp_format = "yyyy-MM-dd" # timestamp format (date only, no time of day)

# Step 1: Preparing the Datasets<a class="anchor" id="prepare">

In [None]:
# Load the raw CSV, keeping every column as object dtype; the columns that
# need numeric/datetime types are converted explicitly afterwards.
df = pd.read_csv(
    "../data/kcc demo.csv",
    dtype=object,
)
df.head()

In [None]:
# Integer copy of `actual_pax`, stored under the name `count`.
df['count'] = df['actual_pax'].astype(dtype='int')
# Convert the `hol_yn` flag from string to integer in place.
df['hol_yn'] = df['hol_yn'].astype(dtype='int')
# Parse `op_date` into real timestamps.
df['datetime'] = pd.to_datetime(df['op_date'])

In [None]:
# Keep only the rows of the single item we want to forecast.
df = df.loc[df['item_id'].eq('KE0023YC')]
df.head()

We take the last 408 rows (`2*7*24 + 24*3`) of the daily data for demonstration, chosen so that there is no missing data in the whole range.

In [None]:
# Keep the last 408 rows (2*7*24 + 24*3) of the filtered frame.
# `.copy()` makes df_small an independent frame, so the column assignment
# below modifies df_small itself rather than a slice of `df` (which is the
# chained-assignment pattern pandas warns about).
df_small = df[-2*7*24-24*3:].copy()
df_small['item_id'] = "KE0023YC"

Let us plot the time series first.

In [None]:
# Line plot of the target value over time.
df_small.plot.line(x='datetime', y='count', figsize=(15, 8))

We can see that the target time series seems to drop over weekends. Next, let's plot both the target time series and the related time series that indicates whether today is a `workday` or not. More precisely, $r_t = 1$ if $t$ is a work day and 0 if not.

In [None]:
# Draw the target series on an explicitly created figure/axes pair.
ax = plt.figure(figsize=(15, 8)).gca()
df_small.plot(x='datetime', y='count', ax=ax);
# The overlay of the related series on a twin axis is currently disabled:
#ax2 = ax.twinx()
#df_small.plot(x='datetime', y='hol_yn', color='red', ax=ax2);

In [None]:
# Target series: every row except the last `forecast_horizon` ones, which are
# held out (presumably so forecasts can later be compared with actual values).
target_df = df_small[['item_id', 'datetime', 'count']][:-forecast_horizon]
# Related series: spans the full range, i.e. history plus the horizon.
# It is defined here because the validation cells that follow read
# `related_df`; with the definition commented out they raise NameError.
# NOTE(review): the related CSV export and its dataset-group registration are
# still commented out in this notebook, so this frame is only used for checks.
related_df = df_small[['item_id', 'datetime', 'hol_yn']]

In [None]:
# Preview the first five rows of the target series.
target_df.head(n=5)

As we can see, the length of the related time series is equal to the length of the target time series plus the forecast horizon. 

In [None]:
# The related series must extend exactly `forecast_horizon` steps beyond the
# target series; use the shared parameter instead of a hard-coded 10 so the
# check stays correct if the horizon is changed.
print(len(target_df), len(related_df))
assert len(target_df) + forecast_horizon == len(related_df), "length doesn't match"

Next we check whether there are "holes" in the related time series.  

In [None]:
# A gap-free series has exactly as many rows as there are periods between its
# first and last timestamp at the configured frequency.
assert len(pd.date_range(
    start=related_df['datetime'].iloc[0],
    end=related_df['datetime'].iloc[-1],
    freq=timeseries_frequency,
)) == len(related_df), "missing entries in the related time series"

Everything looks fine, and we plot both time series again. As it can be seen, the related time series (indicator of whether the current day is a workday or not) is longer than the target time series.  The binary working day indicator feature is a good example of a related time series, since it is known at all future time points.  Other examples of related time series include holiday and promotion features.

In [None]:
# Plot the (shortened) target series on its own figure.
ax = plt.figure(figsize=(15, 10)).gca()
target_df.plot(x='datetime', y='count', ax=ax);
# The overlay of the related series on a twin axis is currently disabled:
#ax2 = ax.twinx()
#related_df.plot(x='datetime', y='hol_yn', color='red', ax=ax2);

In [None]:
# Write the target series without index or header row; the column order
# (item_id, datetime, count) is what ends up in the file.
target_df.to_csv(
    "../data/ke0023yc_target.csv",
    header=False,
    index=False,
)
# Export of the related series is currently disabled:
#related_df.to_csv("../data/ke0023yc_related.csv", index= False, header = False)

If the bucket named in `bucket_name` does not exist yet, create it first on S3 (for example by uncommenting the `aws s3 mb` command below).

In [None]:
# sync data to s3
# '!' is used to make calls to the os shell
# where we then use the aws command line
# Uncomment the next line to create the bucket first if it does not exist yet.
#!aws s3 mb s3://$bucket_name
# Uploads the whole local ../data directory to the S3 data path.
!aws s3 sync ../data $s3_data_path

## Step 2a. Creating a Dataset Group<a class="anchor" id="create">
First let's create a dataset group and then update it later to add our datasets. Since this is a **RETAIL** use case, we will specify that as the domain.

In [None]:
# Name of the dataset group, derived from the project prefix.
dataset_group = f"{project}_dataset_group"
print(dataset_group)

# Create an empty RETAIL-domain dataset group; datasets are attached later.
create_dataset_group_response = forecast.create_dataset_group(
    DatasetGroupName=dataset_group,
    Domain="RETAIL",
    DatasetArns=[],
)
dataset_group_arn = create_dataset_group_response['DatasetGroupArn']

# Show the freshly created group.
forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

## Step 2b. Creating a Target Dataset<a class="anchor" id="target">
Now we will define a target time series. This is a required dataset to use the service. For this example, the number of items sold, or `demand`, is the target value we will be forecasting. 
  
First, we specify the name and schema of our dataset. Make sure the order of the attributes (columns) matches the raw data in the files. We follow the same three attribute format as the above example.

In [None]:
# Name and schema of the target dataset. The attribute order must match the
# column order of the CSV file that is imported.
name = f"{project}_target_dataset"
schema = {
    "Attributes": [
        {"AttributeName": attr_name, "AttributeType": attr_type}
        for attr_name, attr_type in (
            ("item_id", "string"),
            ("timestamp", "timestamp"),
            ("demand", "float"),
        )
    ]
}

Now that we have a schema and name we can create the target data set. This only sets up the definition of the dataset. No data has been imported to Forecast yet. Data import will happen later when we create the import jobs.

In [None]:
# Register the target time series dataset definition (no data is imported yet).
response = forecast.create_dataset(
    DatasetName=name,
    DatasetType='TARGET_TIME_SERIES',
    Domain="RETAIL",
    DataFrequency=timeseries_frequency,
    Schema=schema,
)

target_dataset_arn = response['DatasetArn']
forecast.describe_dataset(DatasetArn=target_dataset_arn)

## Step 2c. Creating a Related Dataset<a class="anchor" id="related">
Now we will create a related time series dataset using the related price data for the items. The method call is very similar to the above, except you will have `price` instead of `demand`.

In [None]:
# Name and schema of the related time series dataset; same layout as the
# target schema, with `price` as the third attribute.
name = f"{project}_related_dataset"
schema = {
    "Attributes": [
        {"AttributeName": attr_name, "AttributeType": attr_type}
        for attr_name, attr_type in (
            ("item_id", "string"),
            ("timestamp", "timestamp"),
            ("price", "float"),
        )
    ]
}

# Register the related dataset definition (no data is imported yet).
response = forecast.create_dataset(
    DatasetName=name,
    DatasetType='RELATED_TIME_SERIES',
    Domain="RETAIL",
    DataFrequency=timeseries_frequency,
    Schema=schema,
)

related_dataset_arn = response['DatasetArn']
forecast.describe_dataset(DatasetArn=related_dataset_arn)

## Step 2d. Updating the dataset group with the datasets we created<a class="anchor" id="update">
You can have multiple datasets under the same dataset group. Update it with the datasets we created before.

In [None]:
# Only the target dataset is attached; the related dataset is left out
# because of the non-daily flight issue.
#dataset_arns = [target_dataset_arn, related_dataset_arn]
dataset_arns = [target_dataset_arn]
forecast.update_dataset_group(
    DatasetArns=dataset_arns,
    DatasetGroupArn=dataset_group_arn,
)
forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

## Step 2e. Creating a Target Time Series Dataset Import Job<a class="anchor" id="targetImport">
    
Now that we have defined the target time series, we still need to create an import job to actually load the data into Amazon Forecast from S3.

In [None]:
# S3 location of the target CSV that was synced earlier.
s3_path = f"{s3_data_path}/ke0023yc_target.csv"

# Start the import of the target CSV into the target dataset; Forecast reads
# the file using the given role.
response = forecast.create_dataset_import_job(
    DatasetImportJobName=dataset_group,
    DatasetArn=target_dataset_arn,
    DataSource={"S3Config": {"Path": s3_path, "RoleArn": role_arn}},
    TimestampFormat=timestamp_format,
)

target_dataset_import_job_arn = response['DatasetImportJobArn']

## Step 2f. Creating a Related Time Series Dataset Import Job<a class="anchor" id="relatedImport">

In [None]:
# S3 location expected for the related CSV.
# NOTE(review): the cell that writes ke0023yc_related.csv is commented out in
# this notebook, so this object may not exist in S3 — confirm before running.
s3_path = f"{s3_data_path}/ke0023yc_related.csv"

# Start the import of the related CSV into the related dataset.
response = forecast.create_dataset_import_job(
    DatasetImportJobName=dataset_group,
    DatasetArn=related_dataset_arn,
    DataSource={"S3Config": {"Path": s3_path, "RoleArn": role_arn}},
    TimestampFormat=timestamp_format,
)
related_dataset_import_job_arn = response['DatasetImportJobArn']

We now need to wait for all the import jobs to finish. We will use a simple blocking `wait` method that checks the import job status.

In [None]:
# Block until the target import job finishes; the assert fails the cell if
# `wait` returns a falsy value.
assert wait(
    lambda: forecast.describe_dataset_import_job(
        DatasetImportJobArn=target_dataset_import_job_arn
    )
)
# Waiting on the related import job is currently disabled:
#assert(wait(lambda: forecast.describe_dataset_import_job(DatasetImportJobArn=related_dataset_import_job_arn)))

In [None]:
# Common ARN prefix; the algorithm name is appended to it for each predictor.
base_algorithm_arn = "arn:aws:forecast:::algorithm/"

## Step 3a. Choosing DeepAR+<a class="anchor" id="DeepAR">
    
Amazon Forecast DeepAR+ is a supervised learning algorithm for forecasting scalar (one-dimensional) time series using recurrent neural networks (RNNs). Classical forecasting methods, such as autoregressive integrated moving average (ARIMA) or exponential smoothing (ETS), fit a single model to each individual time series, and then use that model to extrapolate the time series into the future. In many applications, however, you have many similar time series across a set of cross-sectional units. These time-series groupings demand different products, server loads, and requests for web pages. In this case, it can be beneficial to train a single model jointly over all of the time series. DeepAR+ takes this approach. When your dataset contains hundreds of feature time series, the DeepAR+ algorithm outperforms the standard ARIMA and ETS methods. You can also use the trained model for generating forecasts for new time series that are similar to the ones it has been trained on.

For more on DeepAR+, see the [Amazon Forecast Documentation](https://docs.aws.amazon.com/forecast/latest/dg/aws-forecast-recipe-deeparplus.html).

In [None]:
# DeepAR+ predictor: fixed algorithm, no AutoML and no hyperparameter tuning.
algorithm_arn = f'{base_algorithm_arn}Deep_AR_Plus'
predictor_name = f'{project}_Deep_AR_Pls'

response = forecast.create_predictor(
    PredictorName=predictor_name,
    AlgorithmArn=algorithm_arn,
    ForecastHorizon=forecast_horizon,
    PerformAutoML=False,
    PerformHPO=False,
    InputDataConfig={'DatasetGroupArn': dataset_group_arn},
    FeaturizationConfig={'ForecastFrequency': timeseries_frequency},
)

predictor_arn_deep_ar = response['PredictorArn']

## Step 3b.  Choosing Prophet<a class="anchor" id="prophet">
    
Prophet is a popular local Bayesian structural time series model. The Amazon Forecast Prophet algorithm uses the Prophet class of the Python implementation of Prophet.

#### How Prophet Works
Prophet is especially useful for datasets that:

* Contain an extended time period (months or years) of detailed historical observations (hourly, daily, or weekly)
* Have multiple strong seasonalities
* Include previously known important, but irregular, events
* Have missing data points or large outliers
* Have non-linear growth trends that are approaching a limit

Prophet is an additive regression model with a piecewise linear or logistic growth curve trend. It includes a yearly seasonal component modeled using Fourier series and a weekly seasonal component modeled using dummy variables.

For more information, see the [Amazon Forecast Documentation](https://docs.aws.amazon.com/forecast/latest/dg/aws-forecast-recipe-prophet.html).

#### Prophet Hyperparameters and Related Time Series
Amazon Forecast uses the default Prophet hyperparameters. Prophet also supports related time-series as features, provided to Amazon Forecast in the related time-series CSV file.

In [None]:
# Prophet predictor: fixed algorithm, no AutoML and no hyperparameter tuning.
algorithm_arn = f'{base_algorithm_arn}Prophet'
predictor_name = f'{project}_Prophet'

response = forecast.create_predictor(
    PredictorName=predictor_name,
    AlgorithmArn=algorithm_arn,
    ForecastHorizon=forecast_horizon,
    PerformAutoML=False,
    PerformHPO=False,
    InputDataConfig={'DatasetGroupArn': dataset_group_arn},
    FeaturizationConfig={'ForecastFrequency': timeseries_frequency},
)

predictor_arn_prophet = response['PredictorArn']

### Optional: Want to try another Predictor?
If you would like to try different predictors, check out the [Amazon Forecast Documentation](https://docs.aws.amazon.com/forecast/latest/dg/aws-forecast-choosing-recipes.html), and create another predictor.

In [None]:
# ARIMA predictor: fixed algorithm, no AutoML and no hyperparameter tuning.
algorithm_arn = f'{base_algorithm_arn}ARIMA'
predictor_name = f'{project}_ARIMA'

response = forecast.create_predictor(
    PredictorName=predictor_name,
    AlgorithmArn=algorithm_arn,
    ForecastHorizon=forecast_horizon,
    PerformAutoML=False,
    PerformHPO=False,
    InputDataConfig={'DatasetGroupArn': dataset_group_arn},
    FeaturizationConfig={'ForecastFrequency': timeseries_frequency},
)

predictor_arn_arima = response['PredictorArn']

We now need to wait for all three predictors to complete training.

In [None]:
# Block on each predictor in turn (Prophet, ARIMA, DeepAR+) and describe it
# once the wait returns. Only the last describe call is shown as cell output.
wait(
    lambda: forecast.describe_predictor(PredictorArn=predictor_arn_prophet)
)
forecast.describe_predictor(PredictorArn=predictor_arn_prophet)

wait(
    lambda: forecast.describe_predictor(PredictorArn=predictor_arn_arima)
)
forecast.describe_predictor(PredictorArn=predictor_arn_arima)

wait(
    lambda: forecast.describe_predictor(PredictorArn=predictor_arn_deep_ar)
)
forecast.describe_predictor(PredictorArn=predictor_arn_deep_ar)

### Get the Error Metrics

Now that we have trained predictors, we can get the error metrics for them. 

In [None]:
# Fetch and print the accuracy metrics of each trained predictor.
error_metrics_deep_ar_plus = forecast.get_accuracy_metrics(PredictorArn=predictor_arn_deep_ar)
print(error_metrics_deep_ar_plus)

error_metrics_prophet = forecast.get_accuracy_metrics(PredictorArn=predictor_arn_prophet)
print(error_metrics_prophet)

# The ARIMA metrics are stored as `error_metrics_other`, the name a later cell
# reads; print that same variable (the original printed an undefined name).
error_metrics_other = forecast.get_accuracy_metrics(PredictorArn=predictor_arn_arima)
print(error_metrics_other)

In [None]:
def extract_summary_metrics(metric_response, predictor_name):
    """Return the weighted quantile losses of a predictor as a DataFrame.

    Only the first test window of the first evaluation result in
    ``metric_response`` is read. Each entry of its
    ``WeightedQuantileLosses`` list becomes one row, and a ``Predictor``
    column holding ``predictor_name`` is appended to label the rows.
    """
    first_result = metric_response['PredictorEvaluationResults'][0]
    first_window = first_result['TestWindows'][0]
    losses = pd.DataFrame(first_window['Metrics']['WeightedQuantileLosses'])
    losses['Predictor'] = predictor_name
    return losses

In [None]:
# One labelled quantile-loss table per predictor.
deep_ar_metrics = extract_summary_metrics(
    metric_response=error_metrics_deep_ar_plus, predictor_name="DeepAR"
)
prophet_metrics = extract_summary_metrics(
    metric_response=error_metrics_prophet, predictor_name="Prophet"
)
arima_metrics = extract_summary_metrics(
    metric_response=error_metrics_other, predictor_name="ARIMA"
)

In [None]:
# Stack the three tables, reshape to one column per predictor indexed by
# quantile, and draw a grouped bar chart of the loss values.
metrics = [deep_ar_metrics, prophet_metrics, arima_metrics]  # ARIMA is always included
# Earlier, optional inclusion of ARIMA (kept for reference):
#if predictor_arn_arima:
#    metrics.append(arima_metrics)

(
    pd.concat(metrics)
    .pivot(index='Quantile', columns='Predictor', values='LossValue')
    .plot.bar()
);

As we mentioned before, if you only have a handful of time series (in this case, only 1) with a small number of examples, the neural network models (DeepAR+) are not the best choice. Here, we clearly see that DeepAR+ behaves worse than Prophet in the case of a single time series. 

# Step 5. Creating a Forecast<a class="anchor" id="forecast">

The `create_forecast` method uses the predictor to create a forecast. In the response, you will get the Amazon Resource Name (ARN) of the forecast. You use this ARN to retrieve and export the forecast. 

In [None]:
# Create a forecast from the trained DeepAR+ predictor and keep its ARN.
forecast_name = f'{project}_deep_ar_plus'
response = forecast.create_forecast(
    PredictorArn=predictor_arn_deep_ar,
    ForecastName=forecast_name,
)

forecast_arn_deep_ar = response['ForecastArn']

Now create a forecast using the Prophet predictor and the optional predictor if you created one.

In [None]:
# Create a forecast from the trained Prophet predictor and keep its ARN.
forecast_name = f'{project}_prophet'

response = forecast.create_forecast(
    PredictorArn=predictor_arn_prophet,
    ForecastName=forecast_name,
)

forecast_arn_prophet = response['ForecastArn']

In [None]:
# Create a forecast from the trained ARIMA predictor and keep its ARN.
# The predictor ARN is stored as `predictor_arn_arima` when the ARIMA
# predictor is created; the original referenced `predictor_arn_other`,
# which is never defined.
forecast_name = f'{project}_arima'
response = forecast.create_forecast(
    ForecastName=forecast_name,
    PredictorArn=predictor_arn_arima
)
forecast_arn_arima = response['ForecastArn']

Now we need to wait for the forecasts to finish being created.

In [None]:
# Block on each forecast in turn (DeepAR+, Prophet, ARIMA) and describe it
# once the wait returns. Only the last describe call is shown as cell output.
wait(
    lambda: forecast.describe_forecast(ForecastArn=forecast_arn_deep_ar)
)
forecast.describe_forecast(ForecastArn=forecast_arn_deep_ar)

wait(
    lambda: forecast.describe_forecast(ForecastArn=forecast_arn_prophet)
)
forecast.describe_forecast(ForecastArn=forecast_arn_prophet)

wait(
    lambda: forecast.describe_forecast(ForecastArn=forecast_arn_arima)
)
forecast.describe_forecast(ForecastArn=forecast_arn_arima)

# Step 6. Querying the Forecasts<a class="anchor" id="query">

To query the forecasts that have been created,  use the following parameters.

* **start-date** and **end-date** – Specifies an optional date range to retrieve the forecast for. If you don't specify these parameters, the operation returns the entire forecast for the item.
* **filters** – Specifies the item_id filter used to retrieve the forecast for a single item (here `ke0023yc`).

Because this is a daily forecast, the response shows daily forecast values. In the response, note the following:

* **mean** – For the specific date and time, the mean is the predicted mean value.
* **p90, p50, and p10** – Specify the confidence level that the actual value will be below the listed value at the specified date and time. 

For more information about this operation, see [QueryForecast](https://docs.aws.amazon.com/forecast/latest/dg/API_forecastquery_QueryForecast.html).

In [None]:
# Item to query, written in lower case.
item_id = "ke0023yc"

# Retrieve the DeepAR+ forecast for that single item.
response = forecast_query.query_forecast(
    Filters={"item_id": item_id},
    ForecastArn=forecast_arn_deep_ar,
)

The response is a json structure:

In [None]:
# Dump the raw query response as a JSON string.
print(json.dumps(obj=response))

We will use a utility function already created for you to plot the actual values against the predicted values.

In [None]:
plt.figure(figsize=(30, 10))
# presumably load_exact_sol reads the actual values for `item_id` from this
# file — confirm against util.fcst_utils
fname = '../data/ke0023yc.csv'
actual = load_exact_sol(fname, item_id)

# Draw the queried forecast together with the actual values.
plot_forecasts(response, actual)
plt.title("DeepAR Forecast");

Now query the forecast for Prophet and, if you created it, your optional predictor.

In [None]:
plt.figure(figsize=(30, 10))
# Retrieve the Prophet forecast for the same item and plot it against the
# actual values loaded earlier.
response = forecast_query.query_forecast(
    Filters={"item_id": item_id},
    ForecastArn=forecast_arn_prophet,
)

plot_forecasts(response, actual)
plt.title("Prophet Forecast");

In [None]:
plt.figure(figsize=(30, 10))
# Retrieve the ARIMA forecast for the same item and plot it against the
# actual values loaded earlier.
response = forecast_query.query_forecast(
    Filters={"item_id": item_id},
    ForecastArn=forecast_arn_arima,
)
plot_forecasts(response, actual)
plt.title("ARIMA Forecast");

# Step 7. Exporting your Forecasts<a class="anchor" id="export">

In [None]:
# Export the DeepAR+ forecast to an S3 prefix named after the export job.
name = f'{project}_forecast_export_deep_ar_plus'
s3_path = f"{s3_data_path}/{name}"

response = forecast.create_forecast_export_job(
    ForecastExportJobName=name,
    ForecastArn=forecast_arn_deep_ar,
    Destination={"S3Config": {"Path": s3_path, "RoleArn": role_arn}},
)

forecast_export_arn_deep_ar = response['ForecastExportJobArn']

Now create your own export job for the Prophet model.

In [None]:
# Export the Prophet forecast to an S3 prefix named after the export job.
name = f'{project}_forecast_export_prophet'
s3_path = f"{s3_data_path}/{name}"

response = forecast.create_forecast_export_job(
    ForecastExportJobName=name,
    ForecastArn=forecast_arn_prophet,
    Destination={"S3Config": {"Path": s3_path, "RoleArn": role_arn}},
)

forecast_export_arn_prophet = response['ForecastExportJobArn']

In [None]:
# Export the ARIMA forecast to an S3 prefix named after the export job.
name = f'{project}_forecast_export_arima'
s3_path = f"{s3_data_path}/{name}"

response = forecast.create_forecast_export_job(
    ForecastExportJobName=name,
    ForecastArn=forecast_arn_arima,
    Destination={"S3Config": {"Path": s3_path, "RoleArn": role_arn}},
)

forecast_export_arn_arima = response['ForecastExportJobArn']