In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AI Platform (Unified) Model Builder SDK: AutoML Forecasting Model Training Example

To use this Colaboratory notebook, you copy the notebook to your own Google Drive and open it with Colaboratory (or Colab). You can run each step, or cell, and see its results. To run a cell, use Shift+Enter. Colab automatically displays the return value of the last line in each cell. For more information about running notebooks in Colab, see the [Colab welcome page](https://colab.research.google.com/notebooks/welcome.ipynb).

This notebook demonstrates how to create an AutoML model based on a time series dataset. It requires you to provide a bucket where the dataset will be stored.

Note: you may incur charges for training, prediction, storage or usage of other GCP products in connection with testing this SDK.

# Authenticate and Install Model Builder SDK

This section authenticates you and sets up your environment to use an experimental version of the Model Builder SDK that contains support for AutoML Forecasting. Refer to the [Model Builder SDK User Guide](https://docs.google.com/document/d/1tFhzwCbR1jU-_BLkxmEriA9Y3NDWse2YYEYyKXkRybk) for an overview; detailed documentation can be downloaded from [here](https://storage.cloud.google.com/python-aiplatform/forecasting/v0.1/docs-0.6.0.zip).

After the SDK is installed, the kernel is restarted automatically. You may see the error message `Your session crashed for an unknown reason`; this is expected.

In [1]:
%%capture
# Export the SDK version as an environment variable; the shell commands in the
# install cell below read it as ${SDK_VERSION}.
%env SDK_VERSION=0.6.0

In [2]:
import sys

# The google.colab module is only present in sys.modules on a Colab runtime;
# on any other runtime this cell does nothing.
if 'google.colab' in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [3]:
%%capture
# hides output
# Copy the experimental SDK tarball for the version named by the SDK_VERSION
# environment variable into the current directory.
!gsutil cp gs://python-aiplatform/forecasting/v0.1/google-cloud-aiplatform-${SDK_VERSION}.tar.gz .
# Install the tarball that was just downloaded.
!pip3 install google-cloud-aiplatform-${SDK_VERSION}.tar.gz
import IPython
# Shut the kernel down, passing True to request a restart -- presumably so the
# freshly installed SDK is the one imported afterwards. Colab may report
# "Your session crashed for an unknown reason" when this runs.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

### Enter your project and GCS bucket

Enter your Project Id in the cell below. Then run the cell to make sure the Cloud SDK uses the right project for all the commands in this notebook.

In [1]:
# Google Cloud project ID passed to aiplatform.init below; replace with your own.
MY_PROJECT = "pytorch-tpu-nfs"

In [2]:
# Bucket passed to aiplatform.init as the staging bucket.
# It should be in the same region as AI Platform (Unified).
MY_STAGING_BUCKET = "automl-samples"

The dataset we are using is the Rossmann store sales forecasting dataset (stored in the bucket as `rossman_train.csv`).

In [3]:
# GCS location of the training CSV used to create the managed dataset.
gcs_csv_path = "gs://automl-samples/forcasting-datasets/rossman_train.csv"

# Initialize Model Builder SDK

Authenticate and initialize the *client* for AI Platform (Unified)

In [4]:
import sys

# Same Colab-only login as before the install step; presumably repeated because
# the install cell restarts the kernel. A no-op outside Colab.
if 'google.colab' in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [5]:
from google.cloud import aiplatform

# Initialise the SDK once with the project and staging bucket entered above.
aiplatform.init(
    project=MY_PROJECT,
    staging_bucket=MY_STAGING_BUCKET,
)

# Create a Managed Time Series Dataset from CSV

This section creates a managed dataset from a CSV file stored in a GCS bucket.

In [6]:
# Create a managed time series dataset from the training CSV.
ds = aiplatform.datasets.TimeSeriesDataset.create(
    display_name="rossman",
    gcs_source=[gcs_csv_path],
)

# Last expression of the cell, so the resource name is displayed as output.
ds.resource_name

INFO:google.cloud.aiplatform.datasets.dataset:Creating TimeSeriesDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TimeSeriesDataset backing LRO: projects/64701051322/locations/us-central1/datasets/3953676687715074048/operations/7793223518711185408
INFO:google.cloud.aiplatform.datasets.dataset:TimeSeriesDataset created. Resource name: projects/64701051322/locations/us-central1/datasets/3953676687715074048
INFO:google.cloud.aiplatform.datasets.dataset:To use this TimeSeriesDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TimeSeriesDataset('projects/64701051322/locations/us-central1/datasets/3953676687715074048')


'projects/64701051322/locations/us-central1/datasets/3953676687715074048'

# Launch a Training Job to Create a Model

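This training job uses AutoML, so no training script is needed: we configure the job with the column transformations and then run it on the dataset to create a model.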

In [7]:
# List the files stored under the sample dataset prefix.
!gsutil ls gs://automl-samples/forcasting-datasets/

gs://automl-samples/forcasting-datasets/rossman_predict.csv
gs://automl-samples/forcasting-datasets/rossman_train.csv


In [8]:
# Configure the AutoML forecasting job: minimise mean absolute error and declare
# how each input column should be interpreted.
job = aiplatform.AutoMLForecastingTrainingJob(
    display_name='train-rossman-automl_1',
    optimization_objective='minimize-mae',
    column_transformations=[
        {"timestamp": {"column_name": "Date"}},
        {"categorical": {"column_name": "DayOfWeek"}},
        {"categorical": {"column_name": "Open"}},
        {"categorical": {"column_name": "Promo"}},
        {"numeric": {"column_name": "Sales"}},
        {"categorical": {"column_name": "SchoolHoliday"}},
        {"categorical": {"column_name": "StateHoliday"}},
    ],
)

# This will take around an hour to run
model = job.run(
    dataset=ds,
    target_column='Sales',
    time_column='Date',
    time_series_identifier_column='Store',
    # Every column except the target is listed as available at forecast time.
    available_at_forecast_columns=['Date', 'StateHoliday', 'SchoolHoliday', 'DayOfWeek', 'Open', 'Promo'],
    unavailable_at_forecast_columns=['Sales'],
    time_series_attribute_columns=[],
    # The SDK documents forecast_horizon as an int; it is counted in units of
    # the data granularity set just below (1 day), so pass 10 rather than 10.0.
    forecast_horizon=10,
    data_granularity_unit='day',
    data_granularity_count=1,
    weight_column=None,
    # 1000 milli node hours = 1 node hour of training budget.
    budget_milli_node_hours=1000,
    model_display_name='rossman-forecast-model',
    predefined_split_column_name=None,
)

INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4038594169751470080?project=64701051322
INFO:google.cloud.aiplatform.training_jobs:AutoMLForecastingTrainingJob projects/64701051322/locations/us-central1/trainingPipelines/4038594169751470080 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLForecastingTrainingJob  run. Resource name: projects/64701051322/locations/us-central1/trainingPipelines/4038594169751470080
INFO:google.cloud.aiplatform.training_jobs:Model available at projects/64701051322/locations/us-central1/models/6392842078866374656


# Fetch Model Evaluation Metrics

In [9]:
import pandas as pd

# Print one Metric/Value table for each evaluation attached to the trained model.
list_evaluation_pager = model.api_client.list_model_evaluations(
    parent=model.resource_name
)
for model_evaluation in list_evaluation_pager:
    metrics_dict = {
        name: value for name, value in model_evaluation.metrics.items()
    }
    df = pd.DataFrame(metrics_dict.items(), columns=["Metric", "Value"])
    print(df.to_string(index=False))

                     Metric      Value
    rootMeanSquaredLogError        NaN
                   rSquared   0.887067
meanAbsolutePercentageError        inf
       rootMeanSquaredError 709.745540
          meanAbsoluteError 489.048860


# Batch Prediction


In [13]:
# The prediction destination in the next cell is built from this value as a
# gs:// URI. Derive the URI from the bucket entered earlier instead of
# hard-coding the sample bucket, so output lands in the user's own bucket.
# The check keeps the cell idempotent when it is re-run.
if not MY_STAGING_BUCKET.startswith('gs://'):
    MY_STAGING_BUCKET = f'gs://{MY_STAGING_BUCKET}'

In [14]:
# CSV of rows to score, and the GCS prefix under which results are written.
BATCH_PREDICT_SOURCE = "gs://automl-samples/forcasting-datasets/rossman_predict.csv"
BATCH_PREDICT_DESTINATION_PREFIX = f"{MY_STAGING_BUCKET}/prediction"

# The call is the cell's last expression, so the returned job is displayed.
model.batch_predict(
    gcs_source=BATCH_PREDICT_SOURCE,
    instances_format="csv",
    gcs_destination_prefix=BATCH_PREDICT_DESTINATION_PREFIX,
    predictions_format="csv",
    job_display_name="predict-rossman-automl_1",
)

INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/64701051322/locations/us-central1/batchPredictionJobs/7695517067176312832
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/64701051322/locations/us-central1/batchPredictionJobs/7695517067176312832')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/7695517067176312832?project=64701051322
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/64701051322/locations/us-central1/batchPredictionJobs/7695517067176312832 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob  run. Resource name: projects/64701051322/locations/us-central1/batchPredictionJobs/7695517067176312832


<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f404c230190> 
resource name: projects/64701051322/locations/us-central1/batchPredictionJobs/7695517067176312832