In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<!-- <table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/notebook_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/notebook_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table> -->

# Orchestrating a workflow to train and deploy a PyTorch time-series forecasting model using [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction)

## Overview

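End-to-end PyTorch example that trains a Temporal Fusion Transformer forecasting model on the Jena climate dataset with `pytorch-forecasting`, packages it in a TorchServe serving image, and orchestrates the workflow using [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines).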

This `kfp.v2` code does not currently run on OSS KFP, but it will once the 'v2 compatibility mode' feature is supported.


### Set up your local development environment

**If you are using Colab or Google Cloud Notebooks**, your environment already meets
all the requirements to run this notebook. You can skip this step.

**Otherwise**, make sure your environment meets this notebook's requirements.
You need the following:

* The Google Cloud SDK
* Git
* Python 3
* virtualenv
* Jupyter notebook running in a virtual environment with Python 3

The Google Cloud guide to [Setting up a Python development
environment](https://cloud.google.com/python/setup) and the [Jupyter
installation guide](https://jupyter.org/install) provide detailed instructions
for meeting these requirements. The following steps provide a condensed set of
instructions:

1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)

1. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)

1. [Install
   virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)
   and create a virtual environment that uses Python 3. Activate the virtual environment.

1. To install Jupyter, run `pip install jupyter` on the
command-line in a terminal shell.

1. To launch Jupyter, run `jupyter notebook` on the command-line in a terminal shell.

1. Open this notebook in the Jupyter Notebook Dashboard.

### Install additional packages



On Colab, authenticate first:

In [None]:
import sys

# Only Colab needs an interactive login here; in every other environment
# the module is not loaded and this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

Then, install the libraries.

In [None]:
import os

# The presence of this metadata file is how the notebook detects that it is
# running on the Google Cloud Notebook product.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user';
# elsewhere no extra pip flag is passed.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""

In [None]:
# Forecasting libraries, pinned to the same versions the training job requests.
!pip -q install {USER_FLAG} --upgrade pytorch-lightning==1.4.9 pytorch-forecasting==0.9.1

We will be using [Vertex SDK for Python](https://cloud.google.com/vertex-ai/docs/start/client-libraries#python) to interact with Vertex AI services. The high-level aiplatform library is designed to simplify common data science workflows by using wrapper classes and opinionated defaults.

#### Install Vertex SDK for Python

In [None]:
# Pipeline tooling: the KFP SDK, the Google Cloud pipeline components and the Vertex SDK.
!pip -q install {USER_FLAG} --upgrade kfp
!pip -q install {USER_FLAG} --upgrade google-cloud-pipeline-components 
!pip -q install {USER_FLAG} --upgrade google-cloud-aiplatform

### Restart the kernel

After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Restart the kernel so the packages installed above become importable.
# The restart is skipped when IS_TESTING is set in the environment.
if not os.getenv("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

Check the versions of the packages you installed.  The KFP SDK version should be >=1.6.

In [None]:
# Print the installed versions of the KFP SDK and the Google Cloud pipeline components.
!python3 -c "import kfp; print('kfp version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

## Before you begin

This notebook does not require a GPU runtime.

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component). 
Also [enable the Cloud Build API](https://console.cloud.google.com/flows/enableapi?apiid=cloudbuild.googleapis.com).

1. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
import os
# Get your Google Cloud project ID from gcloud
shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null

try:
    PROJECT_ID = shell_output[0]
except IndexError:
    PROJECT_ID = None

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Otherwise, set your project ID here.

In [None]:
# Fall back to a manually supplied project ID when gcloud returned none
# (either None or an empty string).
if not PROJECT_ID:
    PROJECT_ID = "python-docs-samples-tests"  # @param {type:"string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append it onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

def get_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string."""
    return format(datetime.now(), "%Y%m%d%H%M%S")


# One timestamp per notebook session, appended to resource names to keep them unique.
TIMESTAMP = get_timestamp()
print(f"TIMESTAMP = {TIMESTAMP}")

### Authenticate your Google Cloud account

**If you are using AI Platform Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via OAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "AI Platform"
into the filter box, and select
   **AI Platform Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# If on AI Platform, then don't execute this code
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket as necessary

You will need a Cloud Storage bucket for this example.  If you don't have one that you want to use, you can make one now.


Set the name of your Cloud Storage bucket below. It must be unique across all
Cloud Storage buckets.

You may also change the `REGION` variable, which is used for operations
throughout the rest of this notebook. Make sure to [choose a region where AI Platform (Unified) services are
available](https://cloud.google.com/ai-platform-unified/docs/general/locations#available_regions). You may
not use a Multi-Regional Storage bucket for training with AI Platform.

**Change the bucket name below** before running the next cell.

In [None]:
# Bare bucket name without the "gs://" scheme; replace the placeholder before running.
BUCKET = "<bucket_name_without_gs>" 
# Full bucket URI derived from BUCKET.
BUCKET_NAME = f"gs://{BUCKET}"  # @param {type:"string"}
# Region used when the bucket is created with `gsutil mb` below.
REGION = "us-central1"  # @param {type:"string"}

In [None]:
# Values that mean "no bucket was chosen": an empty name, a bare scheme, or an
# unedited placeholder (including the "<bucket_name_without_gs>" placeholder
# that this notebook's bucket cell starts with).
_UNSET_BUCKET_NAMES = (
    "",
    "gs://",
    "gs://[your-bucket-name]",
    "gs://<bucket_name_without_gs>",
)

# In that case derive a per-session bucket name from the project ID and timestamp.
if BUCKET_NAME is None or BUCKET_NAME in _UNSET_BUCKET_NAMES:
    BUCKET_NAME = "gs://" + PROJECT_ID + "aip-" + TIMESTAMP

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in the chosen region (run only if it does not exist yet).
! gsutil mb -l $REGION $BUCKET_NAME

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents to confirm that it is accessible.
! gsutil ls -al $BUCKET_NAME

### Import libraries and define constants



Define some constants. See the "Before you begin" section of the Managed Pipelines User Guide for information on creating your API key.


In [None]:
# Short application name; reused in the pipeline root, the Cloud Storage upload
# paths and as the argument passed to the cell that writes the serving Dockerfile.
APP_NAME = "climate-forecast"

In [None]:
# Append the user-level bin directory to PATH.
# NOTE(review): re-running this cell appends the directory again.
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin

# Cloud Storage location passed as `pipeline_root` to the pipeline definition.
PIPELINE_ROOT = f'{BUCKET_NAME}/pipeline_root/{APP_NAME}'

PIPELINE_ROOT

Do some imports:

In [None]:
import json
from typing import NamedTuple, List

from google_cloud_pipeline_components import aiplatform as aip_components
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs

from kfp.v2 import compiler
from kfp.v2 import dsl
from kfp.v2.dsl import (
    component,
    InputPath,
    OutputPath,
    Input,
    Output,
    Artifact,
    Dataset,
    Model,
    ClassificationMetrics,
    Metrics,
)
from kfp.v2.google.client import AIPlatformClient
from kfp.v2.google import experimental

In [None]:
!mkdir trainer/ predictor/ pipelines/

## Define the pipeline components



### 1. Component to run training job on Vertex AI

In [None]:
%%writefile trainer/task.py

# !pip install pytorch-lightning==1.4.9
# !pip install pytorch-forecasting==0.9.1

import os
import warnings

warnings.filterwarnings("ignore")  # avoid printing out absolute paths

import copy
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters


# Output location: taken from AIP_MODEL_DIR (default /tmp), with a gs:// prefix
# rewritten to /gcs/ so it can be used as a regular file path.
MODEL_DIR = os.getenv("AIP_MODEL_DIR", "/tmp").replace("gs://", "/gcs/")
print(f"MODEL_DIR = {MODEL_DIR}")

# read source dataset
url_data_src = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip'
data = pd.read_csv(url_data_src, compression='zip')
# replace the original headers with identifier-friendly column names
data.columns = ['Date_Time', 'p__mbar', 'T__degC', 'Tpot__K', 'Tdew__degC', 'rh__percent',
                'VPmax__mbar', 'VPact__mbar', 'VPdef__mbar', 'sh__g_per_kg', 'H2OC__mmol_per_mol',
                'rho__gm_per_cubic_m', 'wv__m_per_s', 'max_w__vm_per_s', 'wd__deg'
               ]


# transform dataset
# The timestamps are written day-first (dd.mm.YYYY HH:MM:SS). An explicit format
# is required: with format inference, ambiguous dates such as 01.02.2009 are
# read month-first, which scrambles the time ordering.
data['Date_Time'] = pd.to_datetime(data['Date_Time'], format='%d.%m.%Y %H:%M:%S')
# single series: every row belongs to the same group
data['series'] = 'Temp'
# add time index: whole clock hours elapsed since midnight of the first day.
# Counting real elapsed hours avoids the collisions that a year*365-based index
# produces when a leap year's day 366 overlaps 1 January of the following year.
# NOTE(review): if the source has several rows per hour they share one time_idx;
# confirm that this is intended for TimeSeriesDataSet.
first_midnight = data["Date_Time"].min().normalize()
data["time_idx"] = ((data["Date_Time"] - first_midnight) // pd.Timedelta(hours=1)).astype(int)
data["time_idx"] -= data["time_idx"].min()

# create dataset and dataloaders
pl.seed_everything(42)
max_prediction_length = 24  # forecast horizon, in time_idx steps
max_encoder_length = 120  # history window, in time_idx steps
# hold out the last 100 forecast horizons for validation
training_cutoff = data["time_idx"].max() - 100*max_prediction_length

# columns handed to the dataset as known real-valued covariates
time_varying_known_reals = [
    'p__mbar',
    'Tpot__K',
    'Tdew__degC',
    'rh__percent',
    'VPmax__mbar',
    'VPact__mbar',
    'VPdef__mbar',
    'sh__g_per_kg',
    'H2OC__mmol_per_mol',
    'rho__gm_per_cubic_m',
    'wv__m_per_s',
    'max_w__vm_per_s',
    'wd__deg',
    'time_idx'
]

# training set: everything up to and including the cutoff
training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="T__degC",
    group_ids=["series"],
    # the target itself is the only unknown (to-be-predicted) variable
    time_varying_unknown_reals=["T__degC"],
    time_varying_known_reals=time_varying_known_reals,
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

# Validation set: reuse the training dataset's configuration (including its
# fitted encoders/scalers) on the full frame, but only start predictions after
# the training cutoff. Building it from the whole frame without this lower
# bound would score the model on the very period it was trained on.
validation = TimeSeriesDataSet.from_dataset(
    training,
    data,
    min_prediction_idx=training_cutoff + 1,
    stop_randomization=True,
)

# create dataloaders for model
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

# Network settings shared by the learning-rate search model and the final model.
# The loss is deliberately not part of this dict: each model gets its own instance.
tft_settings = dict(
    hidden_size=16,  # most important hyperparameter apart from learning rate
    attention_head_size=1,  # number of attention heads. Set to up to 4 for large datasets
    dropout=0.1,  # between 0.1 and 0.3 are good values
    hidden_continuous_size=8,  # set to <= hidden_size
    output_size=7,  # 7 quantiles by default
    reduce_on_plateau_patience=4,  # reduce learning rate if no improvement in validation loss after x epochs
)

# ---- step 1: search for a learning rate ----
pl.seed_everything(42)
trainer = pl.Trainer(
    gpus=1,
    # gradient clipping is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    gradient_clip_val=0.1,
    default_root_dir=MODEL_DIR,
)

tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.03,
    loss=QuantileLoss(),
    **tft_settings,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

lr_search = trainer.tuner.lr_find(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=10.0,
    min_lr=1e-6,
)
suggested_lr = lr_search.suggestion()
print(f"suggested learning rate: {suggested_lr}")

# ---- step 2: train with the suggested learning rate ----
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger(MODEL_DIR)  # logging results to a tensorboard

trainer = pl.Trainer(
    max_epochs=1,
    gpus=1,
    weights_summary="top",
    gradient_clip_val=0.1,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)

tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=suggested_lr,
    loss=QuantileLoss(),
    log_interval=0,  # set e.g. to 10 for logging every 10 batches
    **tft_settings,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# fit network
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# save the model
# Persist the whole module rather than only its state_dict: the serving handler
# restores this file with a plain torch.load() and uses the result directly as a
# model (.to / .eval / .predict), which a bare state_dict does not support.
if trainer.global_rank == 0:
    # Detach the trainer first so the pickle does not drag along the trainer
    # with its dataloaders, callbacks and logger.
    tft.trainer = None
    torch.save(tft, f'{MODEL_DIR}/tft_model.pt')



In [None]:
@component(
    base_image="python:3.7",
    packages_to_install=["google-cloud-aiplatform", "pandas", "fsspec"],
    output_component_file="./pipelines/submit_custom_training_job.yaml")
def submit_custom_training_job( 
    project: str,
    bucket: str,
    training_container_image_uri: str,
    training_script_path: str,
    model: Output[Model],
    model_display_name: str
):
    """Run the training script as a Vertex AI custom training job.

    Args:
        project: Google Cloud project passed to ``aiplatform.init``.
        bucket: Staging bucket passed to ``aiplatform.init``.
        training_container_image_uri: Container image the script runs in.
        training_script_path: Path of the training script. A ``gs://`` URI is
            mapped to its ``/gcs/`` path before being handed to the SDK.
        model: Output artifact; its URI is used as the job's base output
            directory and its metadata records the framework and training time.
        model_display_name: Prefix of the job's display name.
    """
    from datetime import datetime
    import logging

    from google.cloud import aiplatform

    # The root logger defaults to WARNING, which would drop every INFO message below.
    logging.getLogger().setLevel(logging.INFO)

    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

    logging.info(f"Model artifacts located at model.uri = {model.uri}")
    logging.info(f"Model artifacts located at model.path = {model.path}")

    # initialize vertex sdk
    aiplatform.init(
        project=project, 
        staging_bucket=bucket
    )

    JOB_NAME = f"{model_display_name}-pytorch-{TIMESTAMP}"

    # The SDK packages the script from the local file system, so a gs:// URI is
    # rewritten to the /gcs/ path (same mapping the training script applies to
    # its output directory). Local paths pass through unchanged.
    # NOTE(review): assumes the bucket is reachable under /gcs/ in this container.
    local_script_path = training_script_path
    if local_script_path.startswith("gs://"):
        local_script_path = "/gcs/" + local_script_path[len("gs://"):]

    # configure the job with container image spec
    job = aiplatform.CustomTrainingJob(
        display_name=JOB_NAME, 
        container_uri=training_container_image_uri,
        script_path=local_script_path,
        requirements=["pytorch-lightning==1.4.9", "pytorch-forecasting==0.9.1"]
    )

    # define worker pool specs
    train_replica_count = 1
    train_machine_type = "n1-standard-4"
    train_accelerator_type = "NVIDIA_TESLA_T4"
    train_accelerator_count = 1

    # submit the custom job to Vertex training service; the job writes its
    # outputs under the model artifact's URI
    job.run(
        replica_count=train_replica_count,
        machine_type=train_machine_type,
        accelerator_type=train_accelerator_type,
        accelerator_count=train_accelerator_count,
        base_output_dir=model.uri
    )

    start, end = job._gca_resource.start_time, job._gca_resource.end_time

    logging.info(f"Model artifacts located at {model.uri}/model/{model_display_name}")
    model.metadata["framework"] = "pytorch"

    logging.info(f"Model artifacts located at model.uri = {model.uri}")
    logging.info(f"Model artifacts located at model.path = {model.path}")
    model.metadata["time_to_train_in_seconds"] = (end - start).total_seconds()

In [None]:
# Upload the training script written above to the application's train/ folder in the bucket.
!gsutil cp ./trainer/task.py {BUCKET_NAME}/{APP_NAME}/train/

In [None]:
# Show the Cloud Storage folder the training script was uploaded to.
!echo {BUCKET_NAME}/{APP_NAME}/train/

### 2. Component to create serving container image

In [None]:
%%writefile predictor/forecast_handler.py

import os
import json

import torch
from ts.torch_handler.base_handler import BaseHandler

import copy
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch

from pytorch_forecasting import TimeSeriesDataSet

import logging
logger = logging.getLogger(__name__)

class ForecastHandler(BaseHandler):
    """TorchServe handler that serves forecasts from a serialized model object.

    Each request payload is converted into a ``TimeSeriesDataSet``, passed to
    the model's ``predict`` method, and the stacked predictions are returned as
    nested Python lists.
    """

    # Forecast horizon and history window used to build the inference dataset.
    MAX_PREDICTION_LENGTH = 24
    MAX_ENCODER_LENGTH = 120

    # Columns handed to the dataset as known real-valued covariates.
    TIME_VARYING_KNOWN_REALS = (
        'p__mbar',
        'Tpot__K',
        'Tdew__degC',
        'rh__percent',
        'VPmax__mbar',
        'VPact__mbar',
        'VPdef__mbar',
        'sh__g_per_kg',
        'H2OC__mmol_per_mol',
        'rho__gm_per_cubic_m',
        'wv__m_per_s',
        'max_w__vm_per_s',
        'wd__deg',
        'time_idx',
    )

    def __init__(self):
        super(ForecastHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """Select the device and load the serialized model named in the manifest.

        Args:
            ctx: TorchServe context providing ``manifest`` and ``system_properties``.

        Raises:
            RuntimeError: if the serialized file is missing, or if it does not
                contain a model object (for example a bare ``state_dict``).
        """
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")

        # Only address a GPU when CUDA is available AND a gpu_id was assigned;
        # otherwise "cuda:None" would be built, which is not a valid device.
        gpu_id = properties.get("gpu_id")
        if torch.cuda.is_available() and gpu_id is not None:
            self.device = torch.device("cuda:" + str(gpu_id))
        else:
            self.device = torch.device("cpu")

        # Read model serialize/pt file
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)
        if not os.path.isfile(model_pt_path):
            raise RuntimeError("Missing the model.pt or pytorch_model.bin file")

        # Load model. map_location lets a file saved on a GPU machine be restored
        # on whatever device this container actually has.
        loaded = torch.load(model_pt_path, map_location=self.device)
        if not isinstance(loaded, torch.nn.Module):
            # e.g. a state_dict: the handler has no way to rebuild the network from it
            raise RuntimeError(
                "Expected {0} to contain a serialized model object, found {1}".format(
                    model_pt_path, type(loaded).__name__
                )
            )
        self.model = loaded
        self.model.to(self.device)
        self.model.eval()
        logger.debug('Forecasting model from path {0} loaded successfully'.format(model_dir))

        self.initialized = True

    def preprocess(self, inputs):
        """Build one ``TimeSeriesDataSet`` per request in ``inputs``.

        Each request is expected to carry its records under the ``'data'`` key,
        in a form accepted by ``pandas.DataFrame.from_dict``.

        Returns:
            list: one dataset per request, in request order.
        """
        datasets = []
        for example in inputs:
            frame = pd.DataFrame.from_dict(example['data'])

            # The dataset needs encoder length + prediction length rows, so the
            # last rows are repeated as placeholders for the forecast horizon.
            frame = pd.concat(
                [frame, frame.tail(self.MAX_PREDICTION_LENGTH)], ignore_index=True
            )
            # time_idx is rebuilt here as a plain running row index
            frame['time_idx'] = np.arange(frame.shape[0])

            inference_set = TimeSeriesDataSet(
                frame,
                time_idx="time_idx",
                target="T__degC",
                group_ids=["series"],
                time_varying_unknown_reals=["T__degC"],
                # fresh list per dataset, as the class-level tuple is shared
                time_varying_known_reals=list(self.TIME_VARYING_KNOWN_REALS),
                max_encoder_length=self.MAX_ENCODER_LENGTH,
                max_prediction_length=self.MAX_PREDICTION_LENGTH,
                min_encoder_length=self.MAX_ENCODER_LENGTH,
                min_prediction_length=self.MAX_PREDICTION_LENGTH,
                add_relative_time_idx=True,
                add_target_scales=True,
                add_encoder_length=True,
                allow_missing_timesteps=True,
                randomize_length=False,
            )
            datasets.append(inference_set)
        logger.debug('Done creating the inference set(s).')

        return datasets

    def inference(self, inputs):
        """Run ``predict`` on every dataset and stack the results along a new first axis."""
        results = []
        for inf_set in inputs:
            results.append(self.model.predict(inf_set))
        return torch.stack(results, dim=0)

    def postprocess(self, inputs):
        """Convert the stacked prediction tensor into nested Python lists."""
        return inputs.tolist()

In [None]:
%%bash -s $APP_NAME

# The application name is passed in as the first argument of this cell.
APP_NAME=$1

# The heredoc delimiter is left unquoted on purpose so that the application
# name is substituted into the generated Dockerfile.
cat << EOF > ./predictor/Dockerfile

FROM pytorch/torchserve:latest-cpu

# install dependencies (pinned to the versions used for training)
RUN pip3 install --upgrade pip
RUN pip3 install pytorch_lightning==1.4.9 pytorch_forecasting==0.9.1

# copy model artifacts, custom handler and other dependencies
COPY ./forecast_handler.py /home/model-server/
COPY ./model/ /home/model-server/
RUN ls -ltrR /home/model-server/

# create torchserve configuration file
USER root
RUN printf "\nservice_envelope=json" >> /home/model-server/config.properties
RUN printf "\ninference_address=http://0.0.0.0:7080" >> /home/model-server/config.properties
RUN printf "\nmanagement_address=http://0.0.0.0:7081" >> /home/model-server/config.properties
USER model-server

# expose health and prediction listener ports from the image
EXPOSE 7080
EXPOSE 7081

# create model archive file packaging model artifacts and dependencies
RUN torch-model-archiver -f \
  --model-name=$APP_NAME \
  --version=1.0 \
  --serialized-file=/home/model-server/tft_model.pt \
  --handler=/home/model-server/forecast_handler.py \
  --export-path=/home/model-server/model-store

# run Torchserve HTTP serve to respond to prediction requests
CMD ["torchserve", \
     "--start", \
     "--ts-config=/home/model-server/config.properties", \
     "--models", \
     "$APP_NAME=$APP_NAME.mar", \
     "--model-store", \
     "/home/model-server/model-store"]
EOF

echo "Writing ./predictor/Dockerfile"

In [None]:
# Upload the Dockerfile and the custom handler to the application's serve/ folder;
# the image-build component reads them from there.
!gsutil cp ./predictor/Dockerfile {BUCKET_NAME}/{APP_NAME}/serve/
!gsutil cp ./predictor/forecast_handler.py {BUCKET_NAME}/{APP_NAME}/serve/predictor/

In [None]:
# Recursively list the uploaded serving files to confirm the upload.
!gsutil ls -lR {BUCKET_NAME}/{APP_NAME}/serve/

In [None]:
@component(
    base_image="gcr.io/google.com/cloudsdktool/cloud-sdk:latest",
    packages_to_install=["google-cloud-build"],
    output_component_file="./pipelines/build_custom_serving_image.yaml",
)
def build_custom_serving_image(
    project: str,
    gs_model_artifacts: Input[Model],
    gs_serving_dependencies_path: str,
    model_display_name: str
)-> NamedTuple(
    "Outputs",
    [
        ("serving_container_uri", str)
    ],
):
    """Build the serving image with Cloud Build and return its URI.

    The build copies the trained model folder, the custom handler and the
    Dockerfile from Cloud Storage into the build workspace and runs
    ``docker build`` on it.

    Args:
        project: Google Cloud project that runs the build and hosts the image.
        gs_model_artifacts: Model artifact; ``<uri>/model`` is copied into the build.
        gs_serving_dependencies_path: Cloud Storage folder holding ``Dockerfile``
            and ``predictor/forecast_handler.py``.
        model_display_name: Used as the suffix of the image name.

    Returns:
        A named tuple whose ``serving_container_uri`` is the built image URI.
    """
    from datetime import datetime, timedelta
    from collections import namedtuple
    import logging
    import os

    from google.cloud.devtools import cloudbuild_v1 as cloudbuild
    from google.protobuf.duration_pb2 import Duration

    logging.getLogger().setLevel(logging.INFO)
    build_client = cloudbuild.services.cloud_build.CloudBuildClient()

    logging.info(f"gs_serving_dependencies_path: {gs_serving_dependencies_path}")
    logging.info(f"gs_model_artifacts.uri: {gs_model_artifacts.uri}")
    logging.info(f"gs_model_artifacts.path: {gs_model_artifacts.path}")

    gs_dockerfile_path = os.path.join(gs_serving_dependencies_path, 'Dockerfile')
    gs_predictor_src_path = os.path.join(gs_serving_dependencies_path, 'predictor/')

    # tag every build with its creation time
    build_version = datetime.now().strftime("%Y%m%d%H%M%S")
    image_name = f"pytorch_predict_{model_display_name}"
    serving_image_uri = f"gcr.io/{project}/{image_name}:{build_version}"
    logging.info(f"serving_image_uri: {serving_image_uri}")

    # FromTimedelta fills the message in place and returns None, so its return
    # value must not be used as the timeout itself.
    build_timeout = Duration()
    build_timeout.FromTimedelta(timedelta(minutes=30))

    build = cloudbuild.Build(
        images=[serving_image_uri],
        timeout=build_timeout
    )

    build.steps = [
        {
            "name": "gcr.io/cloud-builders/gcloud",
            "entrypoint": "bash",
            "args": ["-c", "mkdir model"]
        },
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", "-r", f"{gs_model_artifacts.uri}/model", "."]
        },
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", f"{gs_predictor_src_path}forecast_handler.py", "."]
        },
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", gs_dockerfile_path, "Dockerfile"]
        },
        {
            # list the workspace so the build log shows what goes into the image
            "name": "gcr.io/cloud-builders/gcloud",
            "entrypoint": "bash",
            "args": ["-c", "ls -ltrR ."]
        },
        {
          "name": 'gcr.io/cloud-builders/docker',
          "args": [ 'build', '-t', serving_image_uri, '.' ]
        }
    ]
    operation = build_client.create_build(project_id=project, build=build, timeout=1800)
    logging.info("IN PROGRESS:")
    logging.info(operation.metadata)

    # block until the build finishes
    result = operation.result()
    # Log the completed status; the value needs a placeholder, otherwise the
    # logging call fails to format and the message is lost.
    logging.info("RESULT: %s", result.status)

    outputs = namedtuple("Outputs", ["serving_container_uri"])

    return outputs(serving_image_uri)

### 4. Component to test the model deployment by making online prediction requests

In [None]:
@component(
    base_image="gcr.io/google.com/cloudsdktool/cloud-sdk:latest",
    packages_to_install=["google-cloud-aiplatform"],
    output_component_file="./pipelines/make_prediction_request.yaml")
def make_prediction_request( 
    project: str,
    bucket: str,
    endpoint: str,
    instances: list
    ):
    """Send each entry of ``instances`` to the deployed endpoint and log the predictions.

    Args:
        project: Google Cloud project passed to ``aiplatform.init``.
        bucket: Staging bucket passed to ``aiplatform.init``.
        endpoint: String holding a Python-literal dict whose
            ``["resources"][0]["resourceUri"]`` is the endpoint resource URI.
        instances: Payloads; each entry is sent in its own ``predict`` call.
    """
    import ast
    import logging

    from google.cloud import aiplatform

    logging.getLogger().setLevel(logging.INFO)
    aiplatform.init(project=project, staging_bucket=bucket)

    logging.info(f"Endpoint: {endpoint}")
    endpoint_info = ast.literal_eval(endpoint)
    endpoint_uri = endpoint_info["resources"][0]["resourceUri"]
    logging.info(f"Endpoint URI: {endpoint_uri}")

    # Keep only the middle of the URI: drop the first four "/"-separated
    # segments and the last two.
    uri_segments = endpoint_uri.split("/")
    endpoint_name = "/".join(uri_segments[4:-2])
    _endpoint = aiplatform.Endpoint(endpoint_name)

    for instance in instances:
        response = _endpoint.predict(instances=instance)
        logging.info(f"Prediction response: {response.predictions}")

In [None]:
# Sample resource URI of the shape parsed in make_prediction_request: splitting
# on "/" and keeping segments [4:-2] leaves "projects/.../locations/.../endpoints/...".
# NOTE(review): not referenced by the code visible in this part of the notebook.
uri = "https://us-central1-aiplatform.googleapis.com/v1/projects/560224572293/locations/us-central1/endpoints/8159546158469873664/operations/5813651307910660096"


## Define Pipeline

In [None]:
@dsl.pipeline(
    # Must equal PIPELINE_NAME ("pytorch-climate-forecasting-pipeline"), which
    # the notebook later passes to aiplatform.get_pipeline_df to look up runs.
    name="pytorch-climate-forecasting-pipeline",
    pipeline_root=PIPELINE_ROOT,
)
def pytorch_forecasting_pipeline(
    project: str,
    bucket: str,
    region: str,
    training_container_image_uri: str,
    gs_train_script_path: str,
    gs_serving_dependencies_path: str,
    model_display_name: str,
    serving_container_health_route: str,
    serving_container_predict_route: str,
    serving_container_ports: list,
    pipeline_job_id: str,
    pipeline_name: str,
    is_hp_tuning_enabled: str = 'n',
    tensorboard_instance: str = None
):
    """Train, package, upload, deploy and smoke-test the forecasting model.

    Steps, in dependency order:
      1. Submit the custom training job.
      2. Build the TorchServe serving image from the trained model artifacts.
      3. Upload the model (served by that image) to Vertex AI.
      4. Create an endpoint.
      5. Deploy the uploaded model to the endpoint.
      6. Send a sample online prediction request to the deployment.

    Args:
        project: Project forwarded to every step that takes one.
        bucket: Bucket forwarded to the training and prediction-test steps.
        training_container_image_uri: Image used by the training step.
        gs_train_script_path: Training script path given to the training step.
        gs_serving_dependencies_path: Serving dependencies location given to
            the image-build step.
        model_display_name: Display name reused for the training job, model,
            endpoint and deployed model.
        serving_container_health_route: Health route for the serving container.
        serving_container_predict_route: Predict route for the serving
            container.
        serving_container_ports: Ports exposed by the serving container.
        region, pipeline_job_id, pipeline_name, is_hp_tuning_enabled,
        tensorboard_instance: Accepted as pipeline parameters but not
            referenced by any step in this pipeline body.
    """
    run_train_task = submit_custom_training_job(
        project=project,
        bucket=bucket,
        training_container_image_uri=training_container_image_uri,
        training_script_path=gs_train_script_path,
        model_display_name=model_display_name
    ).set_caching_options(True).set_display_name("Train forecasting model")

    # build custom container for serving predictions using
    # the trained model artifacts served by TorchServe
    build_custom_serving_image_task = build_custom_serving_image(
        project=project,
        gs_model_artifacts=run_train_task.outputs["model"],
        gs_serving_dependencies_path=gs_serving_dependencies_path,
        model_display_name=model_display_name
    ).set_caching_options(True).set_display_name("Build TorchServe serving image")

    # upload model to vertex ai
    # NOTE: model artifacts and the prediction handler are part of the container
    serving_container_image_uri = build_custom_serving_image_task.outputs['serving_container_uri']
    model_upload_task = aip_components.ModelUploadOp(
        project=project,
        display_name=model_display_name,
        serving_container_image_uri=serving_container_image_uri,
        serving_container_predict_route=serving_container_predict_route,
        serving_container_health_route=serving_container_health_route,
        serving_container_ports=serving_container_ports
    ).set_caching_options(True).set_display_name("Upload model")

    # create endpoint to deploy one or more models
    # An endpoint provides a service URL where the prediction requests are sent
    endpoint_create_task = aip_components.EndpointCreateOp(
        project=project,
        display_name=model_display_name,
    ).set_caching_options(True).set_display_name("Create endpoint")

    # deploy the model to the endpoint to associate physical resources with it
    # so it can serve online predictions
    model_deploy_task = aip_components.ModelDeployOp(
        endpoint=endpoint_create_task.outputs["endpoint"],
        model=model_upload_task.outputs["model"],
        deployed_model_display_name=model_display_name,
        dedicated_resources_machine_type="n1-standard-4",
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
        traffic_split='{ "0": 100 }'
    ).set_caching_options(True).set_display_name("Deploy model to endpoint")

    # test model deployment by making online prediction requests;
    # one sample row of 14 numeric values
    test_instances = [
        [996.52, -8.02, 265.40, -8.90, 93.3, 3.33, 3.11, 0.22, 1.94, 3.12, 1307.75, 1.03, 1.75, 152.3]
    ]
    # Keyword arguments, like every other step above, so the wiring does not
    # depend on the component's parameter order.
    predict_test_instances_task = make_prediction_request(
        project=project,
        bucket=bucket,
        endpoint=model_deploy_task.outputs['gcp_resources'],
        instances=test_instances
    ).set_caching_options(True).set_display_name("Test model deployment")


In [None]:
# Local path shared by the compile cell (as `package_path`) and the
# PipelineJob cell (as `template_path`).
PIPELINE_JSON_SPEC_PATH = './pipelines/pytorch_climate_forecast_pipeline_spec.json'

In [None]:
# Compile the pipeline function and write the resulting package to
# PIPELINE_JSON_SPEC_PATH.
compiler.Compiler().compile(
    package_path=PIPELINE_JSON_SPEC_PATH,
    pipeline_func=pytorch_forecasting_pipeline,
)

In [None]:
# Initialise the Vertex AI SDK with the notebook's project and region.
aiplatform.init(
    location=REGION,
    project=PROJECT_ID,
)

In [None]:
# Run-level settings plus the runtime parameter values handed to the pipeline.
PIPELINE_NAME = "pytorch-climate-forecasting-pipeline"
PIPELINE_JOB_ID = f"pipeline-{APP_NAME}-{get_timestamp()}"
TRAIN_APP_CODE_PATH = f"/gcs/{BUCKET}/{APP_NAME}/train/task.py"
SERVE_DEPENDENCIES_PATH = f"{BUCKET_NAME}/{APP_NAME}/serve/"
pipeline_params = {
    "training_container_image_uri": "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-9:latest",
    "gs_train_script_path": TRAIN_APP_CODE_PATH,
    "gs_serving_dependencies_path": SERVE_DEPENDENCIES_PATH,
    "project": PROJECT_ID,
    "bucket": BUCKET_NAME,
    "region": REGION,
    "model_display_name": APP_NAME,
    "serving_container_health_route": "/ping",
    "serving_container_predict_route": f"/predictions/{APP_NAME}",
    "serving_container_ports": [{"containerPort" : 7080}],
    "is_hp_tuning_enabled": "n",
    "pipeline_name": PIPELINE_NAME,
    "pipeline_job_id": PIPELINE_JOB_ID,
    # NOTE(review): hard-coded resource name with a fixed project number;
    # presumably needs to be replaced when running in another project -- confirm.
    "tensorboard_instance": "projects/560224572293/locations/us-central1/tensorboards/3406515721268625408"
}

In [None]:
# Describe the pipeline run: the compiled spec, where its outputs are rooted,
# the runtime parameter values, and step caching turned on.
pipeline_job = aiplatform.PipelineJob(
    template_path=PIPELINE_JSON_SPEC_PATH,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=pipeline_params,
    display_name=PIPELINE_NAME,
    job_id=PIPELINE_JOB_ID,
    enable_caching=True,
)

In [None]:
# Start the pipeline run with sync=False and display whatever run() returned.
# NOTE(review): presumably sync=False means the call does not wait for the
# run to finish -- confirm against the SDK version in use.
response = pipeline_job.run(sync=False)
response

In [None]:
# Fetch the results recorded for the pipeline called PIPELINE_NAME (with any
# underscores swapped for hyphens) and display them.
# NOTE(review): the name passed here presumably has to equal the name the
# pipeline was registered under -- confirm.
df = aiplatform.get_pipeline_df(pipeline=PIPELINE_NAME.replace("_", "-"))
df