# Build Custom Images for Training and Prediction

In [2]:
# !pwd

In [3]:
# PREFIX = 'mabv1'
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [4]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid_vertex.movielens_ds_rec_bandits_v2"
BIGQUERY_TABLE_ID        = "hybrid_vertex.movielens_ds_rec_bandits_v2.training_dataset"

REPO

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [6]:
# !tree src

## Write cloudbuild YAML

In [7]:
# %%writefile cloudbuild.yaml

# steps:
# - name: 'gcr.io/cloud-builders/docker'
#   args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/$_DOCKERNAME']
# images:
# - '$_IMAGE_URI'

# Create custom training container

In [8]:
TRAIN_MAB_IMAGE_NAME = f"train_hpt_mab_e2e_{PREFIX}".replace("-","_")
TRAIN_MAB_IMAGE_URI  = f"gcr.io/hybrid-vertex/{TRAIN_MAB_IMAGE_NAME}" # :latest
DOCKERNAME_MAB       = "Dockerfile_train_mab_e2e"

print(f"TRAIN_MAB_IMAGE_NAME : {TRAIN_MAB_IMAGE_NAME}")
print(f"TRAIN_MAB_IMAGE_URI  : {TRAIN_MAB_IMAGE_URI}")
print(f"DOCKERNAME_MAB       : {DOCKERNAME_MAB}")

TRAIN_MAB_IMAGE_NAME : train_hpt_mab_e2e_rec_bandits_v2
TRAIN_MAB_IMAGE_URI  : gcr.io/hybrid-vertex/train_hpt_mab_e2e_rec_bandits_v2
DOCKERNAME_MAB       : Dockerfile_train_mab_e2e


#### Write a Dockerfile

- Use the [cloudml-hypertune](https://github.com/GoogleCloudPlatform/cloudml-hypertune) Python package to report training metrics to Vertex AI for hyperparameter tuning.
- Use the Google [Cloud Storage client library](https://cloud.google.com/storage/docs/reference/libraries) to read the best hyperparameters learned from a previous hyperarameter tuning job during training.

In [9]:
gpu_profiling = True # True | False

print(f"gpu_profiling : {gpu_profiling}")

gpu_profiling : True


In [10]:
if gpu_profiling:
    # TRAIN_BASE_IMAGE = 'tensorflow/tensorflow:2.13.0-gpu'
    TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310'
    NVTOP_RUN = 'RUN apt update && apt -y install nvtop'
    # NVTOP_RUN = 'RUN apt-get update && apt-get -y install nvtop'
else:
    TRAIN_BASE_IMAGE = 'python:3.10'
    NVTOP_RUN = None
    
RUN_EXPORT = "RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/"
    
print(f"TRAIN_BASE_IMAGE : {TRAIN_BASE_IMAGE}")
print(f"NVTOP_RUN        : {NVTOP_RUN}")
print(f"RUN_EXPORT       : {RUN_EXPORT}")

TRAIN_BASE_IMAGE : gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310
NVTOP_RUN        : RUN apt update && apt -y install nvtop
RUN_EXPORT       : RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/


In [11]:
dockerfile = f'''
# Specifies base image and tag.
# FROM gcr.io/google-appengine/python
FROM {TRAIN_BASE_IMAGE}

ENV PYTHONUNBUFFERED True
WORKDIR /root

RUN pip install --upgrade pip

RUN pip install cloudml-hypertune==0.1.0.dev6
RUN pip install google-cloud-storage==1.39.0
RUN pip install tensorflow==2.13.0
RUN pip install tensorboard-plugin-profile==2.13.1
RUN pip install tf-agents==0.17.0
RUN pip install matplotlib==3.8.0
RUN pip install urllib3==1.26.6

{NVTOP_RUN}

# Copies training code to the Docker image.
COPY src/training /root/src/training

{RUN_EXPORT}

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.training.task"]
'''
print(dockerfile)


# Specifies base image and tag.
# FROM gcr.io/google-appengine/python
FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310

ENV PYTHONUNBUFFERED True
WORKDIR /root

RUN pip install --upgrade pip

RUN pip install cloudml-hypertune==0.1.0.dev6
RUN pip install google-cloud-storage==1.39.0
RUN pip install tensorflow==2.13.0
RUN pip install tensorboard-plugin-profile==2.13.1
RUN pip install tf-agents==0.17.0
RUN pip install matplotlib==3.8.0
RUN pip install urllib3==1.26.6

RUN apt update && apt -y install nvtop

# Copies training code to the Docker image.
COPY src/training /root/src/training

RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.training.task"]



In [None]:
with open(f'{DOCKERNAME_MAB}', 'w') as f:
    f.write(dockerfile)

#### Files that will be included in Cloud Build image
* to adjust this see the gcloudignore section at the end of `00-env-setup.ipynb` notebook

In [None]:
%%writefile .gcloudignore
.gcloudignore
/WIP/*
*.pkl
*.png
*.ipynb
.git
.github
.ipynb_checkpoints/*
*__pycache__
*cpython-37.pyc
/imgs/*
README.md
.gitignore
.DS_Store
*.tfrecord
src/archive/*
00-archived/*
learning/*
*.data-00000-of-00001
src/tests/*
*.index

In [12]:
# check eligble files
!gcloud meta list-files-for-upload

requirements.txt
Dockerfile
Dockerfile_train_mab_e2e
cloudbuild.yaml
src/prediction/prestart.sh
src/prediction/main.py
src/training/policy_util.py
src/training/task.py
src/utils/data_config.py
hptuning/result.json


#### Build the custom container with Cloud Build

In [13]:
# image definitions for training
MACHINE_TYPE          ='e2-highcpu-32'
FILE_LOCATION         = "." # './src'

DOCKERNAME            = DOCKERNAME_MAB
IMAGE_URI             = TRAIN_MAB_IMAGE_URI

print(f"DOCKERNAME    : {DOCKERNAME}")
print(f"IMAGE_URI     : {IMAGE_URI}")
print(f"FILE_LOCATION : {FILE_LOCATION}")
print(f"MACHINE_TYPE  : {MACHINE_TYPE}")

DOCKERNAME    : Dockerfile_train_mab_e2e
IMAGE_URI     : gcr.io/hybrid-vertex/train_hpt_mab_e2e_rec_bandits_v2
FILE_LOCATION : .
MACHINE_TYPE  : e2-highcpu-32


### run in notebook terminal to continue in-notebook

In [17]:
CLOUD_BUILD_CMD = f'''gcloud builds submit --config ./cloudbuild.yaml \
--substitutions _DOCKERNAME={DOCKERNAME},_IMAGE_URI={IMAGE_URI},_FILE_LOCATION={FILE_LOCATION} \
--timeout=2h \
--machine-type={MACHINE_TYPE} \
--quiet
'''
print(CLOUD_BUILD_CMD)

gcloud builds submit --config ./cloudbuild.yaml --substitutions _DOCKERNAME=Dockerfile_train_mab_e2e,_IMAGE_URI=gcr.io/hybrid-vertex/train_hpt_mab_e2e_rec_bandits_v2,_FILE_LOCATION=. --timeout=2h --machine-type=e2-highcpu-32 --quiet



# Create custom prediction container

As with training, create a custom prediction container. This container handles the TF-Agents specific logic that is different from a regular TensorFlow Model. Specifically, it finds the predicted action using a trained policy. The associated source code is in `src/prediction/`.
See other options for Vertex AI predictions [here](https://cloud.google.com/vertex-ai/docs/predictions/getting-predictions).

#### Serve predictions:

- Use [`tensorflow.saved_model.load`](https://www.tensorflow.org/agents/api_docs/python/tf_agents/policies/PolicySaver#usage), instead of [`tf_agents.policies.policy_loader.load`](https://github.com/tensorflow/agents/blob/r0.8.0/tf_agents/policies/policy_loader.py#L26), to load the trained policy, because the latter produces an object of type [`SavedModelPyTFEagerPolicy`](https://github.com/tensorflow/agents/blob/402b8aa81ca1b578ec1f687725d4ccb4115386d2/tf_agents/policies/py_tf_eager_policy.py#L137) whose `action()` is not compatible for use here.
- Note that prediction requests contain only observation data but not reward. This is because: The prediction task is a standalone request that doesn't require prior knowledge of the system state. Meanwhile, end users only know what they observe at the moment. Reward is a piece of information that comes after the action has been made, so the end users would not have knowledge of said reward. In handling prediction requests, you create a [`TimeStep`](https://www.tensorflow.org/agents/api_docs/python/tf_agents/trajectories/TimeStep) object (consisting of `observation`, `reward`, `discount`, `step_type`) using the [`restart()`](https://www.tensorflow.org/agents/api_docs/python/tf_agents/trajectories/restart) function which takes in an `observation`. This function creates the *first* TimeStep in a trajectory of steps, where reward is 0, discount is 1 and step_type is marked as the first timestep. In other words, each prediction request forms the first `TimeStep` in a brand new trajectory.
- For the prediction response, avoid using NumPy-typed values; instead, convert them to native Python values using methods such as [`tolist()`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.tolist.html) as opposed to `list()`.
- There exists a prestart script in `src/prediction`. FastAPI executes this script before starting up the server. The `PORT` environment variable is set to equal `AIP_HTTP_PORT` in order to run FastAPI on the same port expected by Vertex AI.

In [18]:
SERVING_APP_DIR      = "app"  # fixed for this example in dockerfile

PRED_MAB_IMAGE_NAME = f"mab_custom_prediction_{PREFIX}".replace("-","_")
PRED_MAB_IMAGE_URI  = f"gcr.io/hybrid-vertex/{PRED_MAB_IMAGE_NAME}" # :latest
DOCKERNAME_MAB_PRED = "Dockerfile_predict_mab_e2e"

print(f"PRED_MAB_IMAGE_NAME : {PRED_MAB_IMAGE_NAME}")
print(f"PRED_MAB_IMAGE_URI  : {PRED_MAB_IMAGE_URI}")
print(f"DOCKERNAME_MAB_PRED : {DOCKERNAME_MAB_PRED}")

PRED_MAB_IMAGE_NAME : mab_custom_prediction_rec_bandits_v2
PRED_MAB_IMAGE_URI  : gcr.io/hybrid-vertex/mab_custom_prediction_rec_bandits_v2
DOCKERNAME_MAB_PRED : Dockerfile_predict_mab_e2e


## Create local directory for serving application

In [19]:
# %mkdir $SERVING_APP_DIR
# %%writefile $SERVING_APP_DIR/...

In [47]:
%%writefile requirements.txt
numpy
six
typing-extensions
pillow
tf-agents==0.17.0
tensorflow==2.13.0

Overwriting requirements.txt


### Write Dockerfile

Note: Note: leave the server directory `app`.

In [21]:
dockerfile = f'''
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10

COPY src/prediction /app
COPY requirements.txt /app/requirements.txt

RUN pip3 install -r /app/requirements.txt
'''
print(dockerfile)


FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10

COPY src/prediction /app
COPY requirements.txt /app/requirements.txt

RUN pip3 install -r /app/requirements.txt



In [22]:
with open(f'{DOCKERNAME_MAB_PRED}', 'w') as f:
    f.write(dockerfile)

### write new YAML

> Note: env==ARTIFACT_DIR

In [26]:
# image definitions for training
MACHINE_TYPE            ='e2-highcpu-32'
FILE_LOCATION           = "." # './src'

DOCKERNAME            = DOCKERNAME_MAB_PRED
IMAGE_URI             = PRED_MAB_IMAGE_URI

# TODO - currently manual from step-by-step notebook
ARTIFACTS_DIR = 'gs://rec-bandits-v2-hybrid-vertex-bucket/sxs-rl-rec-bandits-v2/run-20231019-041358/artifacts'

print(f"DOCKERNAME    : {DOCKERNAME}")
print(f"IMAGE_URI     : {IMAGE_URI}")
print(f"FILE_LOCATION : {FILE_LOCATION}")
print(f"MACHINE_TYPE  : {MACHINE_TYPE}")
print(f"ARTIFACTS_DIR : {ARTIFACTS_DIR}")

DOCKERNAME    : Dockerfile_predict_mab_e2e
IMAGE_URI     : gcr.io/hybrid-vertex/mab_custom_prediction_rec_bandits_v2
FILE_LOCATION : .
MACHINE_TYPE  : e2-highcpu-32
ARTIFACTS_DIR : gs://rec-bandits-v2-hybrid-vertex-bucket/sxs-rl-rec-bandits-v2/run-20231019-041358/artifacts


### Write new CloudBuild with `env` variable specified...

In [44]:
CLOUD_BUILD_CONFIG = f'''
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/$_DOCKERNAME']
  env: ['AIP_STORAGE_URI={ARTIFACTS_DIR}']
images:
- '$_IMAGE_URI'
'''
print(CLOUD_BUILD_CONFIG)


steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/$_DOCKERNAME']
  env: ['AIP_STORAGE_URI=gs://rec-bandits-v2-hybrid-vertex-bucket/sxs-rl-rec-bandits-v2/run-20231019-041358/artifacts']
images:
- '$_IMAGE_URI'



In [45]:
CLOUD_BUILD_YAML = 'cloudbuild.yaml'

with open(f'{CLOUD_BUILD_YAML}', 'w') as f:
    f.write(CLOUD_BUILD_CONFIG)

### run in notebook terminal to contnue in-notebook

In [46]:
CLOUD_BUILD_CMD = f'''gcloud builds submit --config ./{CLOUD_BUILD_YAML} \
--substitutions _DOCKERNAME={DOCKERNAME},_IMAGE_URI={IMAGE_URI},_FILE_LOCATION={FILE_LOCATION} \
--timeout=2h \
--machine-type={MACHINE_TYPE} \
--quiet
'''
print(CLOUD_BUILD_CMD)

gcloud builds submit --config ./cloudbuild.yaml --substitutions _DOCKERNAME=Dockerfile_predict_mab_e2e,_IMAGE_URI=gcr.io/hybrid-vertex/mab_custom_prediction_rec_bandits_v2,_FILE_LOCATION=. --timeout=2h --machine-type=e2-highcpu-32 --quiet



In [None]:
# ! gcloud builds submit --config ./cloudbuild.yaml \
#     --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION,_ARTIFACTS_DIR=ARTIFACTS_DIR \
#     --timeout=2h \
#     --machine-type=$MACHINE_TYPE \
#     --quiet

# archive

In [None]:
# gcloud builds submit --config=gcp/cloudbuild-main.yaml --substitutions=_CLIENT="client",_BRANCH="branch",_TAG="tag" .

In [None]:
# gcloud builds submit --config=gcp/cloudbuild-main.yaml --substitutions _CLIENT='client',_BRANCH='branch',_TAG='tag' .