# Build custom container for matrix-factorization-based simulation environment in Vertex AI training

In [1]:
!pwd

/home/jupyter/tf_vertex_agents/01-offline-bandit-simulation


## Load env config

* use the prefix from `00-env-setup`

In [2]:
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [3]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NAME      = "training_dataset"

REPOSITORY               = "rl-movielens-rec-bandits-v2"

DOCKERNAM

In [4]:
# ! gsutil ls $BUCKET_URI

## imports

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Build Image

In [6]:
# !tree src

## Container Image Variables

In [7]:
print(f"DOCKERNAME_01          = {DOCKERNAME_01}")
print(f"REPOSITORY             = {REPOSITORY}")
print(f"IMAGE_NAME_01          = {IMAGE_NAME_01}")
print(f"REMOTE_IMAGE_NAME      = {REMOTE_IMAGE_NAME}")
print(f"IMAGE_URI_01           = {IMAGE_URI_01}")

DOCKERNAME_01          = Dockerfile_train_my_perarm_env
REPOSITORY             = rl-movielens-rec-bandits-v2
IMAGE_NAME_01          = train-my-perarm-env-v2
REMOTE_IMAGE_NAME      = us-central1-docker.pkg.dev/hybrid-vertex/rl-movielens-rec-bandits-v2/local_docker_tfa
IMAGE_URI_01           = gcr.io/hybrid-vertex/train-my-perarm-env-v2


## Create Artifact Repository

If you don't have an existing artifact repository, create one using the gcloud command below

In [8]:
# ! gcloud artifacts repositories create $REPOSITORY --repository-format=docker --location=$LOCATION

## Create Dockerfile

In [9]:
import os

root_path = '..'
os.chdir(root_path)
os.getcwd()

'/home/jupyter/tf_vertex_agents'

### Create train image

* see [example Dockerfile for GPU](https://github.com/GoogleCloudPlatform/cloudml-samples/blob/main/pytorch/containers/quickstart/mnist/Dockerfile-gpu) jobs in Vertex AI
* see deep learning container [example here](https://cloud.google.com/deep-learning-containers/docs/derivative-container), and here for [available DL containers](https://cloud.google.com/deep-learning-containers/docs/choosing-container#versions)

In [10]:
gpu_profiling = True # True | False

print(f"gpu_profiling : {gpu_profiling}")

gpu_profiling : True


In [11]:
if gpu_profiling:
    # TRAIN_BASE_IMAGE = 'tensorflow/tensorflow:2.13.0-gpu'
    TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310'
    NVTOP_RUN = 'RUN apt update && apt -y install nvtop'
    # NVTOP_RUN = 'RUN apt-get update && apt-get -y install nvtop'
else:
    TRAIN_BASE_IMAGE = 'python:3.10'
    NVTOP_RUN = None
    
RUN_EXPORT = "RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/"
    
print(f"TRAIN_BASE_IMAGE : {TRAIN_BASE_IMAGE}")
print(f"NVTOP_RUN        : {NVTOP_RUN}")
print(f"RUN_EXPORT       : {RUN_EXPORT}")

TRAIN_BASE_IMAGE : gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310
NVTOP_RUN        : RUN apt update && apt -y install nvtop
RUN_EXPORT       : RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/


In [12]:
dockerfile = f'''
FROM {TRAIN_BASE_IMAGE}

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt
RUN pip install cloudml-hypertune

{NVTOP_RUN}

COPY src/data $APP_HOME/src/data
COPY src/utils $APP_HOME/src/utils
COPY src/trainer $APP_HOME/src/trainer
COPY src/environments $APP_HOME/src/environments

{RUN_EXPORT}

RUN pip freeze | grep wrapt

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.trainer.train_env_task"]
'''
print(dockerfile)


FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt
RUN pip install cloudml-hypertune

RUN apt update && apt -y install nvtop

COPY src/policy_util.py $APP_HOME/src/policy_util.py
COPY src/train_utils.py $APP_HOME/src/train_utils.py
COPY src/environments $APP_HOME/src/environments
COPY src/trainer $APP_HOME/src/trainer
COPY src/data $APP_HOME/src/data

RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

RUN pip freeze | grep wrapt

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.trainer.train_env_task"]



In [13]:
with open(f'{DOCKERNAME_01}', 'w') as f:
    f.write(dockerfile)

## Build image with Cloud Build

Building images with Cloud Build is best practices
* images are centrally stored and better managed for robust CI/CD
* building images on local workbench instance can alter notebook image config (base image for notebooks vs train images are different)
* if building locally, consider using virutal environments

In [14]:
!pwd

/home/jupyter/tf_vertex_agents


#### set `.gcloudignore`
* to adjust this see the `gcloudignore` section at the end of `00-env-setup.ipynb` notebook

In [15]:
%%writefile .gcloudignore
.gcloudignore
WIP/*
imgs/*
learning/*
*.pkl
*.png
*.ipynb
.git
.github
.gitignore
.DS_Store
*.md
*.tfrecord
.ipynb_checkpoints/*
*cpython-37.pyc
**.cpython-310.pyc
*/__pycache__/*
# src/ranking/*
src/archive/*
04-pipelines/*
03-ranking/*
02-*/*
# src/pred/*
*/vertex_env/*
credentials.json
05-online-learning/*
src/cpr_dir/*
# src/local_model_dir/*
00-data-prep-eda/*
src/data/local_data/*

Overwriting .gcloudignore


In [16]:
# check eligble files
# !gcloud meta list-files-for-upload

### Submit container to Cloud Build

In [17]:
# image definitions for training
MACHINE_TYPE            ='e2-highcpu-32'
FILE_LOCATION           = "." # './src'

print(f"DOCKERNAME_01 : {DOCKERNAME_01}")
print(f"IMAGE_URI_01  : {IMAGE_URI_01}")
print(f"FILE_LOCATION : {FILE_LOCATION}")
print(f"MACHINE_TYPE  : {MACHINE_TYPE}")

DOCKERNAME_01 : Dockerfile_train_my_perarm_env
IMAGE_URI_01  : gcr.io/hybrid-vertex/train-my-perarm-env-v2
FILE_LOCATION : .
MACHINE_TYPE  : e2-highcpu-32


In [19]:
! gcloud builds submit --config ./cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME_01,_IMAGE_URI=$IMAGE_URI_01,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE \
    --quiet

**Finished**

## (Optional) Build Image Locally

Building images with Cloud Build is best practices
* images are centrally stored and better managed for robust CI/CD
* building images on local workbench instance can alter notebook image config (base image for notebooks vs train images are different)
* if building locally, consider using virutal environments

Provide a name for your dockerfile and make sure you are authenticated

In [26]:
# ! gcloud auth configure-docker $REGION-docker.pkg.dev --quiet

In [27]:
print("copy these commands into terminal:\n")
print(f"virtualenv vertex_env")
print(f"source vertex_env/bin/activate")

copy these commands into terminal:

virtualenv vertex_env
source vertex_env/bin/activate


In [28]:
# # set variables if running in terminal
print("copy these commands into terminal:\n")
print(f"export REMOTE_IMAGE_NAME={REMOTE_IMAGE_NAME}")
print(f"export DOCKERNAME={DOCKERNAME_01}")
print(f"docker build -t $REMOTE_IMAGE_NAME -f ./$DOCKERNAME .")

copy these commands into terminal:

export REMOTE_IMAGE_NAME=us-central1-docker.pkg.dev/hybrid-vertex/rl-movielens-rec-bandits-v2/local_docker_tfa
export DOCKERNAME=Dockerfile_train_my_perarm_env
docker build -t $REMOTE_IMAGE_NAME -f ./$DOCKERNAME .


In [None]:
# !docker build -t $REMOTE_IMAGE_NAME -f $DOCKERNAME .

### Push container to Registry

In [28]:
# ### push the container to registry

print("copy this command into terminal:\n")
print(f"docker push $REMOTE_IMAGE_NAME")

# !docker push $REMOTE_IMAGE_NAME

copy this command into terminal:

docker push $REMOTE_IMAGE_NAME


### GPU profiling

> enter these commands in the Vertex interactive terminal:

```bash
sudo apt update
sudo apt -y install nvtop
```

**Finished**