# Build custom container for Vertex training

In [1]:
!pwd

/home/jupyter/tf_vertex_agents/01-baseline-perarm-bandit


## Load env config

* use the prefix from `00-env-setup`

In [2]:
# PREFIX = 'mabv1'
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [3]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid_vertex.movielens_ds_rec_bandits_v2"
BIGQUERY_TABLE_ID        = "hybrid_vertex.movielens_ds_rec_bandits_v2.training_dataset"

REPO

In [4]:
# ! gsutil ls $BUCKET_URI

## imports

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Build Image

In [6]:
# !tree src

## Container Image Variables

In [7]:
DOCKERNAME_TRAIN_MYENV = "Dockerfile_train_my_perarm_env"

In [8]:
print(f"DOCKERNAME_TRAIN_MYENV = {DOCKERNAME_TRAIN_MYENV}")
print(f"REPOSITORY             = {REPOSITORY}")
print(f"IMAGE_NAME             = {IMAGE_NAME}")
print(f"REMOTE_IMAGE_NAME      = {REMOTE_IMAGE_NAME}")
print(f"IMAGE_URI_01           = {IMAGE_URI_01}")

DOCKERNAME        = Dockerfile_train_my_perarm_env
REPOSITORY        = rl-movielens-rec-bandits-v2
IMAGE_NAME        = train-perarm-feats-v2
REMOTE_IMAGE_NAME = us-central1-docker.pkg.dev/hybrid-vertex/rl-movielens-rec-bandits-v2/train-perarm-feats-v2
IMAGE_URI_01      = gcr.io/hybrid-vertex/train-perarm-feats-v2-01


## Create Artifact Repository

If you don't have an existing artifact repository, create one using the gcloud command below

In [9]:
# ! gcloud artifacts repositories create $REPOSITORY --repository-format=docker --location=$LOCATION

## Create Dockerfile

In [10]:
import os

root_path = '..'
os.chdir(root_path)
os.getcwd()

'/home/jupyter/tf_vertex_agents'

### Create train image

* see [example Dockerfile for GPU](https://github.com/GoogleCloudPlatform/cloudml-samples/blob/main/pytorch/containers/quickstart/mnist/Dockerfile-gpu) jobs in Vertex AI
* see deep learning container [example here](https://cloud.google.com/deep-learning-containers/docs/derivative-container), and here for [available DL containers](https://cloud.google.com/deep-learning-containers/docs/choosing-container#versions)

In [11]:
gpu_profiling = True # True | False

print(f"gpu_profiling : {gpu_profiling}")

gpu_profiling : True


In [12]:
if gpu_profiling:
    # TRAIN_BASE_IMAGE = 'tensorflow/tensorflow:2.13.0-gpu'
    TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310'
    NVTOP_RUN = 'RUN apt update && apt -y install nvtop'
    # NVTOP_RUN = 'RUN apt-get update && apt-get -y install nvtop'
else:
    TRAIN_BASE_IMAGE = 'python:3.10'
    NVTOP_RUN = None
    
RUN_EXPORT = "RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/"
    
print(f"TRAIN_BASE_IMAGE : {TRAIN_BASE_IMAGE}")
print(f"NVTOP_RUN        : {NVTOP_RUN}")
print(f"RUN_EXPORT       : {RUN_EXPORT}")

TRAIN_BASE_IMAGE : gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310
NVTOP_RUN        : RUN apt update && apt -y install nvtop
RUN_EXPORT       : RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/


In [13]:
dockerfile = f'''
FROM {TRAIN_BASE_IMAGE}

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt
RUN pip install cloudml-hypertune

{NVTOP_RUN}

COPY src/per_arm_rl $APP_HOME/src/per_arm_rl

{RUN_EXPORT}

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.per_arm_rl.perarm_task"]
'''
print(dockerfile)


FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt
RUN pip install cloudml-hypertune

RUN apt update && apt -y install nvtop

COPY src/per_arm_rl $APP_HOME/src/per_arm_rl

RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.per_arm_rl.perarm_task"]



In [14]:
with open(f'{DOCKERNAME}', 'w') as f:
    f.write(dockerfile)

## Build image with Cloud Build

Building images with Cloud Build is best practices
* images are centrally stored and better managed for robust CI/CD
* building images on local workbench instance can alter notebook image config (base image for notebooks vs train images are different)
* if building locally, consider using virutal environments

### Files that will be included in Cloud Build image
* to adjust this see the `gcloudignore` section at the end of `00-env-setup.ipynb` notebook

In [15]:
# check eligble files
!gcloud meta list-files-for-upload

Dockerfile_perarm_feats
requirements.txt
Dockerfile_train_my_perarm_env
Dockerfile_perarm_feats_tpu
cloudbuild.yaml
src/ranking/ranking_trainer.py
src/ranking/stationary_stochastic_repeated_feature_py_environment.py
src/ranking/repeated_feature_perarm_network.py
src/ranking/jw_perarm_mv_env.py
src/ranking/jw_perarm_mv_env_tf.py
src/ranking/tmp_cascading_bandit_perarm_env.py
src/per_arm_rl/utils_config.py
src/per_arm_rl/perarm_task.py
src/per_arm_rl/__init__.py
src/per_arm_rl/my_per_arm_py_env.py
src/per_arm_rl/policy_util.py
src/per_arm_rl/train_utils.py
src/per_arm_rl/trainer_baseline.py
src/per_arm_rl/data_utils.py
src/per_arm_rl/data_config.py
src/perarm_features/train_perarm.py
src/perarm_features/reward_factory.py
src/perarm_features/emb_features.py
src/perarm_features/agent_factory.py
src/perarm_features/__init__.py
src/perarm_features/trainer_common.py
src/perarm_features/task.py
src/perarm_features/eval_perarm.py
src/perarm_features/ranking_bandit_policy.py
01-baseline-perarm-b

In [16]:
# %%writefile cloudbuild.yaml

# steps:
# - name: 'gcr.io/cloud-builders/docker'
#   args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/$_DOCKERNAME']
# images:
# - '$_IMAGE_URI'

In [17]:
# image definitions for training
MACHINE_TYPE            ='e2-highcpu-32'
FILE_LOCATION           = "." # './src'

DOCKERNAME     = DOCKERNAME_TRAIN_MYENV
IMAGE_URI      = IMAGE_URI_01

print(f"DOCKERNAME    : {DOCKERNAME}")
print(f"IMAGE_URI     : {IMAGE_URI}")
print(f"FILE_LOCATION : {FILE_LOCATION}")
print(f"MACHINE_TYPE  : {MACHINE_TYPE}")

print(f"DOCKERNAME        : {DOCKERNAME}")
print(f"IMAGE_URI      : {IMAGE_URI}")
print(f"FILE_LOCATION     : {FILE_LOCATION}")
print(f"MACHINE_TYPE      : {MACHINE_TYPE}")

DOCKERNAME        : Dockerfile_train_my_perarm_env
IMAGE_URI_01      : gcr.io/hybrid-vertex/train-perarm-feats-v2-01
FILE_LOCATION     : .
MACHINE_TYPE      : e2-highcpu-32


In [18]:
! gcloud builds submit --config ./cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE \
    --quiet

Creating temporary tarball archive of 30 file(s) totalling 280.8 KiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1697660082.826936-df3db4fcdf1a44d4806100dc518d1725.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/04489209-8dae-4157-a6fa-1a5f31283fec].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/04489209-8dae-4157-a6fa-1a5f31283fec?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "04489209-8dae-4157-a6fa-1a5f31283fec"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1697660082.826936-df3db4fcdf1a44d4806100dc518d1725.tgz#1697660083197134
Copying gs://hybrid-vertex_cloudbuild/source/1697660082.826936-df3db4fcdf1a44d4806100dc518d1725.tgz#1697660083197134...
/ [1 files][ 62.5 KiB/ 62.5 KiB]                                                
Operation completed over 1 objects/62

## (Optional) Build Image Locally

Building images with Cloud Build is best practices
* images are centrally stored and better managed for robust CI/CD
* building images on local workbench instance can alter notebook image config (base image for notebooks vs train images are different)
* if building locally, consider using virutal environments

Provide a name for your dockerfile and make sure you are authenticated

In [23]:
# ! gcloud auth configure-docker $REGION-docker.pkg.dev --quiet

In [24]:
print("copy these commands into terminal:\n")
print(f"virtualenv vertex_env")
print(f"source vertex_env/bin/activate")

copy these commands into terminal:

virtualenv vertex_env
source vertex_env/bin/activate


In [26]:
# # set variables if running in terminal
print("copy these commands into terminal:\n")
print(f"export REMOTE_IMAGE_NAME={REMOTE_IMAGE_NAME}")
print(f"export DOCKERNAME={DOCKERNAME}")
print(f"docker build -t $REMOTE_IMAGE_NAME -f ./$DOCKERNAME .")

copy these commands into terminal:

export REMOTE_IMAGE_NAME=us-central1-docker.pkg.dev/hybrid-vertex/rl-movielens-rec-bandits-v2/train-perarm-feats-v2
export DOCKERNAME=Dockerfile_perarm_feats
docker build -t $REMOTE_IMAGE_NAME -f ./$DOCKERNAME .


In [27]:
# !docker build -t $REMOTE_IMAGE_NAME -f $DOCKERNAME .

### Push container to Registry

In [28]:
# ### push the container to registry

print("copy this command into terminal:\n")
print(f"docker push $REMOTE_IMAGE_NAME")

# !docker push $REMOTE_IMAGE_NAME

copy this command into terminal:

docker push $REMOTE_IMAGE_NAME


### GPU profiling

> enter these commands in the Vertex interactive terminal:

```bash
sudo apt update
sudo apt -y install nvtop
```

**Finished**