# Build custom container for Vertex training

In [1]:
!pwd

/home/jupyter/tf_vertex_agents/02-supervised-to-bandit-training


## Load env config

* use the prefix from `00-env-setup`

In [2]:
# PREFIX = 'mabv1'
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [3]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"
DATA_PATH_KFP_DEMO       = "gs://rec-bandits-v2-hybrid-vertex-bucket/data/kfp_demo_data/u.data"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NA

In [4]:
# ! gsutil ls $BUCKET_URI

## imports

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Build Image

In [6]:
# !tree src

## Container Image Variables

In [7]:
# DOCKERNAME = DOCKERNAME_02
# IMAGE_NAME = IMAGE_NAME_02
# IMAGE_URI = IMAGE_URI_02

print(f"DOCKERNAME_02     = {DOCKERNAME_02}")
print(f"REPOSITORY        = {REPOSITORY}")
print(f"IMAGE_NAME_02     = {IMAGE_NAME_02}")
print(f"REMOTE_IMAGE_NAME = {REMOTE_IMAGE_NAME}")
print(f"IMAGE_URI_02      = {IMAGE_URI_02}")

DOCKERNAME_02     = Dockerfile_perarm_feats
REPOSITORY        = rl-movielens-rec-bandits-v2
IMAGE_NAME_02     = train-perarm-feats-v2
REMOTE_IMAGE_NAME = us-central1-docker.pkg.dev/hybrid-vertex/rl-movielens-rec-bandits-v2/local_docker_tfa
IMAGE_URI_02      = gcr.io/hybrid-vertex/train-perarm-feats-v2


## Create Artifact Repository

If you don't have an existing artifact repository, create one using the gcloud command below

In [8]:
# ! gcloud artifacts repositories create $REPOSITORY --repository-format=docker --location=$LOCATION

## Create Dockerfile

In [9]:
import os

root_path = '..'
os.chdir(root_path)
os.getcwd()

'/home/jupyter/tf_vertex_agents'

## Create train image

* see [example Dockerfile for GPU](https://github.com/GoogleCloudPlatform/cloudml-samples/blob/main/pytorch/containers/quickstart/mnist/Dockerfile-gpu) jobs in Vertex AI
* see deep learning container [example here](https://cloud.google.com/deep-learning-containers/docs/derivative-container), and here for [available DL containers](https://cloud.google.com/deep-learning-containers/docs/choosing-container#versions)
* installing [NVIDIA driver](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.htmlhttps://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
* [Tensorflow GPU images and compatiblity](https://www.tensorflow.org/install/source#gpuhttps://www.tensorflow.org/install/source#gpu)
> `tensorflow 2.15` compatible with `CUDA 12.2` and `cuDNN 8.9`

### GPU

In [10]:
# !gcloud container images list --repository="us-docker.pkg.dev/deeplearning-platform-release/gcr.io"

In [11]:
gpu_profiling = True # True | False

print(f"gpu_profiling : {gpu_profiling}")

gpu_profiling : True


In [12]:
if gpu_profiling:
    # TRAIN_BASE_IMAGE = 'tensorflow/tensorflow:2.14.0-gpu'
    # TRAIN_BASE_IMAGE = 'tensorflow/tensorflow:2.15.0-gpu'
    TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310:m111' # works
    NVTOP_RUN = 'RUN apt update && apt -y install nvtop'
    # NVTOP_RUN = 'RUN apt-get update && apt-get -y install nvtop'
else:
    TRAIN_BASE_IMAGE = 'python:3.11'
    NVTOP_RUN = ""
    
RUN_EXPORT = "RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/"
    
print(f"TRAIN_BASE_IMAGE : {TRAIN_BASE_IMAGE}")
print(f"NVTOP_RUN        : {NVTOP_RUN}")
print(f"RUN_EXPORT       : {RUN_EXPORT}")

TRAIN_BASE_IMAGE : gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310:m111
NVTOP_RUN        : RUN apt update && apt -y install nvtop
RUN_EXPORT       : RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/


In [13]:
dockerfile = f'''
FROM {TRAIN_BASE_IMAGE}

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

RUN ls $APP_HOME

COPY src/perarm_features $APP_HOME/src/perarm_features
COPY src/per_arm_rl $APP_HOME/src/per_arm_rl

{NVTOP_RUN}

RUN ls $APP_HOME

{RUN_EXPORT}

RUN pip freeze

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.perarm_features.task"]
'''
print(dockerfile)


FROM gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310:m111

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

RUN ls $APP_HOME

COPY src/perarm_features $APP_HOME/src/perarm_features
COPY src/per_arm_rl $APP_HOME/src/per_arm_rl

RUN apt update && apt -y install nvtop

RUN ls $APP_HOME

RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

RUN pip freeze

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.perarm_features.task"]



## Write dockerfile

In [17]:
with open(f'{DOCKERNAME_02}', 'w') as f:
    f.write(dockerfile)

## Build image with Cloud Build

Building images with Cloud Build is best practices
* images are centrally stored and better managed for robust CI/CD
* building images on local workbench instance can alter notebook image config (base image for notebooks vs train images are different)
* if building locally, consider using virutal environments

#### set `.gcloudignore`
* to adjust this see the `gcloudignore` section at the end of `00-env-setup.ipynb` notebook

In [18]:
%%writefile .gcloudignore
.gcloudignore
WIP/*
imgs/*
learning/*
*.pkl
*.png
*.ipynb
.git
.github
.gitignore
.DS_Store
*.md
*.tfrecord
.ipynb_checkpoints/*
*cpython-37.pyc
**.cpython-310.pyc
*/__pycache__/*
src/ranking/*
src/archive/*
04-pipelines/*
03-ranking/*
01-baseline-perarm-bandit/*
src/pred/*
src/serve/*
Dockerfile_perarm_feats_tpu
Dockerfile_predict_mab_02
vertex_env/*
credentials.json
*/tmp_dir_v1/*
*/cpr_dir/*
*/local_model_dir/*

Overwriting .gcloudignore


In [19]:
# check eligble files
!gcloud meta list-files-for-upload

Dockerfile_perarm_feats
requirements.txt
Dockerfile_train_my_perarm_env
accelerated-bandits-script.py
pred_instances.json
Dockerfile_predict_mab_02e
cloudbuild.yaml
src/instances.json
src/per_arm_rl/utils_config.py
src/per_arm_rl/perarm_task.py
src/per_arm_rl/data_config_1m.py
src/per_arm_rl/__init__.py
src/per_arm_rl/my_per_arm_py_env.py
src/per_arm_rl/policy_util.py
src/per_arm_rl/train_utils.py
src/per_arm_rl/trainer_baseline.py
src/per_arm_rl/data_utils.py
src/per_arm_rl/data_config.py
src/perarm_features/train_perarm.py
src/perarm_features/reward_factory.py
src/perarm_features/emb_features.py
src/perarm_features/agent_factory.py
src/perarm_features/__init__.py
src/perarm_features/trainer_common.py
src/perarm_features/task.py
src/perarm_features/eval_perarm.py
src/perarm_features/ranking_bandit_policy.py
01-online-bandit-simulation/result.json


### Submit container to Cloud Build

In [20]:
# image definitions for training
MACHINE_TYPE            ='e2-highcpu-32'
FILE_LOCATION           = "." # './src'

print(f"DOCKERNAME_02 : {DOCKERNAME_02}")
print(f"IMAGE_URI_02  : {IMAGE_URI_02}")
print(f"FILE_LOCATION : {FILE_LOCATION}")
print(f"MACHINE_TYPE  : {MACHINE_TYPE}")

DOCKERNAME_02 : Dockerfile_perarm_feats
IMAGE_URI_02  : gcr.io/hybrid-vertex/train-perarm-feats-v2
FILE_LOCATION : .
MACHINE_TYPE  : e2-highcpu-32


In [21]:
! gcloud builds submit -q --config ./cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME_02,_IMAGE_URI=$IMAGE_URI_02,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 28 file(s) totalling 232.9 KiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1708396890.236905-a8eb80294e81446f868447ed5ec682d1.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/831c5c78-0827-4b14-afce-ba8bdd760ace].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/831c5c78-0827-4b14-afce-ba8bdd760ace?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "831c5c78-0827-4b14-afce-ba8bdd760ace"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1708396890.236905-a8eb80294e81446f868447ed5ec682d1.tgz#1708396890785199
Copying gs://hybrid-vertex_cloudbuild/source/1708396890.236905-a8eb80294e81446f868447ed5ec682d1.tgz#1708396890785199...
/ [1 files][ 56.8 KiB/ 56.8 KiB]                                                
Operation completed over 1 objects/56

### TPU

* Libtpu [versions](https://cloud.google.com/tpu/docs/supported-tpu-configurations#libtpu_versions)

In [None]:
# !gsutil ls gs://cloud-tpu-tpuvm-artifacts/tensorflow/tf-2.15.0
# !gsutil ls gs://cloud-tpu-tpuvm-artifacts/tensorflow/tf-2.14.0

In [None]:
# !gsutil ls gs://cloud-tpu-tpuvm-artifacts/libtpu/1.9.0

In [None]:
# dockerfile = f'''
# FROM python:3.11

# ENV PYTHONUNBUFFERED True

# RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.8.0/libtpu.so -O /lib/libtpu.so
# RUN chmod 777 /lib/libtpu.so

# RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-2.14.0/tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# RUN pip3 install tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# RUN rm tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# ENV APP_HOME /workspace

# WORKDIR $APP_HOME

# COPY /requirements.txt $APP_HOME/requirements.txt

# RUN pip install --upgrade pip
# RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

# RUN ls $APP_HOME

# COPY src/perarm_features $APP_HOME/src/perarm_features
# COPY src/per_arm_rl $APP_HOME/src/per_arm_rl

# RUN ls $APP_HOME

# {RUN_EXPORT}

# RUN pip freeze

# ENV TPU_NAME=local

# # Sets up the entry point to invoke the task.
# ENTRYPOINT ["python3", "-m", "src.perarm_features.task"]
# '''
# print(dockerfile)

### Stash

In [None]:
# TRAIN_BASE_IMAGE = 'us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-11:latest'
# docker pull tensorflow/tensorflow:2.13.0-gpu
# FROM nvidia/cuda:9.0-cudnn7-runtime
# us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-11:latest
# FROM nvidia/12.2.2-cudnn8-runtime-ubuntu22.04
# nvcr.io/nvidia/tensorflow:24.01-tf2-py3
# gcr.io/deeplearning-platform-release/tf2-gpu.2-5
# nvidia/cuda:11.8.0-devel-ubuntu22.04

In [None]:
    # TRAIN_BASE_IMAGE = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/tf-gpu.2-15.py310" # error: cuda/driver
    # TRAIN_BASE_IMAGE = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/tf2-gpu.2-13.py310"
    # TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-13.py310'
    # TRAIN_BASE_IMAGE = "nvidia/cuda:12.2.0-runtime-ubuntu22.04"   # error: cuda/driver
    # TRAIN_BASE_IMAGE = "nvidia/12.2.2-cudnn8-runtime-ubuntu22.04" # error: couldnt find cuda/driver
    # TRAIN_BASE_IMAGE = "nvcr.io/nvidia/tensorflow:24.01-tf2-py3" # error: cuda/driver
    # TRAIN_BASE_IMAGE = "nvcr.io/nvidia/tensorflow:24.01-tf2-py311" # error: doesnt exist
    # TRAIN_BASE_IMAGE = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/tf-gpu:latest"
    
    # TRAIN_BASE_IMAGE = 'nvcr.io/nvidia/tensorflow:24.01-tf2-py3'
    # TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-15.py310'

In [None]:
# # Install additional packages
# RUN apt-get -y update && \
#          apt-get -y upgrade && \
#          apt-get install -y python3-pip python3-dev
#
# RUN apt-get install -y git


#### 
# RUN apt-get update && \
#     apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \
#     rm -rf /var/lib/apt/lists/*

In [None]:
# dockerfile = f'''
# # FROM nvidia/cuda:11.8-ubuntu2004

# # # Update package lists
# # RUN apt update && apt install -y software-properties-common

# # # Add the Nvidia graphics repository
# # RUN apt-key add -p https://developer.nvidia.com/nvidia-linux-public-key-current.gpg
# # RUN add-apt-repository "deb https://developer.nvidia.com/coolstuff/cuda/11.8/ubuntu2004/x86_64/"

# # # Update package lists again
# # RUN apt update

# # # Install Nvidia drivers and required libraries
# # RUN apt install -y nvidia-driver-11.8 libcuda11-*-11.8 libnvidia-container-tools-1

# # # Add the cuda toolkit library path to LD_LIBRARY_PATH
# # RUN echo "/usr/local/cuda/11.1/lib64" >> /etc/ld.so.conf.d/nvidia.conf

# # # Reload the ldconfig cache
# # RUN ldconfig

# # FROM tensorflow/tensorflow:2.13.0-gpu

# # ENV PYTHONUNBUFFERED True

# # ENV APP_HOME /workspace

# # WORKDIR $APP_HOME

# # COPY /requirements.txt $APP_HOME/requirements.txt

# # RUN pip install --upgrade pip
# # RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

# # RUN ls $APP_HOME

# # COPY src/perarm_features $APP_HOME/src/perarm_features
# # COPY src/per_arm_rl $APP_HOME/src/per_arm_rl

# # RUN apt update && apt -y install nvtop

# # RUN ls $APP_HOME

# # {RUN_EXPORT}

# # RUN pip freeze

# # # Sets up the entry point to invoke the task.
# # ENTRYPOINT ["python3", "-m", "src.perarm_features.task"]
# # '''
# # print(dockerfile)

In [None]:
# FROM nvidia/cuda:11.1-ubuntu2004

# # Update package lists
# RUN apt update && apt install -y software-properties-common

# # Add the Nvidia graphics repository
# RUN apt-key add -p https://developer.nvidia.com/nvidia-linux-public-key-current.gpg
# RUN add-apt-repository "deb https://developer.nvidia.com/coolstuff/cuda/11.1/ubuntu2004/x86_64/"

# # Update package lists again
# RUN apt update

# # Install Nvidia drivers and required libraries
# RUN apt install -y nvidia-driver-11.1 libcuda11-*-11.1 libnvidia-container-tools-1

# # Add the cuda toolkit library path to LD_LIBRARY_PATH
# RUN echo "/usr/local/cuda/11.1/lib64" >> /etc/ld.so.conf.d/nvidia.conf

# # Reload the ldconfig cache
# RUN ldconfig

# # Your additional Dockerfile instructions go here
# FROM python:3.8-buster

# RUN apt update && apt -y install --no-install-recommends sudo

# WORKDIR /mendel-model

# COPY model_dev/mendel-model/requirements.txt /mendel-model

# RUN pip3 install -r /mendel-model/requirements.txt --ignore-installed

# COPY model_utils/mendel-utils-python /home/deps/mendel-utils-python
# RUN pip3 install -e /home/deps/mendel-utils-python

# COPY model_dev/mendel-model /mendel-model
# RUN pip3 install -e /mendel-model

# RUN curl -sSL https://sdk.cloud.google.com | bash
# ENV PATH $PATH:/root/google-cloud-sdk/bin

# ENTRYPOINT [ "python3", "/mendel-model/helper_scripts/model_training/train_ee.py", "-c", "/mendel-model/sample_configs/entity_extraction/clues-ee-training.json"]

In [77]:
# FROM nvidia/cuda:11.1-ubuntu2004

# # Update package lists
# RUN apt update && apt install -y software-properties-common

# # Add the Nvidia graphics repository
# RUN apt-key add -p https://developer.nvidia.com/nvidia-linux-public-key-current.gpg
# RUN add-apt-repository "deb https://developer.nvidia.com/coolstuff/cuda/11.1/ubuntu2004/x86_64/"

# # Update package lists again
# RUN apt update

# # Install Nvidia drivers and required libraries
# RUN apt install -y nvidia-driver-11.1 libcuda11-*-11.1 libnvidia-container-tools-1

# # Add the cuda toolkit library path to LD_LIBRARY_PATH
# RUN echo "/usr/local/cuda/11.1/lib64" >> /etc/ld.so.conf.d/nvidia.conf

# # Reload the ldconfig cache
# RUN ldconfig

# # Your additional Dockerfile instructions go here
# FROM python:3.8-buster

## (Optional) Build Image Locally

Building images with Cloud Build is best practices
* images are centrally stored and better managed for robust CI/CD
* building images on local workbench instance can alter notebook image config (base image for notebooks vs train images are different)
* if building locally, consider using virutal environments

Provide a name for your dockerfile and make sure you are authenticated

In [23]:
# ! gcloud auth configure-docker $REGION-docker.pkg.dev --quiet

In [24]:
print("copy these commands into terminal:\n")
print(f"virtualenv vertex_env")
print(f"source vertex_env/bin/activate")

copy these commands into terminal:

virtualenv vertex_env
source vertex_env/bin/activate


In [26]:
# # set variables if running in terminal
print("copy these commands into terminal:\n")
print(f"export REMOTE_IMAGE_NAME={REMOTE_IMAGE_NAME}")
print(f"export DOCKERNAME={DOCKERNAME}")
print(f"docker build -t $REMOTE_IMAGE_NAME -f ./$DOCKERNAME .")

copy these commands into terminal:

export REMOTE_IMAGE_NAME=us-central1-docker.pkg.dev/hybrid-vertex/rl-movielens-rec-bandits-v2/train-perarm-feats-v2
export DOCKERNAME=Dockerfile_perarm_feats
docker build -t $REMOTE_IMAGE_NAME -f ./$DOCKERNAME .


In [27]:
# !docker build -t $REMOTE_IMAGE_NAME -f $DOCKERNAME .

### Push container to Registry

In [28]:
# ### push the container to registry

print("copy this command into terminal:\n")
print(f"docker push $REMOTE_IMAGE_NAME")

# !docker push $REMOTE_IMAGE_NAME

copy this command into terminal:

docker push $REMOTE_IMAGE_NAME


### GPU profiling

> enter these commands in the Vertex interactive terminal:

```bash
sudo apt update
sudo apt -y install nvtop
```

**Finished**