# Build custom train image 

In [1]:
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NAME      = "training_dataset"

REPOSITORY               = "rl-movielens-rec-bandits-v2"

DOCKERNAM

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

## Container Image Variables

In [4]:
DOCKERNAME_06 = "Dockerfile_reinforce_recsys"
IMAGE_NAME_06 = "train-reinforce-agent-v1"
IMAGE_URI_06 = f"gcr.io/hybrid-vertex/{IMAGE_NAME_06}"

print(f"DOCKERNAME_06 = {DOCKERNAME_06}")
print(f"IMAGE_NAME_06 = {IMAGE_NAME_06}")
print(f"IMAGE_URI_06  = {IMAGE_URI_06}")
print(f"REPOSITORY    = {REPOSITORY}")

DOCKERNAME_06 = Dockerfile_reinforce_recsys
IMAGE_NAME_06 = train-reinforce-agent-v1
IMAGE_URI_06  = gcr.io/hybrid-vertex/train-reinforce-agent-v1
REPOSITORY    = rl-movielens-rec-bandits-v2


In [5]:
import os

root_path = '..'
os.chdir(root_path)
os.getcwd()

'/home/jupyter/tf_vertex_agents'

## Create train image

In [6]:
gpu_profiling = True # True | False

print(f"gpu_profiling : {gpu_profiling}")

gpu_profiling : True


In [7]:
if gpu_profiling:
    # TRAIN_BASE_IMAGE = 'tensorflow/tensorflow:2.14.0-gpu'
    # TRAIN_BASE_IMAGE = 'tensorflow/tensorflow:2.15.0-gpu'
    # TRAIN_BASE_IMAGE = "gcr.io/deeplearning-platform-release/tf-gpu.2-15"     # need to test
    TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310' # works m111
    NVTOP_RUN = 'RUN apt update && apt -y install nvtop'
    # NVTOP_RUN = 'RUN apt-get update && apt-get -y install nvtop'
else:
    TRAIN_BASE_IMAGE = 'python:3.10'
    NVTOP_RUN = ""
    
RUN_EXPORT = "RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/"
    
print(f"TRAIN_BASE_IMAGE : {TRAIN_BASE_IMAGE}")
print(f"NVTOP_RUN        : {NVTOP_RUN}")
print(f"RUN_EXPORT       : {RUN_EXPORT}")

TRAIN_BASE_IMAGE : gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310
NVTOP_RUN        : RUN apt update && apt -y install nvtop
RUN_EXPORT       : RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/


In [8]:
dockerfile = f'''
FROM {TRAIN_BASE_IMAGE}

ENV PYTHONUNBUFFERED True
ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

COPY src/data $APP_HOME/src/data
COPY src/utils $APP_HOME/src/utils
COPY src/agents $APP_HOME/src/agents
COPY src/trainer $APP_HOME/src/trainer
COPY src/networks $APP_HOME/src/networks
COPY src/data_preprocessor $APP_HOME/src/data_preprocessor

{NVTOP_RUN}

{RUN_EXPORT}

RUN pip freeze

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.trainer.train_topkop_rfa"]
'''
print(dockerfile)


FROM gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310

ENV PYTHONUNBUFFERED True
ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

COPY src/data $APP_HOME/src/data
COPY src/utils $APP_HOME/src/utils
COPY src/agents $APP_HOME/src/agents
COPY src/trainer $APP_HOME/src/trainer
COPY src/networks $APP_HOME/src/networks
COPY src/data_preprocessor $APP_HOME/src/data_preprocessor

RUN apt update && apt -y install nvtop

RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

RUN pip freeze

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.trainer.train_topkop_rfa"]



In [9]:
with open(f'{DOCKERNAME_06}', 'w') as f:
    f.write(dockerfile)

# Build image with Cloud Build

In [10]:
! pwd

/home/jupyter/tf_vertex_agents


### set `.gcloudignore`

In [15]:
# %%writefile .gcloudignore
# .gcloudignore
# WIP/*
# imgs/*
# learning/*
# *.pkl
# *.png
# *.ipynb
# .git
# .github
# .gitignore
# .DS_Store
# *.md
# *.tfrecord
# .ipynb_checkpoints/*
# *cpython-37.pyc
# **.cpython-310.pyc
# */__pycache__/*
# src/archive/*
# */vertex_env/*
# credentials.json
# src/data/local_data/*

In [14]:
# check eligble files

# ! gcloud meta list-files-for-upload

## Submit container to Cloud Build

In [10]:
# image definitions for training
MACHINE_TYPE            ='e2-highcpu-32'
FILE_LOCATION           = "." # './src'

print(f"DOCKERNAME_06 : {DOCKERNAME_06}")
print(f"IMAGE_URI_06  : {IMAGE_URI_06}")
print(f"FILE_LOCATION : {FILE_LOCATION}")
print(f"MACHINE_TYPE  : {MACHINE_TYPE}")

DOCKERNAME_06 : Dockerfile_reinforce_recsys
IMAGE_URI_06  : gcr.io/hybrid-vertex/train-reinforce-agent-v1
FILE_LOCATION : .
MACHINE_TYPE  : e2-highcpu-32


In [11]:
! gcloud builds submit -q --config ./cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME_06,_IMAGE_URI=$IMAGE_URI_06,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary archive of 66 file(s) totalling 3.7 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1733954031.582611-0d79e35e41ae4e1da33b8c9bff34c416.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/9e1f1614-9b5a-42e0-bcf0-ef85235dbadb].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/9e1f1614-9b5a-42e0-bcf0-ef85235dbadb?project=934903580331 ].
Waiting for build to complete. Polling interval: 1 second(s).
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "9e1f1614-9b5a-42e0-bcf0-ef85235dbadb"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1733954031.582611-0d79e35e41ae4e1da33b8c9bff34c416.tgz#1733954032892945
Copying gs://hybrid-vertex_cloudbuild/source/1733954031.582611-0d79e35e41ae4e1da33b8c9bff34c416.tgz#1733954032892945...
/ [1 files][654.2 KiB/654.2 KiB]                                  

**Finished**