# Build custom images for GPI pipelines

In [1]:
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NAME      = "training_dataset"

REPOSITORY               = "rl-movielens-rec-bandits-v2"

DOCKERNAM

In [3]:
POLICY_PIPE_IMAGE = f"gcr.io/{PROJECT_ID}/mv-gpi-pipeline"
DOCKERNAME_GPI_PIPE  = "Dockerfile_gpi_pipe"

POLICY_TRAIN_IMAGE = f"gcr.io/{PROJECT_ID}/mv-gpi-train"
DOCKERNAME_GPI_TRAIN  = "Dockerfile_gpi_train"

In [4]:
import os

root_path = '..'
os.chdir(root_path)
os.getcwd()

'/home/jupyter/tf_vertex_agents'

## GPI Pipeline base image

In [5]:
TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310:m111'
RUN_EXPORT = "RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/"

In [6]:
dockerfile = f'''
FROM {TRAIN_BASE_IMAGE}

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt
RUN pip install kfp==2.3.0

RUN ls $APP_HOME

COPY src/networks $APP_HOME/src/networks
COPY src/agents $APP_HOME/src/agents
COPY src/data $APP_HOME/src/data
COPY src/data_preprocessor $APP_HOME/src/data_preprocessor
COPY src/policy_pipeline $APP_HOME/src/policy_pipeline
COPY src/utils $APP_HOME/src/utils
COPY src/trainer $APP_HOME/src/trainer

RUN ls $APP_HOME

{RUN_EXPORT}

RUN pip freeze
'''
print(dockerfile)


FROM gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310:m111

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt
RUN pip install kfp==2.3.0

RUN ls $APP_HOME

COPY src/networks $APP_HOME/src/networks
COPY src/agents $APP_HOME/src/agents
COPY src/data $APP_HOME/src/data
COPY src/data_preprocessor $APP_HOME/src/data_preprocessor
COPY src/policy_pipeline $APP_HOME/src/policy_pipeline
COPY src/utils $APP_HOME/src/utils
COPY src/trainer $APP_HOME/src/trainer

RUN ls $APP_HOME

RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

RUN pip freeze



In [7]:
with open(f'{DOCKERNAME_GPI_PIPE}', 'w') as f:
    f.write(dockerfile)

In [8]:
# image definitions for training
MACHINE_TYPE            ='e2-highcpu-32'
FILE_LOCATION           = "." # './src'

print(f"DOCKERNAME_GPI_PIPE : {DOCKERNAME_GPI_PIPE}")
print(f"POLICY_PIPE_IMAGE   : {POLICY_PIPE_IMAGE}")
print(f"FILE_LOCATION       : {FILE_LOCATION}")
print(f"MACHINE_TYPE        : {MACHINE_TYPE}")

DOCKERNAME_GPI_PIPE : Dockerfile_gpi_pipe
POLICY_PIPE_IMAGE   : gcr.io/hybrid-vertex/mv-gpi-pipeline
FILE_LOCATION       : .
MACHINE_TYPE        : e2-highcpu-32


In [9]:
! gcloud builds submit -q --config ./cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME_GPI_PIPE,_IMAGE_URI=$POLICY_PIPE_IMAGE,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

print("build complete")

Creating temporary tarball archive of 85 file(s) totalling 2.8 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1712237427.225585-bdd8c08332254c2dab6289867deb5438.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/530496f8-2679-40d7-bd25-edf4fc28d1e0].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/530496f8-2679-40d7-bd25-edf4fc28d1e0?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "530496f8-2679-40d7-bd25-edf4fc28d1e0"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1712237427.225585-bdd8c08332254c2dab6289867deb5438.tgz#1712237428328345
Copying gs://hybrid-vertex_cloudbuild/source/1712237427.225585-bdd8c08332254c2dab6289867deb5438.tgz#1712237428328345...
/ [1 files][785.9 KiB/785.9 KiB]                                                
Operation completed over 1 objects/785.

## GPI train image

In [11]:
os.getcwd()

'/home/jupyter/tf_vertex_agents'

In [15]:
TRAIN_BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310' # works m111
NVTOP_RUN = 'RUN apt update && apt -y install nvtop'
RUN_EXPORT = "RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/"
    
print(f"TRAIN_BASE_IMAGE : {TRAIN_BASE_IMAGE}")
print(f"NVTOP_RUN        : {NVTOP_RUN}")
print(f"RUN_EXPORT       : {RUN_EXPORT}")

TRAIN_BASE_IMAGE : gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310
NVTOP_RUN        : RUN apt update && apt -y install nvtop
RUN_EXPORT       : RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/


In [16]:
dockerfile = f'''
FROM {TRAIN_BASE_IMAGE}

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

RUN ls $APP_HOME

COPY src/data $APP_HOME/src/data
COPY src/utils $APP_HOME/src/utils
COPY src/agents $APP_HOME/src/agents
COPY src/trainer $APP_HOME/src/trainer
COPY src/networks $APP_HOME/src/networks

{NVTOP_RUN}

RUN ls $APP_HOME

{RUN_EXPORT}

RUN pip freeze

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.trainer.agent_update_task"]
'''
print(dockerfile)


FROM gcr.io/deeplearning-platform-release/tf-gpu.2-13.py310

ENV PYTHONUNBUFFERED True

ENV APP_HOME /workspace

WORKDIR $APP_HOME

COPY /requirements.txt $APP_HOME/requirements.txt

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r $APP_HOME/requirements.txt

RUN ls $APP_HOME

COPY src/data $APP_HOME/src/data
COPY src/utils $APP_HOME/src/utils
COPY src/agents $APP_HOME/src/agents
COPY src/trainer $APP_HOME/src/trainer
COPY src/networks $APP_HOME/src/networks

RUN apt update && apt -y install nvtop

RUN ls $APP_HOME

RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

RUN pip freeze

# Sets up the entry point to invoke the task.
ENTRYPOINT ["python3", "-m", "src.trainer.agent_update_task"]



In [17]:
with open(f'{DOCKERNAME_GPI_TRAIN}', 'w') as f:
    f.write(dockerfile)

In [18]:
! gcloud builds submit -q --config ./cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME_GPI_TRAIN,_IMAGE_URI=$POLICY_TRAIN_IMAGE,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 85 file(s) totalling 2.8 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1712235453.099396-9cbba084eb6848d68abd535ed9d37c9b.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/20e545e3-392e-4940-b456-0f969b6170a0].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/20e545e3-392e-4940-b456-0f969b6170a0?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "20e545e3-392e-4940-b456-0f969b6170a0"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1712235453.099396-9cbba084eb6848d68abd535ed9d37c9b.tgz#1712235454111114
Copying gs://hybrid-vertex_cloudbuild/source/1712235453.099396-9cbba084eb6848d68abd535ed9d37c9b.tgz#1712235454111114...
/ [1 files][785.8 KiB/785.8 KiB]                                                
Operation completed over 1 objects/785.