## create files to submit to Cloud Build

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
REGION = 'us-central1'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"REGION: {REGION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
REGION: us-central1


### define env variables and paths

In [21]:
# Keep the release tag as a string: as a float, a release such as 22.10 would
# be rendered as "22.1" when interpolated into the image name.
VERSION = "22.12"
REPO_NAME = "workbench"
FRAMEWORK = "pytorch"

# Image name and registry URI are derived from the framework and release tag.
MERLIN_IMAGE_NAME = f"merlin-{FRAMEWORK}-{VERSION}"
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{MERLIN_IMAGE_NAME}'

print(f"IMAGE_URI: {IMAGE_URI}")

IMAGE_URI: gcr.io/hybrid-vertex/merlin-pytorch-22.12


## write container files

In [7]:
# Optional debug aid: uncomment to confirm the working directory before writing files.
# !pwd

In [31]:
REPO_DOCKER_PATH_PREFIX = 'src'
CONTAINER_SUBDIR = 'merlin_container'

# ! rm -rf {REPO_DOCKER_PATH_PREFIX}
! mkdir {REPO_DOCKER_PATH_PREFIX}
! mkdir {REPO_DOCKER_PATH_PREFIX}/{CONTAINER_SUBDIR}

In [32]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

# Cloud Build config with a single docker build step.
# All values are user-defined substitutions supplied at submit time:
#   _IMAGE_URI      - tag applied to the built image and listed under `images`
#   _FILE_LOCATION  - directory used as the docker build context
#   _DOCKERNAME     - suffix selecting the Dockerfile (Dockerfile.<_DOCKERNAME>)
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']
# Images listed here are pushed by Cloud Build once the steps succeed.
images:
- '$_IMAGE_URI'

Writing src/cloudbuild.yaml


In [33]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{CONTAINER_SUBDIR}/jupyter_notebook_config.py
c.NotebookApp.ip = '*'
c.NotebookApp.token = ''
c.NotebookApp.password = ''
c.NotebookApp.open_browser = False
c.NotebookApp.port = 8080
c.NotebookApp.terminado_settings = {'shell_command': ['/bin/bash']}
c.NotebookApp.allow_origin_pat = (
'(^https://8080-dot-[0-9]+-dot-devshell\.appspot\.com$)|'
'(^https://colab\.research\.google\.com$)|'
'((https?://)?[0-9a-z]+-dot-(?:us|asia|europe|northamerica|southamerica)-?[0-9a-z]+\.notebooks\.googleusercontent.com)')
c.NotebookApp.allow_remote_access = True
c.NotebookApp.disable_check_xsrf = False

Writing src/merlin_container/jupyter_notebook_config.py


In [34]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{CONTAINER_SUBDIR}/requirements.txt
# Extra Python packages installed into the image by the Dockerfile's
# `pip install -r requirements.txt` step.
# Only google-cloud-aiplatform is pinned; every other entry resolves to
# whatever version pip selects at build time.
fastapi
transformers4rec[pytorch,nvtabular,dataloader]
gsutil
gcsfs
matplotlib
google-cloud-aiplatform==1.21.0
jupyterlab-git

Writing src/merlin_container/requirements.txt


In [35]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{FRAMEWORK}
FROM nvcr.io/nvidia/merlin/merlin-pytorch:22.12

# Register the Google Cloud SDK apt repository and its signing key, then install the SDK.
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && apt-get update -y && apt-get install google-cloud-sdk -y
EXPOSE 8080

# copy the dependencies file to the working directory
COPY merlin_container/requirements.txt .
# install dependencies
RUN apt-get install nodejs npm -y
RUN pip install -r requirements.txt

# The trailing slash makes the destination a directory, created if missing.
# Without it, a missing /root/.jupyter would be written as a regular file
# holding the config contents, and Jupyter would never load the settings.
COPY merlin_container/jupyter_notebook_config.py /root/.jupyter/

# `pwd` supplies both the token and the password passed to jupyter-lab; empty by default.
ENV pwd=""
ENTRYPOINT exec jupyter-lab --ip=0.0.0.0 --port=8080 --no-browser --allow-root --ServerApp.allow_origin="*" --NotebookApp.token="$pwd" --NotebookApp.password="$pwd"

Writing src/Dockerfile.pytorch


### optionally, include a `.gcloudignore` file

* limits the files submitted to Cloud Build
* see [gcloudignore](https://cloud.google.com/sdk/gcloud/reference/topic/gcloudignore) for details

In [36]:
# Set the gcloud config property `gcloudignore/enabled` to true so that
# .gcloudignore rules are applied to source uploads.
! gcloud config set gcloudignore/enabled true

Updated property [gcloudignore/enabled].


In [40]:
%%writefile .gcloudignore
# Paths left out of the source upload; one pattern per line.
.gcloudignore
/merlin_container_v2/
create-merlin-image.ipynb
Dockerfile
.git
.github
.ipynb_checkpoints/*

Overwriting .gcloudignore


In [41]:
# Preview the files gcloud would upload from the current directory, to check
# that the ignore patterns leave only the intended files.
!gcloud meta list-files-for-upload
# Optional: uncomment to compare against the full directory listing.
# !ls

README.md
src/cloudbuild.yaml
src/Dockerfile.pytorch
src/merlin_container/requirements.txt
src/merlin_container/jupyter_notebook_config.py


## submit to Cloud Build

In [42]:
# Build context passed to Cloud Build (relative to the upload root) and the
# machine type requested for the build.
FILE_LOCATION = "./" + REPO_DOCKER_PATH_PREFIX
MACHINE_TYPE = "e2-highcpu-32"

# Echo every value that the submit command below will use.
print(
    f"MERLIN_IMAGE_NAME : {MERLIN_IMAGE_NAME}",
    f"IMAGE_URI: {IMAGE_URI}",
    f"FILE_LOCATION: {FILE_LOCATION}",
    f"MACHINE_TYPE: {MACHINE_TYPE}",
    sep="\n",
)

MERLIN_IMAGE_NAME : merlin-pytorch-22.12
IMAGE_URI: gcr.io/hybrid-vertex/merlin-pytorch-22.12
FILE_LOCATION: ./src
MACHINE_TYPE: e2-highcpu-32


In [43]:
! gcloud builds submit --config $REPO_DOCKER_PATH_PREFIX/cloudbuild.yaml \
    --substitutions _DOCKERNAME=$FRAMEWORK,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 5 file(s) totalling 1.7 KiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1676585977.98018-f2f6210d3a004a3aae8b29ba8e35d31c.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/73983f02-0901-4f78-93ff-5436bedca1bc].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/73983f02-0901-4f78-93ff-5436bedca1bc?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "73983f02-0901-4f78-93ff-5436bedca1bc"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1676585977.98018-f2f6210d3a004a3aae8b29ba8e35d31c.tgz#1676585978210535
Copying gs://hybrid-vertex_cloudbuild/source/1676585977.98018-f2f6210d3a004a3aae8b29ba8e35d31c.tgz#1676585978210535...
/ [1 files][  1.4 KiB/  1.4 KiB]                                                
Operation completed over 1 objects/1.4 KiB.

## Creating Vertex Workbench Notebook Instance 

**Once the Cloud Build job completes, follow these instructions to create a Vertex Workbench instance using the Merlin PyTorch image**

* Go to Vertex Workbench console, select `NEW NOTEBOOK` then `Customize...`
* under `ENVIRONMENT` drop-down, select first option `custom container`
* A side panel will appear, locate the `IMAGE_URI` 
* Under `Machine Configuration`, choose compute resources. *Note: if attaching a GPU, you must select `Install NVIDIA GPU driver automatically for me` (which appears as a checkbox under the `GPU type` field).*
> * For an A100-powered instance, select the `a2-highgpu-1g` machine type (GPU type will auto-populate with default count=1)
* If planning to interact with Vertex Matching Engine from notebook instance (e.g., CRUD ops), the notebook instance must use the same vpc-network as the Matching Engine index endpoints 
