# Environment Setup for training TF-Agents

## Install Packages
Run `pip install -r requirements.txt` in either (1) the notebook cell below or (2) a notebook terminal window

In [None]:
# !pwd

In [None]:
import os

# Environment probes for the Vertex AI Workbench Notebook product, which has
# specific install requirements.
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# On a Workbench notebook, dependencies must be installed with '--user';
# everywhere else no extra pip flag is used.
USER_FLAG = "--user" if IS_WORKBENCH_NOTEBOOK else ""

# !pip install --no-cache-dir -r ./requirements.txt --user -q

## Set vars

#### CREATE_NEW_ASSETS

* `True` creates new GCS buckets and BQ tables, etc.
* `False` skips these steps (in case you need to re-run notebook to include new variables you create)

In [None]:
# create new BQ datasets, tables, etc.?
# Toggle: True creates new cloud assets (bucket, BQ dataset); False skips those cells.
CREATE_NEW_ASSETS = True

In [None]:
# Naming convention shared by all cloud resources: names below are built from PREFIX.
VERSION = "v2"  # TODO
PREFIX = "rec-bandits-" + VERSION  # TODO

print("PREFIX =", PREFIX)

### GCP project

In [None]:
# Resolve the GCP project, its project number, a service account address,
# the VPC network name, and the locations used for cloud resources.
# creds, PROJECT_ID = google.auth.default()
# `!cmd` capture returns the command's output as a list of lines; the first line is used.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# PROJECT_NUM first holds the captured output list, then is rebound to its first line (a string).
PROJECT_NUM              = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM              = PROJECT_NUM[0]

# Service account address built from the project number
# NOTE(review): this looks like the Compute Engine default service account format — confirm.
VERTEX_SA                = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com'

# NOTE(review): hard-coded network name; presumably this VPC must already exist in the project — confirm.
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

# locations / regions for cloud resources
LOCATION                 = 'us-central1'        
REGION                   = LOCATION
BQ_LOCATION              = 'US'

print(f"PROJECT_ID            = {PROJECT_ID}")
print(f"PROJECT_NUM           = {PROJECT_NUM}")
print(f"VPC_NETWORK_NAME      = {VPC_NETWORK_NAME}")
print(f"LOCATION              = {LOCATION}")
print(f"REGION                = {REGION}")
print(f"BQ_LOCATION           = {BQ_LOCATION}")

### Define Cloud Resource Names and Args

You shouldn't need to change the variable names below. We are going to save them to a config file we can call across different notebooks and environments; the goal is to ease tracking and managing these many variables across the repo!

In [None]:
# GCS bucket name and URI, both derived from PREFIX and PROJECT_ID
BUCKET_NAME = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI = f'gs://{BUCKET_NAME}'

# Base BigQuery dataset name: take the lower-cased bucket name, drop the project ID
# and the literal "bucket", turn dashes into underscores, collapse double
# underscores in a single pass, then trim trailing underscores.
BIGQUERY_DATASET_NAME = (
    BUCKET_NAME.lower()
    .replace(PROJECT_ID, '')
    .replace('bucket', '')
    .replace('-', '_')
    .replace('__', '_')
    .rstrip("_")
)

print("BUCKET_NAME           =", BUCKET_NAME)
print("BUCKET_URI            =", BUCKET_URI)
print("BIGQUERY_DATASET_NAME =", BIGQUERY_DATASET_NAME)

In [None]:
# Derived resource names used across the repo: GCS data paths, the full VPC
# network path, BigQuery dataset/table names, the container repository, and one
# Dockerfile / image name / image URI triple per training notebook.

# Location to write TF-Records for MovieLens 100K dataset
DATA_GCS_PREFIX          = "data"
DATA_PATH                = f"{BUCKET_URI}/{DATA_GCS_PREFIX}"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = 'vocab_dict.pkl'
DATA_PATH_KFP_DEMO       = f"{DATA_PATH}/kfp_demo_data/u.data"

VPC_NETWORK_FULL         = f"projects/{PROJECT_NUM}/global/networks/{VPC_NETWORK_NAME}"

# BigQuery parameters (used for the Generator, Ingester, Logger)
# The "mvlens_" prefix is added only when it is not already present, so
# re-running this cell does not produce "mvlens_mvlens_..." (the previous
# self-referential assignment grew the name on every execution).
# Caveat: a base name that already starts with "mvlens_" is left unchanged.
if not BIGQUERY_DATASET_NAME.startswith("mvlens_"):
    BIGQUERY_DATASET_NAME  = f"mvlens_{BIGQUERY_DATASET_NAME}"
BIGQUERY_TABLE_NAME        = "training_dataset"

# container registry
REPOSITORY                = f'rl-movielens-{PREFIX}'

# NOTE(review): the IMAGE_URI_* values below point at gcr.io (Container Registry),
# while REMOTE_IMAGE_NAME uses an Artifact Registry path — confirm this is intended.

# Custom Images - 01-baseline-perarm-bandit
DOCKERNAME_01             = "Dockerfile_train_my_perarm_env"
IMAGE_NAME_01             = f'train-my-perarm-env-{VERSION}'
IMAGE_URI_01              = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME_01}'

# Custom Images - 02-perarm-features-bandit
DOCKERNAME_02             = "Dockerfile_perarm_feats"
IMAGE_NAME_02             = f'train-perarm-feats-{VERSION}'
IMAGE_URI_02              = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME_02}'

# Custom Images - 03-ranking
DOCKERNAME_03             = "Dockerfile_ranking_bandit"
IMAGE_NAME_03             = f'train-rank-bandit-{VERSION}'
IMAGE_URI_03              = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME_03}'

# Custom Images - 04-pipelines
DOCKERNAME_04             = "Dockerfile_train_mab_e2e"
IMAGE_NAME_04             = f'train-mab-e2e-{VERSION}'
IMAGE_URI_04              = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME_04}'

DOCKERNAME_04_pred        = "Dockerfile_pred_mab_e2e"
IMAGE_NAME_04_pred        = f'pred-mab-e2e-{VERSION}'
IMAGE_URI_04_pred         = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME_04_pred}'

# docker (local build)
REMOTE_IMAGE_NAME         = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/local_docker_tfa"

# Echo every derived value so the resolved names are visible in the cell output
print(f"DATA_GCS_PREFIX      : {DATA_GCS_PREFIX}")
print(f"DATA_PATH            : {DATA_PATH}")
print(f"VOCAB_SUBDIR         : {VOCAB_SUBDIR}")
print(f"VOCAB_FILENAME       : {VOCAB_FILENAME}")
print(f"DATA_PATH_KFP_DEMO   : {DATA_PATH_KFP_DEMO}")

print(f"VPC_NETWORK_FULL     : {VPC_NETWORK_FULL}")

print(f"BIGQUERY_DATASET_NAME: {BIGQUERY_DATASET_NAME}")
print(f"BIGQUERY_TABLE_NAME  : {BIGQUERY_TABLE_NAME}")

print(f"REPOSITORY           : {REPOSITORY}")

print(f"DOCKERNAME_01        : {DOCKERNAME_01}")
print(f"IMAGE_NAME_01        : {IMAGE_NAME_01}")
print(f"IMAGE_URI_01         : {IMAGE_URI_01}")

print(f"DOCKERNAME_02        : {DOCKERNAME_02}")
print(f"IMAGE_NAME_02        : {IMAGE_NAME_02}")
print(f"IMAGE_URI_02         : {IMAGE_URI_02}")

print(f"DOCKERNAME_03        : {DOCKERNAME_03}")
print(f"IMAGE_NAME_03        : {IMAGE_NAME_03}")
print(f"IMAGE_URI_03         : {IMAGE_URI_03}")

print(f"DOCKERNAME_04        : {DOCKERNAME_04}")
print(f"IMAGE_NAME_04        : {IMAGE_NAME_04}")
print(f"IMAGE_URI_04         : {IMAGE_URI_04}")

print(f"DOCKERNAME_04_pred   : {DOCKERNAME_04_pred}")
print(f"IMAGE_NAME_04_pred   : {IMAGE_NAME_04_pred}")
print(f"IMAGE_URI_04_pred    : {IMAGE_URI_04_pred}")

print(f"REMOTE_IMAGE_NAME    : {REMOTE_IMAGE_NAME}")

In [None]:
if CREATE_NEW_ASSETS:
    # create new bucket
    ! gsutil mb -l $REGION $BUCKET_URI
    
    # give Service account IAM perms
    ! gsutil iam ch serviceAccount:{VERTEX_SA}:roles/storage.objects.get $BUCKET_URI
    ! gsutil iam ch serviceAccount:{VERTEX_SA}:roles/storage.objects.get $BUCKET_URI

## Repo structure

* these variables are used to structure the repo
* this means they are required for correctly building Dockerfiles, importing classes, etc.

In [None]:
# Directory names that define the repo layout
REPO_DOCKER_PATH_PREFIX = 'src'
RL_SUB_DIR = 'per_arm_rl'

print("REPO_DOCKER_PATH_PREFIX  :", REPO_DOCKER_PATH_PREFIX)
print("RL_SUB_DIR               :", RL_SUB_DIR)

## Save Notebook Configuration Data
Save these variables to a config file so you don't have to re-enter them in other notebooks.

In [None]:
# Render the variables defined in the cells above into a block of
# `NAME = "value"` lines (one per variable, values wrapped in double quotes),
# then print the rendered text for inspection.
config = f"""
PROJECT_ID               = \"{PROJECT_ID}\"
PROJECT_NUM              = \"{PROJECT_NUM}\"
LOCATION                 = \"{LOCATION}\"

REGION                   = \"{REGION}\"
BQ_LOCATION              = \"{BQ_LOCATION}\"
VPC_NETWORK_NAME         = \"{VPC_NETWORK_NAME}\"

VERTEX_SA                = \"{VERTEX_SA}\"

PREFIX                   = \"{PREFIX}\"
VERSION                  = \"{VERSION}\"

BUCKET_NAME              = \"{BUCKET_NAME}\"
BUCKET_URI               = \"{BUCKET_URI}\"
DATA_GCS_PREFIX          = \"{DATA_GCS_PREFIX}\"
DATA_PATH                = \"{DATA_PATH}\"
VOCAB_SUBDIR             = \"{VOCAB_SUBDIR}\"
VOCAB_FILENAME           = \"{VOCAB_FILENAME}\"
DATA_PATH_KFP_DEMO       = \"{DATA_PATH_KFP_DEMO}\"

VPC_NETWORK_FULL         = \"{VPC_NETWORK_FULL}\"

BIGQUERY_DATASET_NAME    = \"{BIGQUERY_DATASET_NAME}\"
BIGQUERY_TABLE_NAME      = \"{BIGQUERY_TABLE_NAME}\"

REPOSITORY               = \"{REPOSITORY}\"

DOCKERNAME_01            = \"{DOCKERNAME_01}\"
IMAGE_NAME_01            = \"{IMAGE_NAME_01}\"
IMAGE_URI_01             = \"{IMAGE_URI_01}\"

DOCKERNAME_02            = \"{DOCKERNAME_02}\"
IMAGE_NAME_02            = \"{IMAGE_NAME_02}\"
IMAGE_URI_02             = \"{IMAGE_URI_02}\"

DOCKERNAME_03            = \"{DOCKERNAME_03}\"
IMAGE_NAME_03            = \"{IMAGE_NAME_03}\"
IMAGE_URI_03             = \"{IMAGE_URI_03}\"

DOCKERNAME_04            = \"{DOCKERNAME_04}\"
IMAGE_NAME_04            = \"{IMAGE_NAME_04}\"
IMAGE_URI_04             = \"{IMAGE_URI_04}\"

DOCKERNAME_04_pred       = \"{DOCKERNAME_04_pred}\"
IMAGE_NAME_04_pred       = \"{IMAGE_NAME_04_pred}\"
IMAGE_URI_04_pred        = \"{IMAGE_URI_04_pred}\"

REMOTE_IMAGE_NAME        = \"{REMOTE_IMAGE_NAME}\"

REPO_DOCKER_PATH_PREFIX  = \"{REPO_DOCKER_PATH_PREFIX}\"
RL_SUB_DIR               = \"{RL_SUB_DIR}\"
"""
print(config)

In [None]:
# Pipe the rendered `config` text through the shell into `gsutil cp -`, which
# writes it to {BUCKET_URI}/config/notebook_env.py.
# NOTE(review): the text is wrapped in single quotes for the shell, so a value
# containing a single quote would presumably break this command — confirm.
!echo '{config}' | gsutil cp - {BUCKET_URI}/config/notebook_env.py

Copy your first dataset to your bucket

In [None]:
# SOURCE_URI = "gs://cloud-samples-data/vertex-ai/community-content/tf_agents_bandits_movie_recommendation_with_kfp_and_vertex_sdk/u.data"

# ! gsutil cp $SOURCE_URI $DATA_PATH_KFP_DEMO

In [None]:
# !gsutil ls $BUCKET_URI

# Create BigQuery dataset and tables

In [None]:
from google.cloud import bigquery

# BigQuery client bound to the notebook's project; no client location is passed.
bqclient = bigquery.Client(project=PROJECT_ID)

In [None]:
# Create the BigQuery dataset in BQ_LOCATION when new assets are requested.
# exists_ok=False is passed through unchanged to create_dataset.
if CREATE_NEW_ASSETS:
    dataset_spec = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    dataset_spec.location = BQ_LOCATION

    ds = bqclient.create_dataset(dataset=dataset_spec, exists_ok=False)
    print(ds.full_dataset_id)

# gitignore

In [None]:
%%writefile .gitignore
# Active patterns come first; the commented-out patterns at the bottom are currently disabled.
*.cpython-310.pyc
*checkpoint*
*.ipynb_checkpoints/*
*WIP*
*/archive/*
# .gcloudignore
# .git
# .github
# *__pycache__
# *cpython-37.pyc
# .gitignore
# .DS_Store

# gcloudignore

In [None]:
# Set the gcloud config property gcloudignore/enabled to true
! gcloud config set gcloudignore/enabled true

In [None]:
%%writefile .gcloudignore
# Patterns for files to leave out of gcloud uploads: notebooks, images, docs, caches, archives, local installs.
.gcloudignore
/WIP/
# /img/
*.pkl
*.png
*.ipynb
.git
.github
.ipynb_checkpoints/*
*/__pycache__/*
*cpython-37.pyc
**.cpython-310.pyc
/hptuning/*
/imgs/*
README.md
.gitignore
.DS_Store
*.tfrecord
src/archive/*
00-archived/*
learning/*
.ipynb_checkpoints/**
*.md
src_root/*
*.h
*.gsutil
.local/include/python3.10/*
*.local/lib/python3.10/site-packages/gslib/tests/*
*.local/include/python3.10/*
.local/lib/*

In [None]:
# Check eligible files: list the files gcloud would upload from the current directory.
# NOTE(review): presumably this reflects the .gcloudignore written above — confirm.
!gcloud meta list-files-for-upload

**Finished**