# Train REINFORCE Agent with Vertex Training

In [1]:
# Naming seed for this notebook: PREFIX embeds VERSION and is used below to
# derive the staging bucket name. The TODO markers flag values to review per environment.
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [2]:
# staging GCS
# The `!` capture returns the command's output lines; the first line is used as the project id.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared config file from the bucket, echo it, then execute it so its
# assignments become notebook globals (this overwrites PROJECT_ID, PREFIX, BUCKET_* above).
# NOTE(review): exec() runs whatever text the gsutil command printed. Anyone with write
# access to config/notebook_env.py in this bucket can run code in this kernel, and a failed
# `gsutil cat` would hand exec() something other than the expected config. Confirm the
# bucket is trusted before running.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NAME      = "training_dataset"

REPOSITORY               = "rl-movielens-rec-bandits-v2"

DOCKERNAM

## Imports

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [4]:
import json
import time
import numpy as np
import pandas as pd
import pickle as pkl
from pprint import pprint
from datetime import datetime

# logging
import logging
logging.disable(logging.WARNING)

from google.cloud import aiplatform
from google.cloud import storage

storage_client = storage.Client(project=PROJECT_ID)
aiplatform.init(project=PROJECT_ID,location=REGION)

import sys
sys.path.append("..")

from src.utils import train_utils
from src.data import data_utils, data_config

In [5]:
# Bucket-relative data prefix, as defined in the project's data_config module.
EXAMPLE_GEN_GCS_PATH = data_config.EXAMPLE_GEN_GCS_PATH
# Full gs:// URI of that prefix inside the staging bucket.
GCS_DATA_PATH = f"{BUCKET_URI}/{EXAMPLE_GEN_GCS_PATH}"

print(f"GCS_DATA_PATH: {GCS_DATA_PATH}")

# Uncomment to list the objects under the data prefix.
# !gsutil ls $GCS_DATA_PATH

GCS_DATA_PATH: gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m


## Custom train image

> TODO: add to `notebook_env.py` config

In [6]:
# Custom training image for the REINFORCE agent job.
# The registry path is built from PROJECT_ID (set by the config cell at the top of the
# notebook) rather than a hard-coded project, so the image URI follows whichever
# project the notebook is configured for.
DOCKERNAME_06 = "Dockerfile_reinforce_recsys"
IMAGE_NAME_06 = "train-reinforce-agent-v1"
IMAGE_URI_06 = f"gcr.io/{PROJECT_ID}/{IMAGE_NAME_06}"

print(f"DOCKERNAME_06 = {DOCKERNAME_06}")
print(f"IMAGE_NAME_06 = {IMAGE_NAME_06}")
print(f"IMAGE_URI_06  = {IMAGE_URI_06}")
print(f"REPOSITORY    = {REPOSITORY}")

DOCKERNAME_06 = Dockerfile_reinforce_recsys
IMAGE_NAME_06 = train-reinforce-agent-v1
IMAGE_URI_06  = gcr.io/hybrid-vertex/train-reinforce-agent-v1
REPOSITORY    = rl-movielens-rec-bandits-v2


## Train compute

In [7]:
# Accelerator choice for the training workers; the next cell branches on this value.
# Note that str() turns a Python None into the string "None".
ACCELERATOR = "t4" # one of: "a100" | "t4" | "l4" | "tpu" | None
ACCELERATOR = str(ACCELERATOR)

print(f"ACCELERATOR: {ACCELERATOR}")

ACCELERATOR: t4


In [8]:
# Resolve the worker-pool compute settings for the selected ACCELERATOR.
#
# Settings that are the same for every accelerator choice are set once here;
# each branch below only sets what differs (the TPU branch overrides the
# reduction-server machine type).
REPLICA_COUNT = 1
DISTRIBUTE_STRATEGY = 'single'
REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

if ACCELERATOR == "a100":
    WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
    ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
    PER_MACHINE_ACCELERATOR_COUNT = 1
elif ACCELERATOR == 't4':
    # alternative machine type: 'n1-highcpu-16'
    WORKER_MACHINE_TYPE = 'n1-highmem-16'
    ACCELERATOR_TYPE = 'NVIDIA_TESLA_T4'
    PER_MACHINE_ACCELERATOR_COUNT = 1
elif ACCELERATOR == 'l4':
    WORKER_MACHINE_TYPE = "g2-standard-16"
    ACCELERATOR_TYPE = 'NVIDIA_L4'
    PER_MACHINE_ACCELERATOR_COUNT = 1
elif ACCELERATOR == 'tpu':
    WORKER_MACHINE_TYPE = "cloud-tpu"
    # NOTE(review): confirm 'TPU_v3' is the spelling expected downstream
    # (the Vertex AI AcceleratorType enum uses upper-case names).
    ACCELERATOR_TYPE = 'TPU_v3'
    PER_MACHINE_ACCELERATOR_COUNT = 8 # 8 | +32+ for TPU Pods
    REDUCTION_SERVER_MACHINE_TYPE = None
elif ACCELERATOR in ("None", "False"):
    # CPU-only training. The ACCELERATOR cell stringifies its value, so a Python
    # None arrives here as "None"; "False" is kept for backward compatibility.
    WORKER_MACHINE_TYPE = 'n2-highmem-32' # alternatives: 'n1-highmem-96' | 'n2-highmem-92'
    ACCELERATOR_TYPE = None
    PER_MACHINE_ACCELERATOR_COUNT = 0
else:
    # Fail here with a clear message instead of a NameError in the prints below.
    raise ValueError(
        f"Unsupported ACCELERATOR {ACCELERATOR!r}; "
        "expected one of 'a100', 't4', 'l4', 'tpu', 'None' or 'False'"
    )

print(f"WORKER_MACHINE_TYPE            : {WORKER_MACHINE_TYPE}")
print(f"REPLICA_COUNT                  : {REPLICA_COUNT}")
print(f"ACCELERATOR_TYPE               : {ACCELERATOR_TYPE}")
print(f"PER_MACHINE_ACCELERATOR_COUNT  : {PER_MACHINE_ACCELERATOR_COUNT}")
print(f"DISTRIBUTE_STRATEGY            : {DISTRIBUTE_STRATEGY}")
print(f"REDUCTION_SERVER_COUNT         : {REDUCTION_SERVER_COUNT}")
print(f"REDUCTION_SERVER_MACHINE_TYPE  : {REDUCTION_SERVER_MACHINE_TYPE}")

WORKER_MACHINE_TYPE            : n1-highmem-16
REPLICA_COUNT                  : 1
ACCELERATOR_TYPE               : NVIDIA_TESLA_T4
PER_MACHINE_ACCELERATOR_COUNT  : 1
DISTRIBUTE_STRATEGY            : single
REDUCTION_SERVER_COUNT         : 0
REDUCTION_SERVER_MACHINE_TYPE  : n1-highcpu-16


## set Vertex AI Experiment

**references**
* [Using Vertex TensorBoard with Experiments](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-setup#use_default_instance)
* `aiplatform.init()` [src](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/initializer.py#L163)

In [27]:
# Version suffix that the next cell appends to the experiment name.
EXP_VERSION = "v12"

# Optional label for this run. When left as None, the next cell names the run
# after the launch timestamp instead.
RUN_TAG = None
# Example tags:
# RUN_TAG = "rnn-mode"
# RUN_TAG = "reinforce-mode"
# RUN_TAG = "topk-reinforce-mode-256"
# RUN_TAG = "gamma-games-v3"
# RUN_TAG = "topk8"

In [28]:
EXPERIMENT_NAME   = f'06d-rfa-{EXP_VERSION}'
invoke_time       = time.strftime("%Y%m%d-%H%M%S")

# A tagged run gets a fixed, readable name; otherwise the launch timestamp is used.
RUN_NAME          = f'run-{RUN_TAG}' if RUN_TAG else f'run-{invoke_time}'

# GCS URIs are always "/"-separated, so they are built explicitly rather than with
# os.path.join (which uses the local OS separator).
# Checkpoints live at the experiment level; everything else is per run.
EXPERIMENT_DIR    = f"{BUCKET_URI}/{EXPERIMENT_NAME}"
CHECKPT_DIR       = f"{EXPERIMENT_DIR}/chkpoint"
BASE_OUTPUT_DIR   = f"{EXPERIMENT_DIR}/{RUN_NAME}"
LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
ROOT_DIR          = f"{BASE_OUTPUT_DIR}/root"
ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"

# Re-initialise the SDK with the experiment attached and a backing TensorBoard requested.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    experiment=EXPERIMENT_NAME,
    experiment_tensorboard=True,
)

# The lookup can return None when the experiment has no backing TensorBoard;
# fail with an explicit message instead of an AttributeError on `.resource_name`.
tensorboard = aiplatform.Experiment(EXPERIMENT_NAME).get_backing_tensorboard_resource()
if tensorboard is None:
    raise RuntimeError(
        f"Experiment {EXPERIMENT_NAME!r} has no backing TensorBoard resource"
    )
TB_RESOURCE_NAME = tensorboard.resource_name

print(f"EXPERIMENT_NAME  : {EXPERIMENT_NAME}")
print(f"RUN_NAME         : {RUN_NAME}\n")
print(f"EXPERIMENT_DIR   : {EXPERIMENT_DIR}")
print(f"CHECKPT_DIR      : {CHECKPT_DIR}")
print(f"BASE_OUTPUT_DIR  : {BASE_OUTPUT_DIR}")
print(f"LOG_DIR          : {LOG_DIR}")
print(f"ROOT_DIR         : {ROOT_DIR}")
print(f"ARTIFACTS_DIR    : {ARTIFACTS_DIR}\n")
print(f"TB_RESOURCE_NAME : {TB_RESOURCE_NAME}")

EXPERIMENT_NAME  : 06d-rfa-v12
RUN_NAME         : run-20241213-212305

EXPERIMENT_DIR   : gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12
CHECKPT_DIR      : gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/chkpoint
BASE_OUTPUT_DIR  : gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305
LOG_DIR          : gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/logs
ROOT_DIR         : gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/root
ARTIFACTS_DIR    : gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/artifacts

TB_RESOURCE_NAME : projects/934903580331/locations/us-central1/tensorboards/257140585364717568


## Set training args

In [29]:
# ====================================================
# hyperparams
# ====================================================
BATCH_SIZE       = 512
EVAL_BATCH_SIZE  = 512
NUM_EVAL_BATCHES = 0 # num batches to take; 0 == take all dataset

# ====================================================
# agent & networks
# ====================================================
INPUT_EMBEDDING_SIZE = 100
INPUT_FC_LAYER_PARAMS = [100, 100]
LSTM_SIZE = [25,]
OUTPUT_FC_LAYER_PARAMS = [10,]

# k actions recommended by policy
POLICY_NUM_ACTIONS = 5

# number of actions with highest Q value
NUM_GREEDY_ACTIONS = 4

# num negative actions used to compute sampled_softmax loss
# if 0, regular softmax will be used instead of sampled_softmax
SAMPLED_SOFTMAX_NEGATIVES = 0

# number of actions to retrieve using SCANN in the policy
# if 0, all actions used to compute softmax
SCANN_NUM_CANDIDATES_ACTIONS = 0

# K in off policy correction to compute alpha (Section 4.3 of paper)
# if 0, no off-policy correction applied
OFF_POLICY_CORRECT_EXP = 12     # 0 | 16

# Trains policy with supervised loss (instead of "Off-Policy REINFORCE loss")
# useful for debugging, e.g. as a sanity check that the model can mimic the dataset behavior
SUPERVISED_LOSS_MAIN_POLICY = False

# discount factor for future rewards
# eval: 0.9 | 0.5 | 0.1 | 0
GAMMA = 0.9

# log agent & network variables and gradients
# impacts job performance
SUMMARIZE_GRADS_AND_VARS = False

# TODO: error with summary stats
DEBUG_SUMMARIES = False

# ====================================================
# performance
# ====================================================
USE_GPU = True
USE_TPU = False
TF_GPU_THREAD_COUNT = "1" # '1' | '4' | '8'
USE_TF_FUNCTIONS = True

# ====================================================
# train job
# ====================================================
NUM_ITERATIONS   = 100_000

# Intervals are fractions of the total iteration count. Floor division keeps the
# arithmetic in integers, and the clamp to >= 1 means a short smoke-test run
# (NUM_ITERATIONS below the divisor) never produces an interval of 0.
LOG_INTERVAL     = max(1, NUM_ITERATIONS // 100)   # 100 log lines per job
EVAL_INTERVAL    = max(1, NUM_ITERATIONS // 4)     # 4 evaluations per job
SUMMARY_INTERVAL = max(1, NUM_ITERATIONS // 100)   # 100 summary writes per job
CHKPT_INTERVAL   = max(1, NUM_ITERATIONS // 2)     # 2 checkpoints per job
LEARNING_RATE    = 1e-3

print(f"NUM_ITERATIONS   : {NUM_ITERATIONS}")
print(f"LOG_INTERVAL     : {LOG_INTERVAL}")
print(f"EVAL_INTERVAL    : {EVAL_INTERVAL}")
print(f"SUMMARY_INTERVAL : {SUMMARY_INTERVAL}")
print(f"CHKPT_INTERVAL   : {CHKPT_INTERVAL}\n")

NUM_ITERATIONS   : 100000
LOG_INTERVAL     : 1000
EVAL_INTERVAL    : 25000
SUMMARY_INTERVAL : 1000
CHKPT_INTERVAL   : 50000



### worker args

In [30]:
# Command-line arguments handed to the training container, grouped by concern.
WORKER_ARGS = [
    # project, storage and data locations
    f"--project={PROJECT_ID}",
    f"--project_number={PROJECT_NUM}",
    f"--bucket_name={BUCKET_NAME}",
    f"--artifacts_dir={ARTIFACTS_DIR}",
    f"--chkpoint_dir={CHECKPT_DIR}",
    f"--log_dir={LOG_DIR}",
    f"--data_dir_prefix_path={EXAMPLE_GEN_GCS_PATH}",
    f"--vocab_prefix_path={EXAMPLE_GEN_GCS_PATH}/{VOCAB_SUBDIR}",
    f"--vocab_filename={VOCAB_FILENAME}",

    # hparams
    f"--batch_size={BATCH_SIZE}",
    f"--eval_batch_size={EVAL_BATCH_SIZE}",
    f"--num_eval_batches={NUM_EVAL_BATCHES}",
    f"--num_iterations={NUM_ITERATIONS}",

    # agent and networks
    f"--input_embedding_size={INPUT_EMBEDDING_SIZE}",
    f"--input_fc_layer_params={INPUT_FC_LAYER_PARAMS}",
    f"--lstm_size={LSTM_SIZE}",
    f"--output_fc_layer_params={OUTPUT_FC_LAYER_PARAMS}",
    f"--policy_num_actions={POLICY_NUM_ACTIONS}",
    f"--num_greedy_actions={NUM_GREEDY_ACTIONS}",
    f"--sampled_softmax_num_negatives={SAMPLED_SOFTMAX_NEGATIVES}",
    f"--off_policy_correction_exponent={OFF_POLICY_CORRECT_EXP}",
    f"--scann_num_candidate_actions={SCANN_NUM_CANDIDATES_ACTIONS}",
    f"--gamma={GAMMA}",

    # train job
    f"--log_interval={LOG_INTERVAL}",
    f"--summary_interval={SUMMARY_INTERVAL}",
    f"--eval_interval={EVAL_INTERVAL}",
    f"--chkpt_interval={CHKPT_INTERVAL}",
    f"--learning_rate={LEARNING_RATE}",
    f"--tb_resource_name={TB_RESOURCE_NAME}",

    # experiment & runs
    f"--experiment_name={EXPERIMENT_NAME}",
    f"--experiment_run={RUN_NAME}",
    "--log_vertex_experiment",  # omit for False

    # performance
    f"--tf_gpu_thread_count={TF_GPU_THREAD_COUNT}",
]

# Switch-style flags: each is appended only when its toggle is truthy,
# in the order listed here.
WORKER_ARGS += [
    switch
    for switch, is_on in (
        ("--use_supervised_loss_for_main_policy", SUPERVISED_LOSS_MAIN_POLICY),
        ("--sum_grads_vars", SUMMARIZE_GRADS_AND_VARS),
        ("--debug_summaries", DEBUG_SUMMARIES),
        ("--use_gpu", USE_GPU),
        ("--use_tpu", USE_TPU),
        ("--use_tf_functions", USE_TF_FUNCTIONS),
    )
    if is_on
]

pprint(WORKER_ARGS)

['--project=hybrid-vertex',
 '--project_number=934903580331',
 '--bucket_name=rec-bandits-v2-hybrid-vertex-bucket',
 '--artifacts_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/artifacts',
 '--chkpoint_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/chkpoint',
 '--log_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/logs',
 '--data_dir_prefix_path=data/movielens/m1m',
 '--vocab_prefix_path=data/movielens/m1m/vocabs',
 '--vocab_filename=vocab_dict.pkl',
 '--batch_size=512',
 '--eval_batch_size=512',
 '--num_eval_batches=0',
 '--num_iterations=100000',
 '--input_embedding_size=100',
 '--input_fc_layer_params=[100, 100]',
 '--lstm_size=[25]',
 '--output_fc_layer_params=[10]',
 '--policy_num_actions=5',
 '--num_greedy_actions=4',
 '--sampled_softmax_num_negatives=0',
 '--off_policy_correction_exponent=12',
 '--scann_num_candidate_actions=0',
 '--gamma=0.9',
 '--log_interval=1000',
 '--summary_interval=1000',
 '--eval_interv

### workerpool specs

In [31]:
# Collect the container image, arguments and compute settings chosen above,
# then let the project helper turn them into the worker pool spec list.
_pool_spec_kwargs = dict(
    image_uri=f"{IMAGE_URI_06}:latest",
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)
WORKER_POOL_SPECS = train_utils.prepare_worker_pool_specs(**_pool_spec_kwargs)
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--project=hybrid-vertex',
                              '--project_number=934903580331',
                              '--bucket_name=rec-bandits-v2-hybrid-vertex-bucket',
                              '--artifacts_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/artifacts',
                              '--chkpoint_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/chkpoint',
                              '--log_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/logs',
                              '--data_dir_prefix_path=data/movielens/m1m',
                              '--vocab_prefix_path=data/movielens/m1m/vocabs',
                              '--vocab_filename=vocab_dict.pkl',
                              '--batch_size=512',
                              '--eval_batch_size=512',
                              '--num_eval_batches=0',
                              '--num_iterations=100000',

## Submit custom train job

In [32]:
# Display name for the training job: experiment name followed by run name.
JOB_NAME = f"{EXPERIMENT_NAME}-{RUN_NAME}"
print(f"JOB_NAME: {JOB_NAME}")

JOB_NAME: 06d-rfa-v12-run-20241213-212305


In [33]:
# Create a CustomJob
my_custom_job = aiplatform.CustomJob(
    display_name=JOB_NAME
    , project=PROJECT_ID
    , worker_pool_specs=WORKER_POOL_SPECS
    , base_output_dir=BASE_OUTPUT_DIR
    , staging_bucket=ROOT_DIR
)

In [34]:
# Launch the training job with the experiment's TensorBoard attached.
# `experiment` / `experiment_run` are left commented out: per the notes in the debugging
# section further down, 'experiment' and 'tensorboard' cannot be set together.
# sync=False is passed so the call is not expected to block until the job finishes.
my_custom_job.run(
    # experiment=EXPERIMENT_NAME,
    # experiment_run=RUN_NAME,
    tensorboard=TB_RESOURCE_NAME,
    service_account=VERTEX_SA,
    restart_job_on_worker_restart=False,
    enable_web_access=True,
    sync=False,
)

In [35]:
# The job was launched with sync=False, so the backend resource may not exist yet
# when this cell runs (e.g. under "Run All"). Block until it has been created so
# that resource_name and job_spec can be read safely.
my_custom_job.wait_for_resource_creation()

print(f"Job Name: {my_custom_job.display_name}")
print(f"Job Resource Name: {my_custom_job.resource_name}\n")

my_custom_job.job_spec

Job Name: 06d-rfa-v12-run-20241213-212305
Job Resource Name: projects/934903580331/locations/us-central1/customJobs/4407749941737291776



worker_pool_specs {
  container_spec {
    image_uri: "gcr.io/hybrid-vertex/train-reinforce-agent-v1:latest"
    args: "--project=hybrid-vertex"
    args: "--project_number=934903580331"
    args: "--bucket_name=rec-bandits-v2-hybrid-vertex-bucket"
    args: "--artifacts_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/artifacts"
    args: "--chkpoint_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/chkpoint"
    args: "--log_dir=gs://rec-bandits-v2-hybrid-vertex-bucket/06d-rfa-v12/run-20241213-212305/logs"
    args: "--data_dir_prefix_path=data/movielens/m1m"
    args: "--vocab_prefix_path=data/movielens/m1m/vocabs"
    args: "--vocab_filename=vocab_dict.pkl"
    args: "--batch_size=512"
    args: "--eval_batch_size=512"
    args: "--num_eval_batches=0"
    args: "--num_iterations=100000"
    args: "--input_embedding_size=100"
    args: "--input_fc_layer_params=[100, 100]"
    args: "--lstm_size=[25]"
    args: "--output_fc_layer_params=[10]"
    ar

#### tmp debugging - START

> Run training job with experiment tracking [code example](https://cloud.google.com/vertex-ai/docs/experiments/run-training-job-experiments#vertex_ai_experiment_training_job_manual_log-python_vertex_ai_sdk)

args for `.from_local_script()` [src](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/jobs.py#L1925)
* `script_path`: The path, relative to the working directory on your local file system, to the script that is the entry point for your training code.
* `experiment`
* `experiment_run`
* `experiment` and `tensorboard` cannot be set together

In [18]:
# Read the project's requirements file into a list, one entry per non-blank line
# (surrounding whitespace removed), then display it.
with open('../requirements.txt', 'r') as file:
    stripped_lines = map(str.strip, file)
    REQUIREMENT_LIST = list(filter(None, stripped_lines))
    pprint(REQUIREMENT_LIST)

['google-cloud-aiplatform==1.71.0',
 'google-cloud-storage',
 'numpy',
 'numba',
 'matplotlib',
 'tensorflow==2.13.0',
 'tf-agents==0.17.0',
 'tensorflow-recommenders==0.7.3',
 'tensorflow-probability==0.20.1',
 'tensorflow-datasets',
 'tensorboard',
 'tensorboard-plugin-profile',
 'tensorboard-plugin-wit',
 'tensorboard-data-server',
 'tensorflow-io',
 'protobuf==3.20.3']


In [19]:
# aiplatform.init(
#     project=PROJECT_ID,
#     location=REGION,
#     experiment=EXPERIMENT_NAME,
#     experiment_tensorboard=True,
# )

In [20]:
# # TODO: test from local script
# my_custom_job = aiplatform.CustomJob.from_local_script(
#     display_name=JOB_NAME,
#     project=PROJECT_ID,
#     location=REGION,
#     staging_bucket=ROOT_DIR,
#     base_output_dir=BASE_OUTPUT_DIR,
    
#     # train job
#     script_path="../src/trainer/train_topkop_rfa.py",
#     container_uri=f"{IMAGE_URI_06}:latest",
#     requirements=REQUIREMENT_LIST,
#     args=WORKER_ARGS,
#     # environment_variables=XXXXX, # Dict[str,str]
    
#     # compute
#     replica_count = REPLICA_COUNT,
#     machine_type = WORKER_MACHINE_TYPE,
#     accelerator_count = PER_MACHINE_ACCELERATOR_COUNT,
#     accelerator_type = ACCELERATOR_TYPE,
#     enable_autolog = True,
# )

In [44]:
# my_custom_job.run(
#     experiment=EXPERIMENT_NAME,
#     # experiment_run=RUN_NAME,
#     # tensorboard=TB_RESOURCE_NAME,
#     service_account=VERTEX_SA,
#     restart_job_on_worker_restart=False,
#     enable_web_access=True,
#     sync=False,
# )

In [None]:
# print(f"Job Name: {my_custom_job.display_name}")
# print(f"Job Resource Name: {my_custom_job.resource_name}\n")

In [None]:
# job.job_spec

```python
job = aiplatform.CustomJob.from_local_script(
    display_name="my-custom-job",
    script_path="training_script.py",
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
    requirements=["gcsfs==0.7.1"],
    replica_count=1,
    args=['--dataset', 'gs://my-bucket/my-dataset',
          '--model_output_uri', 'gs://my-bucket/model'],
    labels={'my_key': 'my_value'},
)
```

In [50]:
from google.cloud.aiplatform.tensorboard import uploader_constants

uploader_constants.ALLOWED_PLUGINS

frozenset({'distributions',
           'graphs',
           'histograms',
           'hparams',
           'images',
           'profile',
           'scalars',
           'text'})

In [53]:
# TB_RESOURCE_NAME
TB_ID = TB_RESOURCE_NAME.split('/')[-1]
TB_ID

'257140585364717568'

In [54]:
# tensorboard = aiplatform.Experiment(experiment_name).get_backing_tensorboard_resource()
tensorboard

<google.cloud.aiplatform.tensorboard.tensorboard_resource.Tensorboard object at 0x7f47a1e02860> 
resource name: projects/934903580331/locations/us-central1/tensorboards/257140585364717568

In [55]:
tensorboard.to_dict()

{'name': 'projects/934903580331/locations/us-central1/tensorboards/257140585364717568',
 'displayName': 'Default Tensorboard 2024-11-07 19:18:27.757496',
 'createTime': '2024-11-07T19:18:28.183945Z',
 'updateTime': '2024-11-07T19:18:28.544Z',
 'etag': 'AMEw9yNES55QPajzlS7SzpFa8LkKQ-QICxojIuNha6HILTGJrqde6Z7AsrjMrV8AvBAz',
 'blobStoragePathPrefix': 'cloud-ai-platform-5e3b4da5-183a-4327-a74b-c0d3d264301f',
 'isDefault': True}

#### tmp debugging - END

# Get link to Vertex AI Experiment console

In [None]:
# Handle to this run inside the experiment, looked up by RUN_NAME / EXPERIMENT_NAME.
# NOTE(review): presumably the run is created by the training job (it is launched with
# --log_vertex_experiment), so this lookup may fail until the job has started — confirm.
vertex_experiment_run = aiplatform.ExperimentRun(
    run_name=RUN_NAME, 
    experiment=EXPERIMENT_NAME, 
    project=PROJECT_ID,
    location=REGION,
)

vertex_experiment_run

In [None]:
# get summary metrics
vertex_experiment_run.get_metrics()

In [None]:
# get params
vertex_experiment_run.get_params()

In [None]:
vertex_experiment_run.get_time_series_data_frame()

In [None]:
# Fetch the experiment dataframe, keep only the rows belonging to this experiment,
# and display it transposed (one column per run).
experiment_df = (
    aiplatform.get_experiment_df()
    .loc[lambda runs: runs["experiment_name"] == EXPERIMENT_NAME]
)
experiment_df.T

## TensorBoard

In [None]:
# Train / eval sub-directories of the run's LOG_DIR.
# LOG_DIR is a gs:// URI, which is always "/"-separated, so the paths are built
# explicitly instead of with os.path.join (which uses the local OS separator).
TRAIN_LOG_DIR = f"{LOG_DIR}/train"
EVAL_LOG_DIR = f"{LOG_DIR}/eval"

print(f'TRAIN_LOG_DIR : {TRAIN_LOG_DIR}')
print(f'EVAL_LOG_DIR  : {EVAL_LOG_DIR}')

In [None]:
# %load_ext tensorboard
%reload_ext tensorboard

In [None]:
%tensorboard --logdir=$TRAIN_LOG_DIR

# Inference

> TODO: need to figure out saver first

**Finished**