# Using GPUs & TPUs with TF-Agents

## Load notebook config

* use the prefix defined in `00-env-setup`

In [1]:
PREFIX = 'mabv1'

In [2]:
# staging GCS
# The IPython `!` capture runs the shell command and returns its output lines;
# the first line is taken as the active gcloud project.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
# Derived from PREFIX + project so the shared config file can be located.
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared config script (presumably written by `00-env-setup` - confirm),
# print it for the record, then execute it so its assignments become notebook
# globals. The printed config re-assigns PROJECT_ID, PREFIX, BUCKET_NAME and
# BUCKET_URI, so the values computed above are overwritten by the exec.
# NOTE(review): exec() runs whatever is stored at this GCS path with full kernel
# privileges - only use with a bucket whose writers are trusted.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "mabv1"
VERSION                  = "v1"

BUCKET_NAME              = "mabv1-hybrid-vertex-bucket"
BUCKET_URI               = "gs://mabv1-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://mabv1-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid-vertex.movielens_dataset_mabv1"
BIGQUERY_TABLE_ID        = "hybrid-vertex.movielens_dataset_mabv1.training_dataset"

REPO_DOCKER_PATH_PREFIX  = "src"
RL_SUB_DIR     

### imports

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [4]:
import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar
from datetime import datetime
import time
from pprint import pprint
import pickle as pkl

# logging
import logging
logging.disable(logging.WARNING)

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

# tensorflow
import tensorflow as tf

from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.metrics import tf_metrics

from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.networks import global_and_arm_feature_network

from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.bandits.policies import policy_utilities
from tf_agents.trajectories import trajectory
from tf_agents.policies import policy_saver

from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.train.utils import spec_utils
from tf_agents.train.utils import strategy_utils
from tf_agents.train.utils import train_utils

# GPU
from numba import cuda 
import gc

import sys
# Make the repository root (one level up) importable so `src.*` resolves.
sys.path.append("..")

# this repo
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config
# NOTE(review): this rebinds `train_utils`, shadowing the earlier
# `from tf_agents.train.utils import train_utils`. Every later `train_utils.*`
# call in this notebook therefore resolves to the repo module, and the
# TF-Agents import is effectively dead - alias one of them if both are needed.
from src.per_arm_rl import train_utils as train_utils

# tf exceptions and vars
# Guard on the major version: compares only the first character of the version string.
if tf.__version__[0] != "2":
    raise Exception("The trainer only runs with TensorFlow version 2.")

# Generic type variable; no use of it is visible in this notebook - TODO confirm it is still needed.
T = TypeVar("T")

In [5]:
from src.perarm_features import agent_factory as agent_factory
from src.perarm_features import reward_factory as reward_factory

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [8]:
# Reset the current CUDA device through numba, then run a garbage-collection
# pass (the integer shown as the cell output is gc.collect()'s return value).
# NOTE(review): presumably meant to release GPU memory left over from a previous
# run - confirm it cannot invalidate a CUDA context TensorFlow already holds,
# since TensorFlow has been imported and queried for GPUs before this cell.
device = cuda.get_current_device()
device.reset()
gc.collect()

14

In [9]:
# GCS client bound to the notebook's project
storage_client = storage.Client(
    project=PROJECT_ID,
)

# Vertex AI SDK defaults (project + location) used by subsequent SDK calls
vertex_ai.init(
    project=PROJECT_ID,
    location=LOCATION,
)

### Generate Vocabs

In [10]:
GENERATE_VOCABS = False

print(f"GENERATE_VOCABS: {GENERATE_VOCABS}")

GENERATE_VOCABS: False


In [11]:
if not GENERATE_VOCABS:

    EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{VOCAB_SUBDIR}/{VOCAB_FILENAME}'
    print(f"Downloading vocab...")

    # os.system returns the command's exit status. Fail loudly on a non-zero
    # status so a failed copy cannot silently fall through to a stale local
    # pickle (or a confusing FileNotFoundError further down).
    exit_status = os.system(f'gsutil -q cp {EXISTING_VOCAB_FILE} .')
    if exit_status != 0:
        raise RuntimeError(
            f"gsutil could not copy {EXISTING_VOCAB_FILE} (exit status {exit_status})"
        )
    print(f"Downloaded vocab from: {EXISTING_VOCAB_FILE}\n")

    # The context manager closes the file even if unpickling raises.
    # NOTE(review): pickle.load can execute arbitrary code - only load vocab
    # files from a bucket whose writers are trusted.
    with open(VOCAB_FILENAME, 'rb') as filehandler:
        VOCAB_DICT = pkl.load(filehandler)

    # List the vocabulary names that are available to the embedding models.
    for key in VOCAB_DICT.keys():
        pprint(key)

Downloading vocab...
Downloaded vocab from: gs://mabv1-hybrid-vertex-bucket/vocabs/vocab_dict.pkl

'movie_id'
'user_id'
'user_occupation_text'
'movie_genres'
'bucketized_user_age'
'max_timestamp'
'min_timestamp'
'timestamp_buckets'


### train config

In [12]:
# Vocabulary / embedding sizes
NUM_OOV_BUCKETS = 1
GLOBAL_EMBEDDING_SIZE = 16
MV_EMBEDDING_SIZE = 32  # 32

# Sizes used for the 'global' and 'per_arm' observation specs
GLOBAL_DIM = 64
PER_ARM_DIM = 64

# Batching
BATCH_SIZE = 128
EVAL_BATCH_SIZE = 1

# NUM_ACTIONS is somewhat deceptive: the approach learns by "flashing" one
# movie rating at a time per user context, so the actions are show / don't
# show the movie, i.e. one degree of freedom (n - 1).
NUM_ACTIONS = 2

# Echo the settings as one aligned table (label column is 23 characters wide).
print("\n".join(
    f"{label:<23}: {setting}"
    for label, setting in (
        ("NUM_OOV_BUCKETS", NUM_OOV_BUCKETS),
        ("GLOBAL_EMBEDDING_SIZE", GLOBAL_EMBEDDING_SIZE),
        ("MV_EMBEDDING_SIZE", MV_EMBEDDING_SIZE),
        ("GLOBAL_DIM", GLOBAL_DIM),
        ("PER_ARM_DIM", PER_ARM_DIM),
        ("BATCH_SIZE", BATCH_SIZE),
        ("EVAL_BATCH_SIZE", EVAL_BATCH_SIZE),
        ("NUM_ACTIONS", NUM_ACTIONS),
    )
))

NUM_OOV_BUCKETS        : 1
GLOBAL_EMBEDDING_SIZE  : 16
MV_EMBEDDING_SIZE      : 32
GLOBAL_DIM             : 64
PER_ARM_DIM            : 64
BATCH_SIZE             : 128
EVAL_BATCH_SIZE        : 1
NUM_ACTIONS            : 2


#### tmp - debugging

In [13]:
# options = tf.data.Options()
# options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

SPLIT = "train" # "train" | "val"

# Collect every TFRecord object under <DATA_GCS_PREFIX>/<SPLIT> as a gs:// URI.
# The URI is built from the bucket and the raw object name instead of
# string-replacing `blob.public_url`: the public URL percent-encodes the object
# name, so names containing spaces or other special characters would produce a
# gs:// path that does not exist.
train_files = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in storage_client.list_blobs(
        f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/{SPLIT}'
    )
    if '.tfrecord' in blob.name
]

# Raw serialized records -> parsed feature dicts (parsing lives in data_utils).
train_dataset = tf.data.TFRecordDataset(train_files)
train_dataset = train_dataset.map(data_utils.parse_tfrecord)

In [14]:
# for i in range(1):
    
#     iterator = iter(train_dataset.batch(1))
#     data = next(iterator)

# data

In [15]:
# test_globals = agent_factory._get_global_context_features_fn(
#     vocab_dict = VOCAB_DICT, 
#     num_oov_buckets = NUM_OOV_BUCKETS, 
#     global_emb_size = GLOBAL_EMBEDDING_SIZE,
#     elements = data
# )

# test_globals

In [16]:
# test_arms = agent_factory._get_per_arm_features_fn(
#     vocab_dict = VOCAB_DICT, 
#     num_oov_buckets = NUM_OOV_BUCKETS, 
#     mv_emb_size = MV_EMBEDDING_SIZE,
#     elements = data
# )

# test_arms

In [17]:
# ====================================================
# get global_context_sampling_fn
# ====================================================
# Embedding models for the global context, keyed by the settings they were
# built from. Building them once (instead of on every call) avoids repeating
# loop-invariant work per batch and keeps the same model instances - and hence
# the same weights - across batches.
# NOTE(review): presumably each agent_factory.get_*_emb_model call returns a
# freshly initialised model - confirm against agent_factory.
_GLOBAL_CONTEXT_MODEL_CACHE = {}


def _global_context_emb_models():
    """Return (user_id, user_age, user_occ, timestamp) embedding models, built on first use.

    The cache is keyed on the identity of VOCAB_DICT plus the embedding
    settings, so re-loading the vocab or changing a setting in the notebook
    rebuilds the models instead of serving stale ones.
    """
    cache_key = (id(VOCAB_DICT), NUM_OOV_BUCKETS, GLOBAL_EMBEDDING_SIZE)
    if cache_key not in _GLOBAL_CONTEXT_MODEL_CACHE:
        # Drop models built for older settings so they can be garbage-collected.
        _GLOBAL_CONTEXT_MODEL_CACHE.clear()
        factory_kwargs = dict(
            vocab_dict=VOCAB_DICT,
            num_oov_buckets=NUM_OOV_BUCKETS,
            global_emb_size=GLOBAL_EMBEDDING_SIZE,
        )
        _GLOBAL_CONTEXT_MODEL_CACHE[cache_key] = (
            agent_factory.get_user_id_emb_model(**factory_kwargs),
            agent_factory.get_user_age_emb_model(**factory_kwargs),
            agent_factory.get_user_occ_emb_model(**factory_kwargs),
            agent_factory.get_ts_emb_model(**factory_kwargs),
        )
    return _GLOBAL_CONTEXT_MODEL_CACHE[cache_key]


def _get_global_context_features(x):
    """
    This function generates a single global observation vector.

    Args:
        x: mapping of feature name -> values; the keys 'user_id',
            'bucketized_user_age', 'user_occupation_text' and 'timestamp'
            are read.

    Returns:
        float32 numpy array: the user-id, age, occupation and timestamp
        embeddings concatenated (in that order) along the last axis.

    Note:
        Calls `.numpy()` on the model outputs, so it must run eagerly.
    """
    user_id_model, user_age_model, user_occ_model, user_ts_model = (
        _global_context_emb_models()
    )

    # Embed each feature and move it to numpy; the order fixes the layout of
    # the concatenated vector.
    embeddings = [
        user_id_model(x['user_id']).numpy(),
        user_age_model(x['bucketized_user_age']).numpy(),
        user_occ_model(x['user_occupation_text']).numpy(),
        user_ts_model(x['timestamp']).numpy(),
    ]

    return np.concatenate(embeddings, axis=-1).astype(np.float32)

In [18]:
# test_globals

In [19]:
# test_t = _get_global_context_features(data)
# test_t

In [20]:
# ====================================================
# get per_arm_context_sampling_fn
# ====================================================
# Embedding models for the per-arm (movie) features, keyed by the settings
# they were built from. Building them once (instead of on every call) avoids
# repeating loop-invariant work per batch and keeps the same model instances -
# and hence the same weights - across batches.
# NOTE(review): presumably each agent_factory.get_*_emb_model call returns a
# freshly initialised model - confirm against agent_factory.
_PER_ARM_MODEL_CACHE = {}


def _per_arm_emb_models():
    """Return (movie_id, movie_genres) embedding models, built on first use.

    The cache is keyed on the identity of VOCAB_DICT plus the embedding
    settings, so re-loading the vocab or changing a setting in the notebook
    rebuilds the models instead of serving stale ones.
    """
    cache_key = (id(VOCAB_DICT), NUM_OOV_BUCKETS, MV_EMBEDDING_SIZE)
    if cache_key not in _PER_ARM_MODEL_CACHE:
        # Drop models built for older settings so they can be garbage-collected.
        _PER_ARM_MODEL_CACHE.clear()
        factory_kwargs = dict(
            vocab_dict=VOCAB_DICT,
            num_oov_buckets=NUM_OOV_BUCKETS,
            mv_emb_size=MV_EMBEDDING_SIZE,
        )
        _PER_ARM_MODEL_CACHE[cache_key] = (
            agent_factory.get_mv_id_emb_model(**factory_kwargs),
            agent_factory.get_mv_gen_emb_model(**factory_kwargs),
        )
    return _PER_ARM_MODEL_CACHE[cache_key]


def _get_per_arm_features(x):
    """
    This function generates a single per-arm observation vector

    Args:
        x: mapping of feature name -> values; the keys 'movie_id' and
            'movie_genres' are read.

    Returns:
        float32 numpy array: the movie-id and movie-genre embeddings
        concatenated (in that order) along the last axis.

    Note:
        Calls `.numpy()` on the model outputs, so it must run eagerly.
    """
    mvid_model, mvgen_model = _per_arm_emb_models()

    # Embed each feature and move it to numpy; the order fixes the layout of
    # the concatenated vector.
    embeddings = [
        mvid_model(x['movie_id']).numpy(),
        mvgen_model(x['movie_genres']).numpy(),
    ]

    return np.concatenate(embeddings, axis=-1).astype(np.float32)

In [21]:
# test_a = _get_per_arm_features(data)
# test_a

### TensorSpecs

In [22]:
# Observation = one global context vector plus one feature vector per action.
# Batch and time dimensions are not part of the spec.
observation_spec = {
    'global': tf.TensorSpec(shape=[GLOBAL_DIM], dtype=tf.float32),
    'per_arm': tf.TensorSpec(shape=[NUM_ACTIONS, PER_ARM_DIM], dtype=tf.float32),
}
observation_spec

{'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}

In [23]:
# Scalar int32 action bounded to [0, NUM_ACTIONS - 1]. The number of actions
# also dictates the shape of the expected mean-reward spec.
action_spec = tensor_spec.BoundedTensorSpec(
    name="action_spec",
    dtype=tf.int32,
    shape=[],
    minimum=tf.constant(0),
    maximum=NUM_ACTIONS - 1,
)

action_spec

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(1, dtype=int32))

In [24]:
# Only the observation spec is supplied; reward_spec is left at the helper's
# default (a custom `_reward_spec` is not passed).
time_step_spec = ts.time_step_spec(observation_spec=observation_spec)
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

## Distribution strategy

Use `strategy_utils` to generate a strategy. Under the hood, passing the parameter:

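* `use_gpu = False` returns `tf.distribute.get_strategy()`, i.e. the default (no-op) strategy: no replication, ops follow TensorFlow's normal single-device placement (CPU when no GPU is visible)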
* `use_gpu = True` returns `tf.distribute.MirroredStrategy()`, which uses all GPUs that are visible to TensorFlow on one machine

In [25]:
# Accelerator flags for this run
use_gpu, use_tpu = True, False

# TF-Agents helper that picks the tf.distribute strategy matching the flags
distribution_strategy = strategy_utils.get_strategy(
    tpu=use_tpu,
    use_gpu=use_gpu,
)
distribution_strategy

<tensorflow.python.distribute.mirrored_strategy.MirroredStrategy at 0x7fc0304b4820>

In [26]:
# distribution_strategy = tf.distribute.MirroredStrategy()
# distribution_strategy

In [27]:
# distribution_strategy.cluster_resolver.

In [28]:
# tf.distribute.get_strategy()

### Config

In [29]:
# ================================
# Agents
# ================================
AGENT_TYPE = 'epsGreedy'  # 'LinUCB' | 'LinTS' | 'epsGreedy' | 'NeuralLinUCB'

# Linear agents only (LinUCB and LinTS)
AGENT_ALPHA = 0.1

# Neural agents (NeuralEpsGreedy and NeuralLinUCB)
EPSILON = 0.01
LR = 0.05

# NeuralLinUCB only
ENCODING_DIM = 1
EPS_PHASE_STEPS = 1000

NUM_EVAL_STEPS = 10000

# ================================
# Agent's Preprocess Network
# ================================
NETWORK_TYPE = "commontower"  # 'commontower' | 'dotproduct'

# NeuralLinUCB is always paired with the common-tower network
if AGENT_TYPE == 'NeuralLinUCB':
    NETWORK_TYPE = 'commontower'

# Hidden-layer sizes of the global, per-arm and common towers
GLOBAL_LAYERS = [64, 32, 16]
ARM_LAYERS = [64, 32, 16]
COMMON_LAYERS = [16, 8]

# TODO - streamline and consolidate
HPARAMS = dict(
    batch_size=BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    num_actions=NUM_ACTIONS,
    model_type=AGENT_TYPE,
    network_type=NETWORK_TYPE,
    global_layers=GLOBAL_LAYERS,
    per_arm_layers=ARM_LAYERS,
    common_layers=COMMON_LAYERS,
    learning_rate=LR,
    epsilon=EPSILON,
    num_eval_steps=NUM_EVAL_STEPS,
)
pprint(HPARAMS)

{'batch_size': 128,
 'common_layers': [16, 8],
 'epsilon': 0.01,
 'eval_batch_size': 1,
 'global_layers': [64, 32, 16],
 'learning_rate': 0.05,
 'model_type': 'epsGreedy',
 'network_type': 'commontower',
 'num_actions': 2,
 'num_eval_steps': 10000,
 'per_arm_layers': [64, 32, 16]}


### trajectory function

In [30]:
def _trajectory_fn(element): # hparams
    """Converts a dataset element into a trajectory.

    Args:
        element: one batched dataset element (feature-name -> values mapping)
            as consumed by `_get_global_context_features`,
            `_get_per_arm_features` and `reward_factory._get_rewards`.

    Returns:
        A single-step trajectory whose observation holds only the global
        features; the arm features travel in
        `policy_info.chosen_arm_features`, and the action is a dummy zero
        tensor shaped like the reward.

    Note:
        The dummy predicted rewards are sized from HPARAMS['batch_size'], so
        the incoming batch is assumed to contain exactly that many rows -
        TODO confirm the training pipeline drops partial batches.
    """
    global_features = _get_global_context_features(element)
    arm_features = _get_per_arm_features(element)

    # Adds a time dimension.
    arm_features = train_utils._add_outer_dimension(arm_features)

    # obs spec
    observation = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            train_utils._add_outer_dimension(global_features),
    }

    reward = train_utils._add_outer_dimension(reward_factory._get_rewards(element))

    # To emit the predicted rewards in policy_info, we need to create dummy
    # rewards to match the definition in TensorSpec for the ones specified in
    # emit_policy_info set.
    dummy_rewards = tf.zeros([HPARAMS['batch_size'], 1, HPARAMS['num_actions']])
    policy_info = policy_utilities.PerArmPolicyInfo(
        chosen_arm_features=arm_features,
        # Pass dummy mean rewards here to match the model_spec for emitting
        # mean rewards in policy info
        predicted_rewards_mean=dummy_rewards,
        bandit_policy_type=policy_utilities.BanditPolicyType.GREEDY
    )

    # The neural LinUCB agent is spelled 'NeuralLinUCB' in this notebook's
    # AGENT_TYPE options, so the previous check against 'neural_ucb' alone could
    # never match; the legacy spelling is still accepted for existing callers.
    # NOTE(review): assumes the NeuralLinUCB agent emits
    # predicted_rewards_optimistic in its policy info - confirm in agent_factory.
    if HPARAMS['model_type'] in ('neural_ucb', 'NeuralLinUCB'):
        policy_info = policy_info._replace(
            predicted_rewards_optimistic=dummy_rewards
        )

    return trajectory.single_step(
        observation=observation,
        action=tf.zeros_like(
            reward, dtype=tf.int32
        ),  # Arm features are copied from policy info, put dummy zeros here
        policy_info=policy_info,
        reward=reward,
        discount=tf.zeros_like(reward)
    )

#### tmp - debugging

In [32]:
for x in train_dataset.batch(HPARAMS['batch_size']).take(1):
    test_traj_v1 = _trajectory_fn(x)
    
# test_traj_v1

In [33]:
for x in train_dataset.batch(HPARAMS['batch_size']).take(1):
    test_traj_v2 = _trajectory_fn(x)
    
# test_traj_v2

In [34]:
print(f"test_traj_v1.action.shape: {test_traj_v1.action.shape}") 
print(f"test_traj_v2.action.shape: {test_traj_v2.action.shape}") 

test_traj_v1.action.shape: (128, 1)
test_traj_v2.action.shape: (128, 1)


In [35]:
print(f"test_traj_v1.discount.shape: {test_traj_v1.discount.shape}") 
print(f"test_traj_v2.discount.shape: {test_traj_v2.discount.shape}") 

test_traj_v1.discount.shape: (128, 1)
test_traj_v2.discount.shape: (128, 1)


In [36]:
print(f"test_traj_v1.observation.shape: {test_traj_v1.observation['global'].shape}") 
print(f"test_traj_v2.observation.shape: {test_traj_v2.observation['global'].shape}") 

test_traj_v1.observation.shape: (128, 1, 64)
test_traj_v2.observation.shape: (128, 1, 64)


In [37]:
# print(f"test_traj_v1.observation.shape: {test_traj_v1.observation['per_arm'].shape}") 
# print(f"test_traj_v2.observation.shape: {test_traj_v2.observation['per_arm'].shape}") 

In [38]:
print(f"test_traj_v1.reward.shape: {test_traj_v1.reward.shape}") 
print(f"test_traj_v2.reward.shape: {test_traj_v2.reward.shape}") 

test_traj_v1.reward.shape: (128, 1)
test_traj_v2.reward.shape: (128, 1)


### Create Agent

In [76]:
# Stop a profiler session left running by an earlier (interrupted) training run.
# stop() raises UnavailableError when no session is active, which would abort
# "Restart & Run All" on a fresh kernel, so that case is reported and skipped.
try:
    tf.profiler.experimental.stop()
except tf.errors.UnavailableError:
    print("No active profiler session to stop.")

In [31]:
# Create the global step and the agent under the distribution strategy so the
# variables they own are created within its scope.
with distribution_strategy.scope():
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent, network = agent_factory._get_agent(
        # what to build
        agent_type=AGENT_TYPE,
        network_type=NETWORK_TYPE,
        # specs
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        observation_spec=observation_spec,
        # step counter
        global_step=global_step,
        # network layout
        global_layers=GLOBAL_LAYERS,
        arm_layers=ARM_LAYERS,
        common_layers=COMMON_LAYERS,
        # agent hyperparameters
        agent_alpha=AGENT_ALPHA,
        learning_rate=LR,
        epsilon=EPSILON,
        encoding_dim=ENCODING_DIM,
        eps_phase_steps=EPS_PHASE_STEPS,
    )

agent.initialize()

print(f"Agent: {agent.name}")

# The factory may return a falsy network; only report one that exists.
if network:
    print(f"Network: {network}")

Agent: OffpolicyNeuralEpsGreedyAgent
Network: global_and_arm_common_tower_network


In [78]:
global_step

MirroredVariable:{
  0: <tf.Variable 'global_step:0' shape=() dtype=int64, numpy=0>
}

#### tmp - debugging

In [None]:
# global_step = tf.compat.v1.train.get_or_create_global_step()

# agent, network = agent_factory._get_agent(
#     agent_type=AGENT_TYPE, 
#     network_type=NETWORK_TYPE, 
#     time_step_spec=time_step_spec, 
#     action_spec=action_spec, 
#     observation_spec=observation_spec,
#     global_step = global_step,
#     global_layers = GLOBAL_LAYERS,
#     arm_layers = ARM_LAYERS,
#     common_layers = COMMON_LAYERS,
#     agent_alpha = AGENT_ALPHA,
#     learning_rate = LR,
#     epsilon = EPSILON,
#     encoding_dim = ENCODING_DIM,
#     eps_phase_steps = EPS_PHASE_STEPS,
# )

# agent.initialize()

# print(f"Agent: {agent.name}")

# if network:
#     print(f"Network: {network}")

Agent: OffpolicyNeuralEpsGreedyAgent
Network: global_and_arm_common_tower_network_2


In [32]:
# agent.action_spec

In [34]:
# agent.time_step_spec

In [35]:
# agent.training_data_spec

## Vertex Experiment

In [36]:
EXPERIMENT_NAME = 'acc-paf-v2'

# new experiment: a timestamped run name gives every invocation its own output tree
invoke_time = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME = f'run-{invoke_time}'

BASE_OUTPUT_DIR = f'{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}'
LOG_DIR = f"{BASE_OUTPUT_DIR}/logs"
# Root directory for writing logs/summaries/checkpoints.
ROOT_DIR = f"{BASE_OUTPUT_DIR}/root"
# Where the trained model will be saved and restored.
ARTIFACTS_DIR = f"{BASE_OUTPUT_DIR}/artifacts"

# Re-initialise the Vertex AI SDK, this time attaching the experiment.
vertex_ai.init(
    experiment=EXPERIMENT_NAME,
    project=PROJECT_ID,
    location=REGION,
)

# Aligned summary (label column is 18 characters wide); the run name carries a
# trailing newline so a blank line separates names from paths.
print("\n".join(
    f"{label:<18}: {setting}"
    for label, setting in (
        ("EXPERIMENT_NAME", EXPERIMENT_NAME),
        ("RUN_NAME", f"{RUN_NAME}\n"),
        ("BASE_OUTPUT_DIR", BASE_OUTPUT_DIR),
        ("LOG_DIR", LOG_DIR),
        ("ROOT_DIR", ROOT_DIR),
        ("ARTIFACTS_DIR", ARTIFACTS_DIR),
    )
))

EXPERIMENT_NAME   : acc-paf-v2
RUN_NAME          : run-20230823-220920

BASE_OUTPUT_DIR   : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230823-220920
LOG_DIR           : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230823-220920/logs
ROOT_DIR          : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230823-220920/root
ARTIFACTS_DIR     : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230823-220920/artifacts


In [37]:
# ====================================================
# TB summary writer
# ====================================================
# Summaries go to <LOG_DIR>/train and are flushed every 10 seconds; the writer
# is installed as the default so summary ops need no explicit writer argument.
with distribution_strategy.scope():
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        logdir=f"{LOG_DIR}/train",
        flush_millis=10_000,
    )
    train_summary_writer.set_as_default()

## Train loops

In [38]:
from src.perarm_features import train_perarm as train_perarm

# Train-loop schedule.
# Note: NUM_EVAL_STEPS is re-assigned here; HPARAMS['num_eval_steps'] was
# filled in earlier and keeps the value it was given at that point.
NUM_ITER_STEPS = 100
STEPS_PER_LOOP = 1
LOG_INTERVAL = 10
CHKPT_INTERVAL = 200
NUM_EVAL_STEPS = 100
ASYNC_STEPS_PER_LOOP = 1

# Aligned summary (label column is 21 characters wide).
print("\n".join(
    f"{label:<21}: {setting}"
    for label, setting in (
        ("NUM_ITER_STEPS", NUM_ITER_STEPS),
        ("STEPS_PER_LOOP", STEPS_PER_LOOP),
        ("LOG_INTERVAL", LOG_INTERVAL),
        ("CHKPT_INTERVAL", CHKPT_INTERVAL),
        ("NUM_EVAL_STEPS", NUM_EVAL_STEPS),
        ("ASYNC_STEPS_PER_LOOP", ASYNC_STEPS_PER_LOOP),
    )
))

NUM_ITER_STEPS       : 100
STEPS_PER_LOOP       : 1
LOG_INTERVAL         : 10
CHKPT_INTERVAL       : 200
NUM_EVAL_STEPS       : 100
ASYNC_STEPS_PER_LOOP : 1


In [39]:
TOTAL_TRAIN_TAKE = NUM_ITER_STEPS * HPARAMS['batch_size']
TOTAL_TRAIN_TAKE

12800

In [40]:
#start the timer and training
start_time = time.time()

# Reset the train step
agent.train_step_counter.assign(0)

# Run the per-arm training loop; it returns the collected metric results and
# the agent.
# NOTE(review): the loop's log output prints a MirroredStrategy object at a
# different address than `distribution_strategy`, so train_perarm presumably
# builds its own strategy from use_gpu / use_tpu - confirm the agent's
# variables are compatible with it.
metric_results, agent = train_perarm.train_perarm(
    agent = agent,
    global_dim = GLOBAL_DIM,
    per_arm_dim = PER_ARM_DIM,
    num_iterations = NUM_ITER_STEPS,
    steps_per_loop = STEPS_PER_LOOP,
    num_eval_steps = NUM_EVAL_STEPS,
    # data
    batch_size = HPARAMS['batch_size'],
    eval_batch_size = HPARAMS['eval_batch_size'],
    # functions
    _trajectory_fn = _trajectory_fn,
    # _run_bandit_eval_fn = _run_bandit_eval,
    # train intervals
    chkpt_interval = CHKPT_INTERVAL,
    log_interval = LOG_INTERVAL,
    # dirs
    bucket_name = BUCKET_NAME,
    data_dir_prefix_path = DATA_GCS_PREFIX,
    log_dir = LOG_DIR,
    model_dir = ARTIFACTS_DIR,
    root_dir = ROOT_DIR,
    async_steps_per_loop = ASYNC_STEPS_PER_LOOP,
    resume_training_loops = False,
    # Reuse the accelerator flags that selected `distribution_strategy`, so the
    # training loop cannot drift from the strategy the agent was created under.
    use_gpu = use_gpu,
    use_tpu = use_tpu,
    profiler = True,
    global_step = global_step,
    total_train_take = TOTAL_TRAIN_TAKE, # TODO - remove?
    train_summary_writer = train_summary_writer,
    # additional_metrics = metrics,
)

end_time = time.time()
# Whole minutes only (truncated), e.g. a 90-second job reports 1.
runtime_mins = int((end_time - start_time) / 60)
print(f"complete train job in {runtime_mins} minutes")

distribution_strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7fbfe862f160>
train_files: ['gs://mabv1-hybrid-vertex-bucket/data/train/ml-ratings-100k-train.tfrecord']
train_ds_iterator: <tensorflow.python.data.ops.iterator_ops.OwnedIterator object at 0x7fbfe8659930>
setting checkpoint_manager: gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230823-220920/root/chkpoint
Did not find a pre-existing checkpoint. Starting from scratch.
wrapping agent.train in tf-function
starting_loop: 0
starting train loop...
step = 0: loss = 15.930000305175781
step = 10: loss = 8.600000381469727
step = 20: loss = 1.6799999475479126
step = 30: loss = 2.319999933242798
step = 40: loss = 1.1299999952316284
step = 50: loss = 1.440000057220459
step = 60: loss = 1.440000057220459
step = 70: loss = 1.6399999856948853
step = 80: loss = 1.3899999856948853
step = 90: loss = 1.309999942779541
runtime_mins: 1
saved trained policy to: gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run

### GPU profiling

> Once the training job begins, run these commands in the Vertex interactive terminal to install `nvtop`, then launch `nvtop` to monitor GPU utilization:

```bash
sudo apt update
sudo apt -y install nvtop
```

In [41]:
# %load_ext tensorboard
%reload_ext tensorboard

In [42]:
%tensorboard --logdir=$LOG_DIR 