# Using GPUs & TPUs with TF-Agents

## Load notebook config

* use the prefix defined in `00-env-setup`

In [1]:
PREFIX = 'mabv1'

In [2]:
# staging GCS
# `!cmd` captures the shell command's output lines; the first line is used
# as the active gcloud project id.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
# The bucket name is derived from the notebook PREFIX and the project id.
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared environment file from the bucket, echo it, then execute it
# so that its assignments become notebook globals (the echoed output shows it
# re-assigns PROJECT_ID, PREFIX, BUCKET_NAME, BUCKET_URI among others).
# NOTE(security): exec() runs whatever is stored at that GCS path with the
# kernel's privileges -- only use this with a bucket you control.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "mabv1"
VERSION                  = "v1"

BUCKET_NAME              = "mabv1-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://mabv1-hybrid-vertex-bucket/data"
BUCKET_URI               = "gs://mabv1-hybrid-vertex-bucket"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid-vertex.movielens_dataset_mabv1"
BIGQUERY_TABLE_ID        = "hybrid-vertex.movielens_dataset_mabv1.training_dataset"

REPO_DOCKER_PATH_PREFIX  = "src"
RL_SUB_DIR               = "per_arm_rl"



### imports

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [4]:
import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar
from datetime import datetime
import time
from pprint import pprint
import pickle as pkl

# logging
import logging
logging.disable(logging.WARNING)

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

# tensorflow
import tensorflow as tf

from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.metrics import tf_metrics

from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.networks import global_and_arm_feature_network

from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.bandits.policies import policy_utilities
from tf_agents.trajectories import trajectory
from tf_agents.policies import policy_saver

from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.train.utils import spec_utils
from tf_agents.train.utils import strategy_utils
from tf_agents.train.utils import train_utils

# GPU
from numba import cuda 
import gc

import sys
sys.path.append("..")

# this repo
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config
from src.per_arm_rl import train_utils as train_utils

# tf exceptions and vars
if tf.__version__[0] != "2":
    raise Exception("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")

In [5]:
from src.perarm_features import agent_factory as agent_factory
from src.perarm_features import reward_factory as reward_factory

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [9]:
# Reset the current CUDA device through numba, then run Python garbage
# collection (the cell output is gc.collect()'s return value).
# NOTE(review): presumably meant to release GPU memory held by an earlier run;
# confirm it is safe to reset the device once TensorFlow has touched it.
device = cuda.get_current_device()
device.reset()
gc.collect()

19

In [10]:
# cloud storage client (used below to list the train/val TFRecord blobs)
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
# NOTE(review): this call uses LOCATION while the later experiment-scoped
# vertex_ai.init calls use REGION; both are "us-central1" in the loaded config.
vertex_ai.init(project=PROJECT_ID, location=LOCATION)

### Generate Vocabs

In [11]:
GENERATE_VOCABS = False
print(f"GENERATE_VOCABS: {GENERATE_VOCABS}")

VOCAB_SUBDIR   = "vocabs"
VOCAB_FILENAME = "vocab_dict.pkl"

GENERATE_VOCABS: False


In [12]:
# Download the previously generated vocabulary pickle from GCS and load it into
# VOCAB_DICT. Nothing is defined here when GENERATE_VOCABS is True.
if not GENERATE_VOCABS:

    EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{VOCAB_SUBDIR}/{VOCAB_FILENAME}'
    print(f"Downloading vocab...")

    # os.system returns the command's exit status; fail loudly rather than
    # reporting success and then loading a missing or stale local file.
    exit_status = os.system(f'gsutil -q cp {EXISTING_VOCAB_FILE} .')
    if exit_status != 0:
        raise RuntimeError(
            f"Failed to download vocab from: {EXISTING_VOCAB_FILE} "
            f"(gsutil exit status {exit_status})"
        )
    print(f"Downloaded vocab from: {EXISTING_VOCAB_FILE}\n")

    # NOTE(security): pickle.load can execute arbitrary code; only load vocab
    # files from a bucket you trust.
    # The context manager closes the file even if unpickling raises.
    with open(VOCAB_FILENAME, 'rb') as filehandler:
        VOCAB_DICT = pkl.load(filehandler)

    # Show which vocabularies are available.
    for key in VOCAB_DICT.keys():
        pprint(key)

Downloading vocab...
Downloaded vocab from: gs://mabv1-hybrid-vertex-bucket/vocabs/vocab_dict.pkl

'movie_id'
'user_id'
'user_occupation_text'
'movie_genres'
'bucketized_user_age'
'max_timestamp'
'min_timestamp'
'timestamp_buckets'


### train config

In [13]:
# Embedding sizes handed to the agent_factory embedding-model builders.
NUM_OOV_BUCKETS        = 1
GLOBAL_EMBEDDING_SIZE  = 16
MV_EMBEDDING_SIZE      = 32

# Widths declared in the observation spec.
# NOTE(review): these look like 4 global features x GLOBAL_EMBEDDING_SIZE and
# 2 arm features x MV_EMBEDDING_SIZE (the feature functions concatenate that
# many embeddings) -- keep them in sync if the embedding sizes change.
GLOBAL_DIM             = 64
PER_ARM_DIM            = 64

BATCH_SIZE             = 128
EVAL_BATCH_SIZE        = 1
NUM_ACTIONS            = 2 
# NUM_ACTIONS is not the number of movies: each training example presents a
# single movie (with its rating) for a given user context, so the action is
# binary -- show / don't show that movie.

print(f"NUM_OOV_BUCKETS        : {NUM_OOV_BUCKETS}")
print(f"GLOBAL_EMBEDDING_SIZE  : {GLOBAL_EMBEDDING_SIZE}")
print(f"MV_EMBEDDING_SIZE      : {MV_EMBEDDING_SIZE}")
print(f"GLOBAL_DIM             : {GLOBAL_DIM}")
print(f"PER_ARM_DIM            : {PER_ARM_DIM}")
print(f"BATCH_SIZE             : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE        : {EVAL_BATCH_SIZE}")
print(f"NUM_ACTIONS            : {NUM_ACTIONS}")

NUM_OOV_BUCKETS        : 1
GLOBAL_EMBEDDING_SIZE  : 16
MV_EMBEDDING_SIZE      : 32
GLOBAL_DIM             : 64
PER_ARM_DIM            : 64
BATCH_SIZE             : 128
EVAL_BATCH_SIZE        : 1
NUM_ACTIONS            : 2


In [14]:
# ====================================================
# get global_context_sampling_fn
# ====================================================
@functools.lru_cache(maxsize=None)
def _build_global_context_models(num_oov_buckets, global_emb_size):
    """Builds the four global-context embedding models once per configuration.

    The result is cached per (num_oov_buckets, global_emb_size) so the models
    are not rebuilt on every batch. VOCAB_DICT is read at build time; call
    `_build_global_context_models.cache_clear()` after replacing it.

    Returns:
      Tuple of (user_id, user_age, user_occupation, timestamp) models.
    """
    model_kwargs = dict(
        vocab_dict=VOCAB_DICT,
        num_oov_buckets=num_oov_buckets,
        global_emb_size=global_emb_size,
    )
    return (
        agent_factory.get_user_id_emb_model(**model_kwargs),
        agent_factory.get_user_age_emb_model(**model_kwargs),
        agent_factory.get_user_occ_emb_model(**model_kwargs),
        agent_factory.get_ts_emb_model(**model_kwargs),
    )


def _get_global_context_features(x):
    """Generates the global (user-context) observation vector(s) for `x`.

    Args:
      x: mapping with keys 'user_id', 'bucketized_user_age',
        'user_occupation_text' and 'timestamp'. Values must be eager tensors,
        because the model outputs are converted with `.numpy()`.

    Returns:
      A float32 NumPy array: the four embedding outputs concatenated along the
      last axis.
    """
    # Reuse the same model instances across calls; the original rebuilt all
    # four models per call, which is slow and means any randomly initialised
    # weights in them would differ from one call to the next.
    user_id_model, user_age_model, user_occ_model, user_ts_model = (
        _build_global_context_models(NUM_OOV_BUCKETS, GLOBAL_EMBEDDING_SIZE)
    )

    _id = user_id_model(x['user_id'])
    _age = user_age_model(x['bucketized_user_age'])
    _occ = user_occ_model(x['user_occupation_text'])
    _ts = user_ts_model(x['timestamp'])

    # `.numpy()` already yields ndarrays, so no extra np.array copy is needed.
    concat = np.concatenate(
        [_id.numpy(), _age.numpy(), _occ.numpy(), _ts.numpy()], axis=-1
    ).astype(np.float32)

    return concat

In [15]:
# ====================================================
# get per_arm_context_sampling_fn
# ====================================================
@functools.lru_cache(maxsize=None)
def _build_per_arm_models(num_oov_buckets, mv_emb_size):
    """Builds the two per-arm (movie) embedding models once per configuration.

    The result is cached per (num_oov_buckets, mv_emb_size) so the models are
    not rebuilt on every batch. VOCAB_DICT is read at build time; call
    `_build_per_arm_models.cache_clear()` after replacing it.

    Returns:
      Tuple of (movie_id, movie_genres) models.
    """
    model_kwargs = dict(
        vocab_dict=VOCAB_DICT,
        num_oov_buckets=num_oov_buckets,
        mv_emb_size=mv_emb_size,
    )
    return (
        agent_factory.get_mv_id_emb_model(**model_kwargs),
        agent_factory.get_mv_gen_emb_model(**model_kwargs),
    )


def _get_per_arm_features(x):
    """Generates the per-arm (movie) observation vector(s) for `x`.

    Args:
      x: mapping with keys 'movie_id' and 'movie_genres'. Values must be eager
        tensors, because the model outputs are converted with `.numpy()`.

    Returns:
      A float32 NumPy array: the two embedding outputs concatenated along the
      last axis.
    """
    # Reuse the same model instances across calls; the original rebuilt both
    # models per call, which is slow and means any randomly initialised
    # weights in them would differ from one call to the next.
    mvid_model, mvgen_model = _build_per_arm_models(
        NUM_OOV_BUCKETS, MV_EMBEDDING_SIZE
    )

    _mid = mvid_model(x['movie_id'])
    _mgen = mvgen_model(x['movie_genres'])

    # `.numpy()` already yields ndarrays, so no extra np.array copy is needed.
    concat = np.concatenate(
        [_mid.numpy(), _mgen.numpy()], axis=-1
    ).astype(np.float32)

    return concat

### TensorSpecs

In [16]:
# Per-example observation spec (no batch dimension): one global context vector
# of width GLOBAL_DIM plus a [NUM_ACTIONS, PER_ARM_DIM] block of arm features.
observation_spec = {
    'global': tf.TensorSpec([GLOBAL_DIM], tf.float32),
    'per_arm': tf.TensorSpec([NUM_ACTIONS, PER_ARM_DIM], tf.float32) # one PER_ARM_DIM feature row per action
}
observation_spec

{'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}

In [17]:
# Scalar int32 action with bounds 0 and NUM_ACTIONS-1 (the displayed spec
# shows minimum=0, maximum=1).
action_spec = tensor_spec.BoundedTensorSpec(
    shape=[], 
    dtype=tf.int32,
    minimum=tf.constant(0),            
    maximum=NUM_ACTIONS-1, # upper bound; together with minimum this fixes the number of actions
    name="action_spec"
)

action_spec

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(1, dtype=int32))

In [18]:
# Build the TimeStep spec from the observation spec only; no reward_spec is
# passed, and the displayed result shows a scalar float32 'reward' entry.
time_step_spec = ts.time_step_spec(
    observation_spec = observation_spec, 
    # reward_spec = _reward_spec
)
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

## Distribution strategy

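Use `strategy_utils.get_strategy(tpu=..., use_gpu=...)` to create the distribution strategy. Under the hood:

* `tpu=False, use_gpu=False` returns `tf.distribute.get_strategy()`, i.e. the default strategy (no distribution across devices) when no other strategy is in scope
* `use_gpu=True` returns `tf.distribute.MirroredStrategy()`, which uses all GPUs that are visible to TensorFlow on one machine
* a truthy `tpu` value selects a TPU strategy instead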

In [19]:
RUN_IN_NOTEBOOK = False

In [20]:
# Accelerator flags used to pick the distribution strategy for this notebook.
# NOTE(review): the later train_perarm call takes its own use_gpu/use_tpu
# arguments -- keep them consistent with these flags.
use_gpu = True
use_tpu = False

distribution_strategy = strategy_utils.get_strategy(tpu=use_tpu, use_gpu=use_gpu)
distribution_strategy

<tensorflow.python.distribute.mirrored_strategy.MirroredStrategy at 0x7f82cfd10b20>

In [21]:
# distribution_strategy = tf.distribute.MirroredStrategy()
# distribution_strategy

In [22]:
# distribution_strategy.cluster_resolver.

In [23]:
# tf.distribute.get_strategy()

### Agent Networks

In [24]:
# ================================
# Agents
# ================================
AGENT_TYPE      = 'epsGreedy' # 'LinUCB' | 'LinTS' | 'epsGreedy' | 'NeuralLinUCB'

# Parameters for linear agents (LinUCB and LinTS).
AGENT_ALPHA     = 0.1

# Parameters for neural agents (NeuralEpsGreedy and NeuralLinUCB).
EPSILON         = 0.01
LR              = 0.05

# Parameters for NeuralLinUCB
ENCODING_DIM    = 1
EPS_PHASE_STEPS = 1000

# NOTE: HPARAMS below captures this value; re-assigning NUM_EVAL_STEPS in a
# later cell does not update HPARAMS['num_eval_steps'].
NUM_EVAL_STEPS = 10000

# ================================
# Agent's Preprocess Network
# ================================
NETWORK_TYPE    = "commontower" # 'commontower' | 'dotproduct'

# NeuralLinUCB is always paired with the common-tower network here.
if AGENT_TYPE == 'NeuralLinUCB':
    NETWORK_TYPE = 'commontower'
    
# Layer widths passed to agent_factory._get_agent.
GLOBAL_LAYERS   = [64, 32, 16]
ARM_LAYERS      = [64, 32, 16]
COMMON_LAYERS   = [16, 8]

# Snapshot of the settings read by the trajectory / data / metric cells.
HPARAMS = {  # TODO - streamline and consolidate
    "batch_size": BATCH_SIZE,
    "eval_batch_size" : EVAL_BATCH_SIZE,
    "num_actions": NUM_ACTIONS,
    "model_type": AGENT_TYPE,
    "network_type": NETWORK_TYPE,
    "global_layers": GLOBAL_LAYERS,
    "per_arm_layers": ARM_LAYERS,
    "common_layers": COMMON_LAYERS,
    "learning_rate": LR,
    "epsilon": EPSILON,
    "num_eval_steps": NUM_EVAL_STEPS,
}
pprint(HPARAMS)

{'batch_size': 128,
 'common_layers': [16, 8],
 'epsilon': 0.01,
 'eval_batch_size': 1,
 'global_layers': [64, 32, 16],
 'learning_rate': 0.05,
 'model_type': 'epsGreedy',
 'network_type': 'commontower',
 'num_actions': 2,
 'num_eval_steps': 10000,
 'per_arm_layers': [64, 32, 16]}


In [23]:
# Create the global step and the agent under the distribution strategy so the
# variables they create are owned by that strategy.
with distribution_strategy.scope():
    
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent, network = agent_factory._get_agent(
        agent_type=AGENT_TYPE, 
        network_type=NETWORK_TYPE, 
        time_step_spec=time_step_spec, 
        action_spec=action_spec, 
        observation_spec=observation_spec,
        global_step = global_step,
        global_layers = GLOBAL_LAYERS,
        arm_layers = ARM_LAYERS,
        common_layers = COMMON_LAYERS,
        agent_alpha = AGENT_ALPHA,
        learning_rate = LR,
        epsilon = EPSILON,
        encoding_dim = ENCODING_DIM,
        eps_phase_steps = EPS_PHASE_STEPS,
    )

    # Initialize inside the scope as well: anything the agent creates during
    # initialization is then placed under the same strategy as the agent.
    agent.initialize()

print(f"Agent: {agent.name}")

# `network` may be falsy for agent types that do not use a network.
if network:
    print(f"Network: {network}")

Agent: OffpolicyNeuralEpsGreedyAgent
Network: global_and_arm_common_tower_network


In [25]:
def _trajectory_fn(element): # hparams
    """Converts a batched dataset element into a single-step trajectory.

    Args:
      element: a batch of parsed examples, forwarded to
        `_get_global_context_features`, `_get_per_arm_features` and
        `reward_factory._get_rewards`.

    Returns:
      The result of `trajectory.single_step(...)` with the global observation,
      dummy zero actions, a `PerArmPolicyInfo` carrying the arm features as
      `chosen_arm_features`, the rewards, and zero discounts.
    """
    global_features = _get_global_context_features(element)
    arm_features = _get_per_arm_features(element)

    # Take the batch size from the data instead of HPARAMS['batch_size'], so a
    # smaller final batch (or an eval batch) still gets matching dummy rewards.
    # assumes the leading axis of the feature array is the batch axis.
    batch_size = global_features.shape[0]

    # Adds a time dimension.
    arm_features = train_utils._add_outer_dimension(arm_features)

    # obs spec
    observation = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            train_utils._add_outer_dimension(global_features),
    }

    reward = train_utils._add_outer_dimension(reward_factory._get_rewards(element))

    # To emit the predicted rewards in policy_info, we need to create dummy
    # rewards to match the definition in TensorSpec for the ones specified in
    # emit_policy_info set.
    dummy_rewards = tf.zeros([batch_size, 1, HPARAMS['num_actions']])
    policy_info = policy_utilities.PerArmPolicyInfo(
        chosen_arm_features=arm_features,
        # Pass dummy mean rewards here to match the model_spec for emitting
        # mean rewards in policy info
        predicted_rewards_mean=dummy_rewards,
        bandit_policy_type=policy_utilities.BanditPolicyType.GREEDY
    )

    # NOTE(review): HPARAMS['model_type'] is set from AGENT_TYPE, whose
    # documented values are 'LinUCB' | 'LinTS' | 'epsGreedy' | 'NeuralLinUCB',
    # so this branch never fires as written. Confirm whether 'NeuralLinUCB'
    # should be matched here before changing it.
    if HPARAMS['model_type'] == 'neural_ucb':
        policy_info = policy_info._replace(
            predicted_rewards_optimistic=dummy_rewards
        )

    return trajectory.single_step(
        observation=observation,
        action=tf.zeros_like(
            reward, dtype=tf.int32
        ),  # Arm features are copied from policy info, put dummy zeros here
        policy_info=policy_info,
        reward=reward,
        discount=tf.zeros_like(reward)
    )

### Vertex Experiment

In [36]:
# Experiment naming and output locations for the in-notebook training path.
if not RUN_IN_NOTEBOOK:
    print("set experiment in `train_perarm` script section below...")
else:
    EXPERIMENT_NAME   = f'acc-paf-v2'

    # Each run gets a timestamped name so outputs never collide.
    invoke_time       = time.strftime("%Y%m%d-%H%M%S")
    RUN_NAME          = f'run-{invoke_time}'

    # Everything for this run lives under one base directory in the bucket.
    BASE_OUTPUT_DIR   = f'{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}'
    LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
    ROOT_DIR          = f"{BASE_OUTPUT_DIR}/root"       # logs / summaries / checkpoints
    ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"  # saved (and restored) trained model

    # Point the Vertex AI SDK at this experiment.
    vertex_ai.init(
        project=PROJECT_ID,
        location=REGION,
        experiment=EXPERIMENT_NAME
    )

    print(f"EXPERIMENT_NAME   : {EXPERIMENT_NAME}")
    print(f"RUN_NAME          : {RUN_NAME}\n")
    print(f"BASE_OUTPUT_DIR   : {BASE_OUTPUT_DIR}")
    print(f"LOG_DIR           : {LOG_DIR}")
    print(f"ROOT_DIR          : {ROOT_DIR}")
    print(f"ARTIFACTS_DIR     : {ARTIFACTS_DIR}")

set experiment in `train_perarm` script section below...


### Data prep

In [26]:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [27]:
# Build the training input pipeline for the in-notebook loop.
if RUN_IN_NOTEBOOK:
    
    # Build gs:// URIs from the bucket and object names directly;
    # `blob.public_url` is an HTTPS URL with the object name percent-encoded,
    # so rewriting its prefix breaks for names with special characters.
    train_files = [
        f"gs://{blob.bucket.name}/{blob.name}"
        for blob in storage_client.list_blobs(
            BUCKET_NAME, prefix=f'{DATA_GCS_PREFIX}/train'
        )
        if '.tfrecord' in blob.name
    ]

    print(train_files)

    # Fail here rather than with a StopIteration inside the train loop.
    if not train_files:
        raise FileNotFoundError(
            f"No .tfrecord files found under "
            f"gs://{BUCKET_NAME}/{DATA_GCS_PREFIX}/train"
        )

    # Parsed, cached, unbatched examples.
    train_dataset = tf.data.TFRecordDataset(train_files)
    train_dataset = train_dataset.map(data_utils.parse_tfrecord)
    train_dataset = train_dataset.cache()

    # Batch first and prefetch last so whole batches are prepared ahead of
    # the training step.
    train_ds_iterator = iter(
        train_dataset
        .batch(HPARAMS['batch_size'])
        .prefetch(tf.data.AUTOTUNE)
    )

    print(f"train_ds_iterator: {train_ds_iterator}")
    
else:
    print("train dataset defined in `train_perarm` script")

train dataset defined in `train_perarm` script


In [28]:
# for i in range(1):
    
#     # iterator = iter(train_dataset.batch(1))
#     data = next(train_ds_iterator)

# data

In [29]:
# Build the evaluation input pipeline for the in-notebook path.
if RUN_IN_NOTEBOOK:
    
    # Build gs:// URIs from the bucket and object names directly;
    # `blob.public_url` is an HTTPS URL with the object name percent-encoded,
    # so rewriting its prefix breaks for names with special characters.
    val_files = [
        f"gs://{blob.bucket.name}/{blob.name}"
        for blob in storage_client.list_blobs(
            BUCKET_NAME, prefix=f'{DATA_GCS_PREFIX}/val'
        )
        if '.tfrecord' in blob.name
    ]

    # Fail here rather than silently evaluating on an empty dataset.
    if not val_files:
        raise FileNotFoundError(
            f"No .tfrecord files found under "
            f"gs://{BUCKET_NAME}/{DATA_GCS_PREFIX}/val"
        )

    val_dataset = tf.data.TFRecordDataset(val_files)
    val_dataset = val_dataset.map(data_utils.parse_tfrecord)

    eval_ds = val_dataset.batch(HPARAMS["eval_batch_size"])

    # Optionally cap the number of eval batches; a non-positive value means
    # "use everything".
    if HPARAMS['num_eval_steps'] > 0:
        eval_ds = eval_ds.take(HPARAMS['num_eval_steps'])

    # Cache after take() so only the batches actually used are kept.
    eval_ds = eval_ds.cache()

    print(f"eval_ds: {eval_ds}")
    
else:
    print("eval dataset defined in `train_perarm` script")

eval dataset defined in `train_perarm` script


In [30]:
if RUN_IN_NOTEBOOK:
    # ====================================================
    # TB summary writer
    # ====================================================
    # Writes TensorBoard summaries under LOG_DIR/train, flushing every 10 s
    # (flush_millis=10 * 1000), and installs the writer as the default.
    with distribution_strategy.scope():
        train_summary_writer = tf.compat.v2.summary.create_file_writer(
            f"{LOG_DIR}/train", flush_millis=10 * 1000
        )

        train_summary_writer.set_as_default()
        
else:
    print("set summary writer below...")

### Metrics

In [31]:
# Metrics for the in-notebook loop; the script path defines its own.
if not RUN_IN_NOTEBOOK:
    print("metrics defined in `train_perarm` script")
else:
    # Step counter handed to the checkpoint manager.
    step_metric = tf_metrics.EnvironmentSteps()

    # Only average return is tracked; episode-based metrics are left out.
    metrics = [tf_metrics.AverageReturnMetric(batch_size=HPARAMS['batch_size'])]

    pprint(f"metrics: {metrics}")

metrics defined in `train_perarm` script


### Policy Saver & Chkpts

In [32]:
if RUN_IN_NOTEBOOK:
    
    # ====================================================
    # get checkpoint manager
    # ====================================================
    # Checkpoints live under ROOT_DIR/chkpoint.
    CHKPOINT_DIR = f"{ROOT_DIR}/chkpoint"
    print(f"setting checkpoint_manager: {CHKPOINT_DIR}")

    # NOTE: `train_utils` here is the repo module (src.per_arm_rl.train_utils);
    # that import rebinds the name after the tf_agents import of the same name.
    checkpoint_manager = train_utils.restore_and_get_checkpoint_manager(
        root_dir=CHKPOINT_DIR, 
        agent=agent, 
        metrics=metrics, 
        step_metric=step_metric
    )
    # ====================================================
    # policy saver
    # ====================================================
    # Saves agent.policy, associated with the global step.
    saver = policy_saver.PolicySaver(
        agent.policy, 
        train_step=global_step
    )
    
else:
    print("chkpt and saver defined in `train_perarm` script")

chkpt and saver defined in `train_perarm` script


# [1] Run simple, in-notebook train loop

In [33]:
# Settings for the in-notebook train loop.
NUM_ITER_STEPS = 100   # number of training iterations (one batch each)
log_interval   = 10    # print the loss when the train step is a multiple of this
CHKPT_INTERVAL = 200   # save the policy every this many iterations

# Reset the train step
agent.train_step_counter.assign(0)

<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=0>

In [31]:
# tf.profiler.experimental.stop()

In [32]:
if RUN_IN_NOTEBOOK:
    # ====================================================
    # train loop
    # ====================================================
    # Runs NUM_ITER_STEPS training iterations (one batch each) under the TF
    # profiler, collecting the per-step loss in `list_o_loss`.
    list_o_loss = []

    print(f"starting train loop...")
    start_time = time.time()

    tf.profiler.experimental.start(LOG_DIR) # LOG_DIR | PROFILER_DIR

    # try/finally guarantees the profiler is stopped even if the loop raises
    # (e.g. StopIteration when the dataset runs out); otherwise it stays
    # active and has to be stopped by hand before the cell can be re-run.
    try:
        for i in range(NUM_ITER_STEPS):

            with train_summary_writer.as_default():

                data = next(train_ds_iterator)
                trajectories = _trajectory_fn(data)

                # All tensors in experience must be shaped [batch, time, ...] 
                step = agent.train_step_counter.numpy()  # value before this update
                loss = agent.train(experience=trajectories)
                list_o_loss.append(loss.loss.numpy())

                train_utils._export_metrics_and_summaries(
                    step=i, 
                    metrics=metrics
                )

                # print step loss
                if step % log_interval == 0:
                    print(
                        'step = {0}: train loss = {1}'.format(
                            step, round(loss.loss.numpy(), 2)
                        )
                    )

                if i > 0 and i % CHKPT_INTERVAL == 0:
                    # Name the checkpoint after the agent's train step:
                    # `step_metric` is never fed trajectories in this loop, so
                    # it would produce the same name for every save.
                    saver.save(
                        os.path.join(
                            CHKPOINT_DIR,
                            'policy_%d' % agent.train_step_counter.numpy()
                        )
                    )
                    print(f"saved policy to: {CHKPOINT_DIR}")
    finally:
        tf.profiler.experimental.stop()

    runtime_mins = int((time.time() - start_time) / 60)
    print(f"train runtime_mins: {runtime_mins}")

    # Export the final trained policy.
    saver.save(ARTIFACTS_DIR)
    print(f"saved trained policy to: {ARTIFACTS_DIR}")
    
else:
    print("skipping in-notebook loop")

starting train loop...
step = 0: train loss = 15.90999984741211
step = 10: train loss = 2.4000000953674316
step = 20: train loss = 1.6100000143051147
step = 30: train loss = 1.4900000095367432
step = 40: train loss = 1.059999942779541
step = 50: train loss = 1.4800000190734863
step = 60: train loss = 1.4199999570846558
step = 70: train loss = 1.4299999475479126
step = 80: train loss = 1.4199999570846558
step = 90: train loss = 1.399999976158142
step = 100: train loss = 1.590000033378601
step = 110: train loss = 1.4299999475479126
step = 120: train loss = 1.2400000095367432
step = 130: train loss = 1.0499999523162842
step = 140: train loss = 1.190000057220459
train runtime_mins: 2
saved trained policy to: gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-114115/artifacts


In [33]:
LOG_DIR # LOG_DIR | PROFILER_DIR

'gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-114115/logs'

### Tensorboard

In [34]:
# %load_ext tensorboard
%reload_ext tensorboard

In [35]:
# LOG_DIR | PROFILER_DIR

%tensorboard --logdir=$LOG_DIR  

# [2] Run `train_perarm.py` train loop

In [26]:
# tf.profiler.experimental.stop()

In [27]:
# Re-create the global step and a fresh agent under the distribution strategy
# before handing it to `train_perarm`.
with distribution_strategy.scope():
    
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent, network = agent_factory._get_agent(
        agent_type=AGENT_TYPE, 
        network_type=NETWORK_TYPE, 
        time_step_spec=time_step_spec, 
        action_spec=action_spec, 
        observation_spec=observation_spec,
        global_step = global_step,
        global_layers = GLOBAL_LAYERS,
        arm_layers = ARM_LAYERS,
        common_layers = COMMON_LAYERS,
        agent_alpha = AGENT_ALPHA,
        learning_rate = LR,
        epsilon = EPSILON,
        encoding_dim = ENCODING_DIM,
        eps_phase_steps = EPS_PHASE_STEPS,
    )

    # Initialize inside the scope as well: anything the agent creates during
    # initialization is then placed under the same strategy as the agent.
    agent.initialize()

print(f"Agent: {agent.name}")

# `network` may be falsy for agent types that do not use a network.
if network:
    print(f"Network: {network}")

Agent: OffpolicyNeuralEpsGreedyAgent
Network: global_and_arm_common_tower_network


In [28]:
global_step

MirroredVariable:{
  0: <tf.Variable 'global_step:0' shape=() dtype=int64, numpy=0>
}

## Vertex Experiment

In [29]:
# Experiment naming and output locations for the `train_perarm` run.
EXPERIMENT_NAME   = f'acc-paf-v2'

# new experiment
# Each run gets a timestamped name so outputs never collide.
invoke_time       = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME          = f'run-{invoke_time}'

# Everything for this run lives under one base directory in the bucket.
BASE_OUTPUT_DIR   = f'{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}'
LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
ROOT_DIR          = f"{BASE_OUTPUT_DIR}/root"       # Root directory for writing logs/summaries/checkpoints.
ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"  # Where the trained model will be saved and restored.

# Point the Vertex AI SDK at this experiment.
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    experiment=EXPERIMENT_NAME
)

print(f"EXPERIMENT_NAME   : {EXPERIMENT_NAME}")
print(f"RUN_NAME          : {RUN_NAME}\n")
print(f"BASE_OUTPUT_DIR   : {BASE_OUTPUT_DIR}")
print(f"LOG_DIR           : {LOG_DIR}")
print(f"ROOT_DIR          : {ROOT_DIR}")
print(f"ARTIFACTS_DIR     : {ARTIFACTS_DIR}")

EXPERIMENT_NAME   : acc-paf-v2
RUN_NAME          : run-20230822-213342

BASE_OUTPUT_DIR   : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-213342
LOG_DIR           : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-213342/logs
ROOT_DIR          : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-213342/root
ARTIFACTS_DIR     : gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-213342/artifacts


In [30]:
# ====================================================
# TB summary writer
# ====================================================
# Writes TensorBoard summaries under LOG_DIR/train, flushing every 10 s
# (flush_millis=10 * 1000), and installs the writer as the default. The same
# writer object is passed to `train_perarm` below.
with distribution_strategy.scope():
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        f"{LOG_DIR}/train", flush_millis=10 * 1000
    )

    train_summary_writer.set_as_default()

## train loops

In [31]:
# NOTE(review): consider moving this import to the imports cell at the top.
from src.perarm_features import train_perarm as train_perarm

# Settings passed to `train_perarm.train_perarm` below.
# NOTE: NUM_ITER_STEPS and CHKPT_INTERVAL were already defined for the
# in-notebook loop and are re-assigned here; NUM_EVAL_STEPS is re-assigned too,
# but HPARAMS['num_eval_steps'] keeps the value it was built with.
NUM_ITER_STEPS       = 100
STEPS_PER_LOOP       = 1
LOG_INTERVAL         = 50
CHKPT_INTERVAL       = 200
NUM_EVAL_STEPS       = 100
ASYNC_STEPS_PER_LOOP = 1

# Echo the effective values (see the cell output).
print(f"NUM_ITER_STEPS       : {NUM_ITER_STEPS}")
print(f"STEPS_PER_LOOP       : {STEPS_PER_LOOP}")
print(f"LOG_INTERVAL         : {LOG_INTERVAL}")
print(f"CHKPT_INTERVAL       : {CHKPT_INTERVAL}")
print(f"NUM_EVAL_STEPS       : {NUM_EVAL_STEPS}")
print(f"ASYNC_STEPS_PER_LOOP : {ASYNC_STEPS_PER_LOOP}")

# Reset the train step
agent.train_step_counter.assign(0)

NUM_ITER_STEPS       : 100
STEPS_PER_LOOP       : 1
LOG_INTERVAL         : 50
CHKPT_INTERVAL       : 200
NUM_EVAL_STEPS       : 100
ASYNC_STEPS_PER_LOOP : 1


<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=0>

In [32]:
TOTAL_TAKE = NUM_ITER_STEPS * HPARAMS['batch_size']
TOTAL_TAKE

12800

In [33]:
#start the timer and training
start_time = time.time()

metric_results, agent = train_perarm.train_perarm(
    agent = agent,
    global_dim = GLOBAL_DIM,
    per_arm_dim = PER_ARM_DIM,
    num_iterations = NUM_ITER_STEPS,
    steps_per_loop = STEPS_PER_LOOP,
    num_eval_steps = NUM_EVAL_STEPS,
    # data
    batch_size = HPARAMS['batch_size'],
    eval_batch_size = HPARAMS['eval_batch_size'],
    # functions
    _trajectory_fn = _trajectory_fn,
    # _run_bandit_eval_fn = _run_bandit_eval,
    # train intervals
    chkpt_interval = CHKPT_INTERVAL,
    log_interval = LOG_INTERVAL,
    # dirs
    bucket_name = BUCKET_NAME,
    data_dir_prefix_path = DATA_GCS_PREFIX,
    log_dir = LOG_DIR,
    model_dir = ARTIFACTS_DIR,
    root_dir = ROOT_DIR,
    async_steps_per_loop = ASYNC_STEPS_PER_LOOP,
    resume_training_loops = False,
    use_gpu = True,
    use_tpu = False,
    profiler = True,
    train_summary_writer = train_summary_writer,
    total_take = TOTAL_TAKE,
    global_step = global_step
    # additional_metrics = metrics,
)

end_time = time.time()
runtime_mins = int((end_time - start_time) / 60)
print(f"complete train job in {runtime_mins} minutes")

distribution_strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7f82df4d1090>
train_files: ['gs://mabv1-hybrid-vertex-bucket/data/train/ml-ratings-100k-train.tfrecord']
train_ds_iterator: <tensorflow.python.data.ops.iterator_ops.OwnedIterator object at 0x7f81b0787a00>
setting checkpoint_manager: gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-213342/root/chkpoint
Did not find a pre-existing checkpoint. Starting from scratch.
wrapping agent.train in tf-function
starting_loop: 0
starting train loop...
step = 0: loss = 15.90999984741211
step = 50: loss = 1.5700000524520874
runtime_mins: 1
saved trained policy to: gs://mabv1-hybrid-vertex-bucket/acc-paf-v2/run-20230822-213342/artifacts
complete train job in 1 minutes


In [34]:
# %load_ext tensorboard
%reload_ext tensorboard

In [35]:
%tensorboard --logdir=$LOG_DIR 