# Train Contextual Bandits with simple linear, stationary simulation environment

> In this notebook, we'll evaluate the performance of Contextual Bandits in stationary, stochastic environments. These are environments whose statistics (e.g., distribution of rewards) do not change over time

* A linear  environment is an environment where the expected reward for each arm `k` is linearly related to the context features `x`
* Train and evaluate linear and non-linear (neural) Contextual Bandit algorithms
* We'll show that linear-based agents converge faster on such an environment, compared to non-linear agents, becasue the problem is linear (by the reward calculation construction)

## Load notebook config

* use the prefix defined in `00-env-setup`

In [1]:
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"
DATA_PATH_KFP_DEMO       = "gs://rec-bandits-v2-hybrid-vertex-bucket/data/kfp_demo_data/u.data"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NA

## imports

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [4]:
import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar
from datetime import datetime
import time
from pprint import pprint
import pickle as pkl

from tqdm import tqdm

# logging
# logging
import logging
logging.disable(logging.WARNING)

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform, storage

# tensorflow
import tensorflow as tf
# from tf_agents.agents import TFAgent

from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities

from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.trajectories import trajectory

from tf_agents.bandits.replay_buffers import bandit_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.eval import metric_utils
# from tf_agents.google.metrics import export_utils
from tf_agents.metrics import export_utils
from tf_agents.metrics import tf_metrics
from tf_agents.policies import policy_saver

# GPU
from numba import cuda 
import gc

import sys
sys.path.append("..")

# this repo
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config
from src.per_arm_rl import train_utils as train_utils
from src.per_arm_rl import policy_util

# tf exceptions and vars
if tf.__version__[0] != "2":
    raise Exception("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")

In [5]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [7]:
device = cuda.get_current_device()
device.reset()
gc.collect()

14

In [8]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Data prep

### Read TF Records (for testing)

In [9]:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [10]:
SPLIT = "train" # "train" | "val"

train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/{SPLIT}'):
    if '.tfrecord' in blob.name:
        train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
train_files

['gs://rec-bandits-v2-hybrid-vertex-bucket/data/train/ml-ratings-100k-train.tfrecord']

In [11]:
train_dataset = tf.data.TFRecordDataset(train_files)
train_dataset = train_dataset.map(data_utils.parse_tfrecord)

for x in train_dataset.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([35.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'898'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([885409515])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'marketing'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}


### get vocab

In [12]:
GENERATE_VOCABS = False

print(f"GENERATE_VOCABS: {GENERATE_VOCABS}")

GENERATE_VOCABS: False


In [13]:
if not GENERATE_VOCABS:

    EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{VOCAB_SUBDIR}/{VOCAB_FILENAME}'
    print(f"Downloading vocab...")
    
    os.system(f'gsutil -q cp {EXISTING_VOCAB_FILE} .')
    print(f"Downloaded vocab from: {EXISTING_VOCAB_FILE}\n")

    filehandler = open(VOCAB_FILENAME, 'rb')
    vocab_dict = pkl.load(filehandler)
    filehandler.close()
    
    for key in vocab_dict.keys():
        pprint(key)

Downloading vocab...
Downloaded vocab from: gs://rec-bandits-v2-hybrid-vertex-bucket/vocabs/vocab_dict.pkl

'movie_id'
'user_id'
'user_occupation_text'
'movie_genres'
'bucketized_user_age'
'max_timestamp'
'min_timestamp'
'timestamp_buckets'


# Global & Per-Arm feature embedding models 

In [14]:
NUM_OOV_BUCKETS        = 1
GLOBAL_EMBEDDING_SIZE  = 16
MV_EMBEDDING_SIZE      = 32 #32

In [15]:
# test emb models
for i in range(1):
    
    iterator = iter(train_dataset.batch(1))
    data = next(iterator)

# data

In [16]:
from src.perarm_features import emb_features as emb_features

embs = emb_features.EmbeddingModel(
    vocab_dict = vocab_dict,
    num_oov_buckets = NUM_OOV_BUCKETS,
    global_emb_size = GLOBAL_EMBEDDING_SIZE,
    mv_emb_size = MV_EMBEDDING_SIZE,
)

embs

<src.perarm_features.emb_features.EmbeddingModel at 0x7fd50d721ea0>

In [17]:
test_globals = embs._get_global_context_features(data)

GLOBAL_DIM = test_globals.shape[1]            
# shape checks out at batch_dim, nactions, arm feats
print(f"GLOBAL_DIM: {GLOBAL_DIM}")

test_globals

GLOBAL_DIM: 64


array([[-0.04759464, -0.00214831,  0.01721566,  0.0187708 , -0.0373454 ,
         0.01577396,  0.02905227, -0.00304011, -0.02741228,  0.01382904,
         0.02101144, -0.00312436, -0.03728447,  0.03490255,  0.00524248,
         0.03051371, -0.04052462, -0.00137526,  0.03157919,  0.04315249,
         0.02337736, -0.02114924,  0.00967843,  0.0390544 ,  0.04298044,
        -0.02705672, -0.02637699, -0.03304051,  0.04473188,  0.0449608 ,
         0.02827417,  0.04939343, -0.01289447, -0.04441639, -0.01212721,
         0.04924201, -0.03900896,  0.03584861, -0.01095216,  0.00990552,
        -0.01494937,  0.01665027,  0.038335  , -0.01157652, -0.03773923,
        -0.03740966,  0.04190073, -0.02747232, -0.0273616 , -0.01514687,
        -0.01195233,  0.02162272,  0.01284235,  0.02455051,  0.04507736,
         0.02894325, -0.04565011, -0.01998092,  0.0319807 ,  0.00624039,
         0.02853158, -0.0445177 , -0.03798994, -0.02190473]],
      dtype=float32)

In [18]:
test_arms = embs._get_per_arm_features(data)

PER_ARM_DIM = test_arms.shape[1]            
# shape checks out at batch_dim, nactions, arm feats
print(f"PER_ARM_DIM: {PER_ARM_DIM}")

test_arms

PER_ARM_DIM: 64


array([[-0.02958772,  0.02897744, -0.03661335,  0.04742587, -0.04875497,
         0.01948533, -0.01095003,  0.01858981,  0.00386803, -0.03639517,
        -0.02595102, -0.00053853,  0.04987946,  0.00118763, -0.01411446,
         0.04728922, -0.04942104, -0.04120396, -0.03177999,  0.01887869,
        -0.04749788,  0.03166879, -0.01453044, -0.00444988, -0.04761336,
         0.0021589 ,  0.02910644, -0.04195349, -0.02880422,  0.01965005,
        -0.01373   , -0.04609764,  0.00417155, -0.03846981, -0.01318642,
        -0.00762614,  0.0135617 ,  0.02824224, -0.04315592, -0.00116351,
        -0.04233301,  0.01694881,  0.01993126, -0.04933593, -0.01437705,
        -0.00045543, -0.03693864, -0.00756365, -0.01560156, -0.01660501,
        -0.00648575, -0.04368071,  0.0378093 , -0.0417042 ,  0.04113103,
        -0.0407564 ,  0.0027818 , -0.03640454,  0.04335964, -0.03944666,
         0.00909806, -0.00306667, -0.02622664,  0.0296529 ]],
      dtype=float32)

# Stationary Stochastic Per-arm environment

## Per-arm environment
In this example we will use the [StationaryStochasticPerArmPyEnvironment](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/environments/stationary_stochastic_per_arm_py_environment.py) environment to simulate offline training data for training a contextual bandit with **per-arm features**
​
To initialize the per-arm environment, one has to define functions that generate
​
* *global and per-arm features*: These functions have no input parameters and generate a single (global or per-arm) feature vector when called.
​
* *rewards*: This function takes as parameter the concatenation of a global and a per-arm feature vector, and generates a reward. Basically this is the function that the agent will have to "guess". 
​
It is worth noting here that in the per-arm case the reward function is identical for every arm. This is a fundamental difference from the classic bandit case, where the agent has to estimate reward functions for each arm independently.

## Sampling and reward functions

The preproccesing layers will ultimately feed the two sampling functions described below. These sampling functions will be used to create [trajectories](https://github.com/tensorflow/agents/blob/master/tf_agents/trajectories/trajectory.py#L36) (i.e., the training examples for our model)

`global_context_sampling_fn`: 
* A function that outputs a random 1d array or list of ints or floats
* This output is the global context. Its shape and type must be consistent across calls.

`arm_context_sampling_fn`: 
* A function that outputs a random 1 array or list of ints or floats (same type as the output of `global_context_sampling_fn`). 
* This output is the per-arm context. Its shape must be consistent across calls.

`reward_fn`: function takes the concatenation of a global and a per-arm feature, and outputs a possibly random reward.

In [19]:
BATCH_SIZE      = 128
EVAL_BATCH_SIZE = 1
NUM_ACTIONS     = 20

### Sampling global context

In [20]:
def example_global_context_sampling_fn():
    # return np.random.randint(0, 10, [2])  # 2-dimensional global features.
    return np.random.randint(-1, 1, [GLOBAL_DIM]).astype(np.float32)

example_global_context_sampling_fn()

array([-1.,  0., -1.,  0.,  0.,  0., -1.,  0., -1., -1.,  0.,  0., -1.,
       -1., -1., -1.,  0.,  0.,  0.,  0.,  0., -1., -1.,  0., -1.,  0.,
       -1.,  0., -1.,  0.,  0., -1.,  0., -1.,  0.,  0.,  0., -1., -1.,
        0.,  0.,  0., -1.,  0., -1.,  0., -1.,  0.,  0., -1., -1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.],
      dtype=float32)

In [21]:
global_iterator = iter(train_dataset.batch(1).repeat())

def global_context_sampling_fn():
    
    data = next(global_iterator)
    
    return embs._get_global_context_features(data)[0]

In [22]:
test_global_sampling = global_context_sampling_fn()
print(f"shape: {test_global_sampling.shape}")

test_global_sampling

shape: (64,)


array([-0.04759464, -0.00214831,  0.01721566,  0.0187708 , -0.0373454 ,
        0.01577396,  0.02905227, -0.00304011, -0.02741228,  0.01382904,
        0.02101144, -0.00312436, -0.03728447,  0.03490255,  0.00524248,
        0.03051371, -0.04052462, -0.00137526,  0.03157919,  0.04315249,
        0.02337736, -0.02114924,  0.00967843,  0.0390544 ,  0.04298044,
       -0.02705672, -0.02637699, -0.03304051,  0.04473188,  0.0449608 ,
        0.02827417,  0.04939343, -0.01289447, -0.04441639, -0.01212721,
        0.04924201, -0.03900896,  0.03584861, -0.01095216,  0.00990552,
       -0.01494937,  0.01665027,  0.038335  , -0.01157652, -0.03773923,
       -0.03740966,  0.04190073, -0.02747232, -0.0273616 , -0.01514687,
       -0.01195233,  0.02162272,  0.01284235,  0.02455051,  0.04507736,
        0.02894325, -0.04565011, -0.01998092,  0.0319807 ,  0.00624039,
        0.02853158, -0.0445177 , -0.03798994, -0.02190473], dtype=float32)

### Sampling arm context

In [23]:
def example_arm_context_sampling_fn():
    # return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.
    return np.random.randint(-2, 3, [PER_ARM_DIM]).astype(np.float32)

example_arm_context_sampling_fn()

array([ 2.,  2.,  1.,  0., -1.,  0., -2.,  2.,  2.,  0., -1.,  1.,  1.,
        0.,  0.,  1., -1., -2.,  0.,  2., -2.,  1.,  0.,  1.,  2.,  2.,
       -1., -2.,  1., -1.,  0.,  1.,  2.,  0., -2., -1., -2.,  0., -2.,
        1.,  0.,  0.,  0., -1., -2.,  2.,  2., -2., -1., -2.,  1.,  1.,
       -1., -1., -1.,  2.,  2.,  1., -1.,  0.,  1.,  0.,  1., -2.],
      dtype=float32)

In [24]:
arm_iterator = iter(train_dataset.batch(1).repeat())
# arm_iterator = iter(train_dataset.repeat())

def arm_context_sampling_fn():
    
    data = next(arm_iterator)
    
    return embs._get_per_arm_features(data)[0]

In [25]:
test_arm_sampling = arm_context_sampling_fn()
print(f"shape: {test_arm_sampling.shape}")

test_arm_sampling

shape: (64,)


array([-0.02958772,  0.02897744, -0.03661335,  0.04742587, -0.04875497,
        0.01948533, -0.01095003,  0.01858981,  0.00386803, -0.03639517,
       -0.02595102, -0.00053853,  0.04987946,  0.00118763, -0.01411446,
        0.04728922, -0.04942104, -0.04120396, -0.03177999,  0.01887869,
       -0.04749788,  0.03166879, -0.01453044, -0.00444988, -0.04761336,
        0.0021589 ,  0.02910644, -0.04195349, -0.02880422,  0.01965005,
       -0.01373   , -0.04609764,  0.00417155, -0.03846981, -0.01318642,
       -0.00762614,  0.0135617 ,  0.02824224, -0.04315592, -0.00116351,
       -0.04233301,  0.01694881,  0.01993126, -0.04933593, -0.01437705,
       -0.00045543, -0.03693864, -0.00756365, -0.01560156, -0.01660501,
       -0.00648575, -0.04368071,  0.0378093 , -0.0417042 ,  0.04113103,
       -0.0407564 ,  0.0027818 , -0.03640454,  0.04335964, -0.03944666,
        0.00909806, -0.00306667, -0.02622664,  0.0296529 ], dtype=float32)

### Reward function 

> compute the dot product between global user context features and arm context features

In this environment, the reward function will take as input the concatenation of the global and arm features generated from the provided sampling functions. See it used in the environment's `_apply_action()` function [here](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/environments/stationary_stochastic_per_arm_py_environment.py#L170C18-L170C28)

```python
feat_concat = np.concatenate((test_global_sampling, test_arm_sampling))
reward = np.dot(feat_concat[:GLOBAL_DIM], feat_concat[PER_ARM_DIM:])
```

In [26]:
test_global_sampling #[0]

array([-0.04759464, -0.00214831,  0.01721566,  0.0187708 , -0.0373454 ,
        0.01577396,  0.02905227, -0.00304011, -0.02741228,  0.01382904,
        0.02101144, -0.00312436, -0.03728447,  0.03490255,  0.00524248,
        0.03051371, -0.04052462, -0.00137526,  0.03157919,  0.04315249,
        0.02337736, -0.02114924,  0.00967843,  0.0390544 ,  0.04298044,
       -0.02705672, -0.02637699, -0.03304051,  0.04473188,  0.0449608 ,
        0.02827417,  0.04939343, -0.01289447, -0.04441639, -0.01212721,
        0.04924201, -0.03900896,  0.03584861, -0.01095216,  0.00990552,
       -0.01494937,  0.01665027,  0.038335  , -0.01157652, -0.03773923,
       -0.03740966,  0.04190073, -0.02747232, -0.0273616 , -0.01514687,
       -0.01195233,  0.02162272,  0.01284235,  0.02455051,  0.04507736,
        0.02894325, -0.04565011, -0.01998092,  0.0319807 ,  0.00624039,
        0.02853158, -0.0445177 , -0.03798994, -0.02190473], dtype=float32)

In [27]:
test_arm_sampling #[0]

array([-0.02958772,  0.02897744, -0.03661335,  0.04742587, -0.04875497,
        0.01948533, -0.01095003,  0.01858981,  0.00386803, -0.03639517,
       -0.02595102, -0.00053853,  0.04987946,  0.00118763, -0.01411446,
        0.04728922, -0.04942104, -0.04120396, -0.03177999,  0.01887869,
       -0.04749788,  0.03166879, -0.01453044, -0.00444988, -0.04761336,
        0.0021589 ,  0.02910644, -0.04195349, -0.02880422,  0.01965005,
       -0.01373   , -0.04609764,  0.00417155, -0.03846981, -0.01318642,
       -0.00762614,  0.0135617 ,  0.02824224, -0.04315592, -0.00116351,
       -0.04233301,  0.01694881,  0.01993126, -0.04933593, -0.01437705,
       -0.00045543, -0.03693864, -0.00756365, -0.01560156, -0.01660501,
       -0.00648575, -0.04368071,  0.0378093 , -0.0417042 ,  0.04113103,
       -0.0407564 ,  0.0027818 , -0.03640454,  0.04335964, -0.03944666,
        0.00909806, -0.00306667, -0.02622664,  0.0296529 ], dtype=float32)

In [28]:
HIDDEN_PARAM = list(np.random.randint(0, 10, [GLOBAL_DIM + PER_ARM_DIM]))

class LinearNormalReward(object):
    def __init__(self, theta):
        self.theta = theta

    def __call__(self, x):
        mu = np.dot(x, self.theta)
        return np.random.normal(mu, 1)
    
reward_fn = LinearNormalReward(HIDDEN_PARAM)

## Initialize environment

In [29]:
print(f"BATCH_SIZE      : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE : {EVAL_BATCH_SIZE}")
print(f"NUM_ACTIONS     : {NUM_ACTIONS}")
print(f"GLOBAL_DIM      : {GLOBAL_DIM}")
print(f"PER_ARM_DIM     : {PER_ARM_DIM}")

BATCH_SIZE      : 128
EVAL_BATCH_SIZE : 1
NUM_ACTIONS     : 20
GLOBAL_DIM      : 64
PER_ARM_DIM     : 64


In [30]:
per_arm_py_env = p_a_env.StationaryStochasticPerArmPyEnvironment(
    global_context_sampling_fn=global_context_sampling_fn,
    arm_context_sampling_fn=arm_context_sampling_fn,
    max_num_actions=NUM_ACTIONS,
    reward_fn=reward_fn,
    batch_size=BATCH_SIZE
)
per_arm_tf_env = tf_py_environment.TFPyEnvironment(per_arm_py_env)

In [31]:
# print('observation spec: ', per_arm_tf_env.observation_spec())
# print('\nAn observation: ', per_arm_tf_env.reset().observation)

action = tf.zeros(BATCH_SIZE, dtype=tf.int32)
time_step = per_arm_tf_env.step(action)
print('\nRewards after taking an action: ', time_step.reward)


Rewards after taking an action:  tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.], shape=(128,), dtype=float32)


In [32]:
observation_spec = per_arm_tf_env.observation_spec()
# observation_spec

In [33]:
time_step_spec = ts.time_step_spec(observation_spec)
# time_step_spec

In [34]:
action_spec = tensor_spec.BoundedTensorSpec(
    dtype=tf.int32, 
    shape=(), 
    minimum=0, 
    maximum=NUM_ACTIONS - 1
)
# action_spec

## Define Agent Policy

In [36]:
# ================================
# Agents
# ================================
AGENT_TYPE      = 'NeuralLinUCB' # 'LinUCB' | 'LinTS |, 'epsGreedy' | 'NeuralLinUCB'

# Parameters for linear agents (LinUCB and LinTS).
AGENT_ALPHA     = 0.1

# Parameters for neural agents (NeuralEpsGreedy and NerualLinUCB).
EPSILON         = 0.01
LR              = 0.05

# Parameters for NeuralLinUCB
ENCODING_DIM    = 1
EPS_PHASE_STEPS = 1000

# ================================
# Agent's Preprocess Network
# ================================
GLOBAL_LAYERS   = [64, 32, 16] # beginning should be of size: GLOBAL_DIM
ARM_LAYERS      = [64, 32, 16] # beginning should be of size: PER_ARM_DIM
COMMON_LAYERS   = [16, 8]

NETWORK_TYPE    = "dotproduct" # 'dotproduct' | 'commontower'

if AGENT_TYPE == 'NeuralLinUCB':
    NETWORK_TYPE = 'commontower'
    ENCODING_DIM = COMMON_LAYERS[-1]

HPARAMS = {  # TODO - streamline and consolidate
    "batch_size": BATCH_SIZE,
    "eval_batch_size" : EVAL_BATCH_SIZE,
    "num_actions": NUM_ACTIONS,
    "model_type": AGENT_TYPE,
    "network_type": NETWORK_TYPE,
    "global_layers": GLOBAL_LAYERS,
    "per_arm_layers": ARM_LAYERS,
    "common_layers": COMMON_LAYERS,
    "learning_rate": LR,
    "epsilon": EPSILON,
}
pprint(HPARAMS)

{'batch_size': 128,
 'common_layers': [16, 8],
 'epsilon': 0.01,
 'eval_batch_size': 1,
 'global_layers': [64, 32, 16],
 'learning_rate': 0.05,
 'model_type': 'NeuralLinUCB',
 'network_type': 'commontower',
 'num_actions': 20,
 'per_arm_layers': [64, 32, 16]}


In [37]:
from src.perarm_features import agent_factory as agent_factory

global_step = tf.compat.v1.train.get_or_create_global_step()

agent = agent_factory.PerArmAgentFactory._get_agent(
    agent_type = AGENT_TYPE,
    network_type = NETWORK_TYPE,
    time_step_spec = time_step_spec,
    action_spec = action_spec,
    observation_spec=observation_spec,
    global_layers = GLOBAL_LAYERS,
    arm_layers = ARM_LAYERS,
    common_layers = COMMON_LAYERS,
    agent_alpha = AGENT_ALPHA,
    learning_rate = LR,
    epsilon = EPSILON,
    train_step_counter = global_step,
    output_dim = ENCODING_DIM,
    eps_phase_steps = EPS_PHASE_STEPS,
    summarize_grads_and_vars = True,
    debug_summaries = True
)
      
# agent.initialize()
print(f'agent: {agent.name}')

# if network:
#     print(f"Network: {network}")

if NETWORK_TYPE:
    print(f"Network: {NETWORK_TYPE}")

agent: neural_linucb_agent
Network: commontower


### The Flow of Training Data

look at the data specification in the agent. The training_data_spec attribute of the agent specifies what elements and structure the training data should have.

In [38]:
print('training data spec: ', agent.training_data_spec)

training data spec:  Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(19, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None)}),
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=TensorSpec(shape=(20,), dtype=tf.float32, name=None), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=(), chosen_arm_features=TensorSpec(shape=(64,), dtype=tf.float32, name=None)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})


In [39]:
def _all_rewards(observation, hidden_param):
    """Outputs rewards for all actions, given an observation."""
    hidden_param = tf.cast(hidden_param, dtype=tf.float32)
    global_obs = observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
    per_arm_obs = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
    num_actions = tf.shape(per_arm_obs)[1]
    tiled_global = tf.tile(
        tf.expand_dims(global_obs, axis=1), [1, num_actions, 1]
    )
    concatenated = tf.concat([tiled_global, per_arm_obs], axis=-1)
    rewards = tf.linalg.matvec(concatenated, hidden_param)
    return rewards

def optimal_reward(observation):
    """
    Outputs the maximum expected reward 
    for every element in the batch.
    """
    return tf.reduce_max(_all_rewards(observation, HIDDEN_PARAM), axis=1)

regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward)

In [40]:
def optimal_action(observation, hidden_param):
    
    return tf.argmax(
        _all_rewards(observation, hidden_param), axis=1, output_type=tf.int32
    )

optimal_action_fn = functools.partial(
    optimal_action, hidden_param=HIDDEN_PARAM
)

suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
    optimal_action_fn
)

In [41]:
metrics = [regret_metric, suboptimal_arms_metric]
metrics

[<tf_agents.bandits.metrics.tf_metrics.RegretMetric at 0x7fd3e4090eb0>,
 <tf_agents.bandits.metrics.tf_metrics.SuboptimalArmsMetric at 0x7fd3e4090520>]

# Training

In [42]:
from src.per_arm_rl import trainer_baseline

## Vertex AI Experiments

In [43]:
EXPERIMENT_NAME   = f'train-stationary-env-{PREFIX}'

# new experiment
invoke_time       = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME          = f'run-{invoke_time}'

BASE_OUTPUT_DIR   = f'{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}'
LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
ROOT_DIR          = f"{BASE_OUTPUT_DIR}/root"       # Root directory for writing logs/summaries/checkpoints.
ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"  # Where the trained model will be saved and restored.

# aiplatform.init(
#     project=PROJECT_ID,
#     location=REGION,
#     experiment=EXPERIMENT_NAME
# )

print(f"EXPERIMENT_NAME   : {EXPERIMENT_NAME}")
print(f"RUN_NAME          : {RUN_NAME}\n")
print(f"BASE_OUTPUT_DIR   : {BASE_OUTPUT_DIR}")
print(f"LOG_DIR           : {LOG_DIR}")
print(f"ROOT_DIR          : {ROOT_DIR}")
print(f"ARTIFACTS_DIR     : {ARTIFACTS_DIR}")

EXPERIMENT_NAME   : train-stationary-env-rec-bandits-v2
RUN_NAME          : run-20231110-173254

BASE_OUTPUT_DIR   : gs://rec-bandits-v2-hybrid-vertex-bucket/train-stationary-env-rec-bandits-v2/run-20231110-173254
LOG_DIR           : gs://rec-bandits-v2-hybrid-vertex-bucket/train-stationary-env-rec-bandits-v2/run-20231110-173254/logs
ROOT_DIR          : gs://rec-bandits-v2-hybrid-vertex-bucket/train-stationary-env-rec-bandits-v2/run-20231110-173254/root
ARTIFACTS_DIR     : gs://rec-bandits-v2-hybrid-vertex-bucket/train-stationary-env-rec-bandits-v2/run-20231110-173254/artifacts


### Create TensorBoard

In [44]:
# # create new TB instance
TENSORBOARD_DISPLAY_NAME=f"{EXPERIMENT_NAME}-{RUN_NAME}"

tensorboard = aiplatform.Tensorboard.create(
    display_name=TENSORBOARD_DISPLAY_NAME
    , project=PROJECT_ID
    , location=REGION
)

TB_RESOURCE_NAME = tensorboard.resource_name

TB_ID = TB_RESOURCE_NAME.split('/')[-1]

print(f"TB_RESOURCE_NAME : {TB_RESOURCE_NAME}")
print(f"TB display name  : {tensorboard.display_name}")
print(f"TB_ID            : {TB_ID}")

TB_RESOURCE_NAME : projects/934903580331/locations/us-central1/tensorboards/7085573986783854592
TB display name  : train-stationary-env-rec-bandits-v2-run-20231110-173254
TB_ID            : 7085573986783854592


## Trainer loop

In [45]:
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    experiment=EXPERIMENT_NAME,
    experiment_tensorboard=TB_ID
)

In [46]:
TRAINING_LOOPS = 50
STEPS_PER_LOOP = 1
LOG_INTERVAL = TRAINING_LOOPS // 10

print(f'LOG_INTERVAL: {LOG_INTERVAL}')

LOG_INTERVAL: 5


In [48]:
# Continuous monitoring
aiplatform.start_upload_tb_log(
    # tensorboard_id=TB_RESOURCE_NAME,
    tensorboard_experiment_name=EXPERIMENT_NAME,
    logdir=LOG_DIR,
    experiment_display_name=EXPERIMENT_NAME,
    run_name_prefix=RUN_NAME,
    # description=description,
)

#start the timer and training
start_time = time.time()

trainer_baseline.train(
    # root_dir=LOG_DIR,
    artifact_dir=ARTIFACTS_DIR,
    log_dir=LOG_DIR,
    agent=agent,
    environment=per_arm_tf_env,
    training_loops=TRAINING_LOOPS,
    steps_per_loop=STEPS_PER_LOOP,
    additional_metrics=metrics,
    save_policy=True
)

aiplatform.end_upload_tb_log()

end_time = time.time()
runtime_mins = int((end_time - start_time) / 60)
print(f"train runtime_mins: {runtime_mins}")

## Evlauate trained policy

<img src="imgs/linear_ss_env_train.png" 
     align="center" 
     width="850"
     height="850"/>

In [45]:
%load_ext tensorboard
# %reload_ext tensorboard

In [46]:
%tensorboard --logdir=$LOG_DIR 

**Finished**