# Train Bandits with per-arm features

**Exploring linear and nonlinear** (e.g., those with neural network-based value functions) bandit methods for recommendations using TF-Agents

> Neural linear bandits provide a nice way to leverage the representation power of deep learning and the bandit approach for uncertainty measure and efficient exploration

## Load notebook config

* use the prefix defined in `00-env-setup`

In [1]:
PREFIX = 'mabv1'

In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "mabv1"
VERSION                  = "v1"

BUCKET_NAME              = "mabv1-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://mabv1-hybrid-vertex-bucket/data"
BUCKET_URI               = "gs://mabv1-hybrid-vertex-bucket"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid-vertex.movielens_dataset_mabv1"
BIGQUERY_TABLE_ID        = "hybrid-vertex.movielens_dataset_mabv1.training_dataset"

REPO_DOCKER_PATH_PREFIX  = "src"
RL_SUB_DIR               = "per_arm_rl"



## imports

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [44]:
import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar
from datetime import datetime
import time
from pprint import pprint
import pickle as pkl

# logging
import logging
logging.disable(logging.WARNING)

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform, storage

# tensorflow
import tensorflow as tf
# from tf_agents.agents import TFAgent

# from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
# from tf_agents.drivers import dynamic_step_driver
# from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# from tf_agents.bandits.agents import lin_ucb_agent
# from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities

from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.trajectories import trajectory

# GPU
from numba import cuda 
import gc

import sys
sys.path.append("..")

# this repo
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config

# tf exceptions and vars
if tf.__version__[0] != "2":
    raise Exception("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")

In [5]:
# gpus = tf.config.experimental.list_physical_devices('GPU')
# for gpu in gpus:
#     tf.config.experimental.set_memory_growth(gpu, True)
    
# gpus

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [6]:
device = cuda.get_current_device()
device.reset()
gc.collect()

14

In [7]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Data prep

### Read TF Records

In [8]:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [9]:
SPLIT = "val" # "train" | "val"

train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/{SPLIT}'):
    if '.tfrecord' in blob.name:
        train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
train_files

['gs://mabv1-hybrid-vertex-bucket/data/val/ml-ratings-100k-val.tfrecord']

In [10]:
train_dataset = tf.data.TFRecordDataset(train_files)
train_dataset = train_dataset.map(data_utils.parse_tfrecord)

for x in train_dataset.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([25.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[4]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'211'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([874948475])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'other'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}


### get vocab

**TODO:** 
* streamline vocab calls

In [11]:
GENERATE_VOCABS = False
print(f"GENERATE_VOCABS: {GENERATE_VOCABS}")

VOCAB_SUBDIR   = "vocabs"
VOCAB_FILENAME = "vocab_dict.pkl"

GENERATE_VOCABS: False


In [12]:
if not GENERATE_VOCABS:

    EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{VOCAB_SUBDIR}/{VOCAB_FILENAME}'
    print(f"Downloading vocab...")
    
    os.system(f'gsutil -q cp {EXISTING_VOCAB_FILE} .')
    print(f"Downloaded vocab from: {EXISTING_VOCAB_FILE}\n")

    filehandler = open(VOCAB_FILENAME, 'rb')
    vocab_dict = pkl.load(filehandler)
    filehandler.close()
    
    for key in vocab_dict.keys():
        pprint(key)

Downloading vocab...
Downloaded vocab from: gs://mabv1-hybrid-vertex-bucket/vocabs/vocab_dict.pkl

'movie_id'
'user_id'
'user_occupation_text'
'movie_genres'
'bucketized_user_age'
'max_timestamp'
'min_timestamp'
'timestamp_buckets'


## helper functions

**TODO:**
* modularize in a train_utils or similar

In [13]:
# def _get_global_feature_list(input_features):
#     """Return list of global features."""
#     global_feature_names = ['user_id', 'bucketized_user_age', 'user_occupation_text', 'timestamp']
#     global_features = []
#     for global_feature in global_feature_names:
#         if global_feature in input_features:
#             global_features.append(input_features[global_feature])
#         else:
#             logging.error('Missing global feature %s', global_feature)
#     return global_features

# def _get_per_arm_feature_dict(input_features):
#     """Returns a dictionary mapping feature key to per arm features."""
#     per_arm_feature_names = ['movie_id', 'movie_genres']
#     arm_features = {}
#     for per_arm_feature in per_arm_feature_names:
#         if per_arm_feature in input_features:
#             arm_features[per_arm_feature] = input_features[per_arm_feature]
#         else:
#             logging.error('Missing per arm feature %s', per_arm_feature)
#     return arm_features

def _add_outer_dimension(x):
    """Adds an extra outer dimension."""
    if isinstance(x, dict):
        for key, value in x.items():
            x[key] = tf.expand_dims(value, 1)
        return x
    return tf.expand_dims(x, 1)

# Multi-Armed Bandits with Per-Arm Features

In [14]:
# from tf_agents.bandits.agents import lin_ucb_agent
# from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
# from tf_agents.drivers import dynamic_step_driver
# from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer

nest = tf.nest

## Preprocessing layers for global and arm features

The preproccesing layers will ultimately feed the two functions described below, both of which will ultimately feed the `Environment`

`global_context_sampling_fn`: 
* A function that outputs a random 1d array or list of ints or floats
* This output is the global context. Its shape and type must be consistent across calls.

`arm_context_sampling_fn`: 
* A function that outputs a random 1 array or list of ints or floats (same type as the output of `global_context_sampling_fn`). * This output is the per-arm context. Its shape must be consistent across calls.

In [15]:
NUM_OOV_BUCKETS        = 1
GLOBAL_EMBEDDING_SIZE  = 4
MV_EMBEDDING_SIZE      = 8 #32

# HPARAMS = { # TODO - explain these and their options
#     "batch_size": 8,
#     "num_actions": 3,
#     "model_type": "neural_epsilon_greedy",
#     "network_type": 'commontower',
#     "global_layers": [16,4],
#     "per_arm_layers": [16,4],
#     "common_layers": [4],
#     "learning_rate": 0.05,
#     "epsilon":0.01,
# }

### global context (user) features

#### user ID

In [16]:
user_id_input_layer = tf.keras.Input(
    name="user_id",
    shape=(1,),
    dtype=tf.string
)

user_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['user_id'],
)(user_id_input_layer)

user_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_id_lookup)

user_id_embedding = tf.reduce_sum(user_id_embedding, axis=-2)

# global_inputs.append(user_id_input_layer)
# global_features.append(user_id_embedding)

In [17]:
test_user_id_model = tf.keras.Model(inputs=user_id_input_layer, outputs=user_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["user_id"])
    print(test_user_id_model(x["user_id"]))

tf.Tensor([b'346'], shape=(1,), dtype=string)
tf.Tensor([[-0.04257477 -0.0204685  -0.03652425 -0.0370932 ]], shape=(1, 4), dtype=float32)


#### user AGE

In [18]:
user_age_input_layer = tf.keras.Input(
    name="bucketized_user_age",
    shape=(1,),
    dtype=tf.float32
)

user_age_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab_dict['bucketized_user_age'],
    num_oov_indices=NUM_OOV_BUCKETS,
    oov_value=0,
)(user_age_input_layer)

user_age_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['bucketized_user_age']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_age_lookup)

user_age_embedding = tf.reduce_sum(user_age_embedding, axis=-2)

# global_inputs.append(user_age_input_layer)
# global_features.append(user_age_embedding)

In [19]:
test_user_age_model = tf.keras.Model(inputs=user_age_input_layer, outputs=user_age_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["bucketized_user_age"])
    print(test_user_age_model(x["bucketized_user_age"]))

tf.Tensor([25.], shape=(1,), dtype=float32)
tf.Tensor([[-0.01470114  0.03408908 -0.01681744  0.04299467]], shape=(1, 4), dtype=float32)


#### user OCC

In [20]:
user_occ_input_layer = tf.keras.Input(
    name="user_occupation_text",
    shape=(1,),
    dtype=tf.string
)

user_occ_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['user_occupation_text']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['user_occupation_text'],
)(user_occ_input_layer)

user_occ_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['user_occupation_text']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_occ_lookup)

user_occ_embedding = tf.reduce_sum(user_occ_embedding, axis=-2)

# global_inputs.append(user_occ_input_layer)
# global_features.append(user_occ_embedding)

In [21]:
test_user_occ_model = tf.keras.Model(inputs=user_occ_input_layer, outputs=user_occ_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["user_occupation_text"])
    print(test_user_occ_model(x["user_occupation_text"]))

tf.Tensor([b'other'], shape=(1,), dtype=string)
tf.Tensor([[-0.04057591 -0.01261982  0.04479137 -0.04452629]], shape=(1, 4), dtype=float32)


#### user Timestamp

In [22]:
user_ts_input_layer = tf.keras.Input(
    name="timestamp",
    shape=(1,),
    dtype=tf.int64
)

user_ts_lookup = tf.keras.layers.Discretization(
    vocab_dict['timestamp_buckets'].tolist()
)(user_ts_input_layer)

user_ts_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['timestamp_buckets'].tolist()) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_ts_lookup)

user_ts_embedding = tf.reduce_sum(user_ts_embedding, axis=-2)

# global_inputs.append(user_ts_input_layer)
# global_features.append(user_ts_embedding)

In [23]:
test_user_ts_model = tf.keras.Model(inputs=user_ts_input_layer, outputs=user_ts_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["timestamp"])
    print(test_user_ts_model(x["timestamp"]))

tf.Tensor([874948475], shape=(1,), dtype=int64)
tf.Tensor([[-0.04684504  0.03435652 -0.02922553  0.01416435]], shape=(1, 4), dtype=float32)


#### define global sampling function

In [24]:
def _get_global_context_features(x):
    """
    This function generates a single global observation vector.
    """
    user_id_value = x['user_id']
    user_age_value = x['bucketized_user_age']
    user_occ_value = x['user_occupation_text']
    user_ts_value = x['timestamp']

    _id = test_user_id_model(user_id_value) # input_tensor=tf.Tensor(shape=(4,), dtype=float32)
    _age = test_user_age_model(user_age_value)
    _occ = test_user_occ_model(user_occ_value)
    _ts = test_user_ts_model(user_ts_value)

    # # tmp - insepct numpy() values
    # print(_id.numpy()) #[0])
    # print(_age.numpy()) #[0])
    # print(_occ.numpy()) #[0])
    # print(_ts.numpy()) #[0])

    # to numpy array
    _id = np.array(_id.numpy()[0])
    _age = np.array(_age.numpy()[0])
    _occ = np.array(_occ.numpy()[0])
    _ts = np.array(_ts.numpy()[0])

    concat = np.concatenate(
        [_id, _age, _occ, _ts], axis=-1 # -1
    ).astype(np.float32)

    return concat

In [25]:
for x in train_dataset.batch(1).take(1):
    test_globals = _get_global_context_features(x)

GLOBAL_DIM = test_globals.shape[0]
print(f"GLOBAL_DIM: {GLOBAL_DIM}")

test_globals

GLOBAL_DIM: 16


array([-0.04257477, -0.0204685 , -0.03652425, -0.0370932 , -0.01470114,
        0.03408908, -0.01681744,  0.04299467, -0.04057591, -0.01261982,
        0.04479137, -0.04452629, -0.04684504,  0.03435652, -0.02922553,
        0.01416435], dtype=float32)

### arm preprocessing layers

#### movie ID

In [26]:
mv_id_input_layer = tf.keras.Input(
    name="movie_id",
    shape=(1,),
    dtype=tf.string
)

mv_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['movie_id'],
)(mv_id_input_layer)

mv_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_id_lookup)

mv_id_embedding = tf.reduce_sum(mv_id_embedding, axis=-2)

# arm_inputs.append(mv_id_input_layer)
# arm_features.append(mv_id_embedding)

In [27]:
test_mv_id_model = tf.keras.Model(inputs=mv_id_input_layer, outputs=mv_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_id"])
    print(test_mv_id_model(x["movie_id"]))

tf.Tensor([b'211'], shape=(1,), dtype=string)
tf.Tensor(
[[-0.04094781  0.03707052 -0.02573409 -0.02102193 -0.0301262  -0.00709243
   0.03189244  0.02086892]], shape=(1, 8), dtype=float32)


#### movie genre

In [28]:
mv_genre_input_layer = tf.keras.Input(
    name="movie_genres",
    shape=(1,),
    dtype=tf.float32
)

mv_genre_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab_dict['movie_genres'],
    num_oov_indices=NUM_OOV_BUCKETS,
    oov_value=0,
)(mv_genre_input_layer)

mv_genre_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_genres']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_genre_lookup)

mv_genre_embedding = tf.reduce_sum(mv_genre_embedding, axis=-2)

# arm_inputs.append(mv_genre_input_layer)
# arm_features.append(mv_genre_embedding)

In [29]:
test_mv_gen_model = tf.keras.Model(inputs=mv_genre_input_layer, outputs=mv_genre_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_genres"])
    print(test_mv_gen_model(x["movie_genres"]))

tf.Tensor([[4]], shape=(1, 1), dtype=int64)
tf.Tensor(
[[-0.00483013 -0.03928108  0.0106586   0.01059937  0.00203977  0.00540863
  -0.04190093  0.03950352]], shape=(1, 8), dtype=float32)


#### define sampling function

In [30]:
def _get_per_arm_features(x):
    """
    This function generates a single per-arm observation vector
    """
    mv_id_value = x['movie_id']
    mv_gen_value = x['movie_genres'][0]

    _mid = test_mv_id_model(mv_id_value)
    _mgen = test_mv_gen_model(mv_gen_value)

    # to numpy array
    _mid = np.array(_mid.numpy()[0])
    _mgen = np.array(_mgen.numpy()[0])

    # print(_mid)
    # print(_mgen)

    concat = np.concatenate(
        [_mid, _mgen], axis=-1 # -1
    ).astype(np.float32)
    # concat = tf.concat([_mid, _mgen], axis=-1).astype(np.float32)

    return concat

In [31]:
for x in train_dataset.batch(1).take(1):
    test_arms = _get_per_arm_features(x)

PER_ARM_DIM = test_arms.shape[0]
print(f"PER_ARM_DIM: {PER_ARM_DIM}")

test_arms

PER_ARM_DIM: 16


array([-0.04094781,  0.03707052, -0.02573409, -0.02102193, -0.0301262 ,
       -0.00709243,  0.03189244,  0.02086892, -0.00483013, -0.03928108,
        0.0106586 ,  0.01059937,  0.00203977,  0.00540863, -0.04190093,
        0.03950352], dtype=float32)

In [32]:
# GLOBAL_DIM = global_context_sampling_fn()
# GLOBAL_DIM = GLOBAL_DIM.shape[0]
# print(GLOBAL_DIM)

# PER_ARM_DIM = per_arm_context_sampling_fn()
# PER_ARM_DIM = PER_ARM_DIM.shape[0]
# print(PER_ARM_DIM)

## TF-Agents implementation

In TF-Agents, the *per-arm features* implementation differs from the *global-only* feature examples in the following aspects:
* Reward is modeled not per-arm, but globally.
* The arms are permutation invariant: it doesn’t matter which arm is arm 1 or arm 2, only their features.
* One can have a different number of arms to choose from in every step (note that unspecified/dynamically changing number of arms will have a problem with XLA compatibility).

When implementing per-arm features in TF-Bandits, the following details have to be discussed:
* Observation spec and observations,
* Action spec and actions,
* Implementation of specific policies and agents.


**TODO:**
* outline the components and highlight their interactions, dependencies on eachother, etc.

In [33]:
BATCH_SIZE  = 8
NUM_ACTIONS = 1 

print(f"BATCH_SIZE  : {BATCH_SIZE}")
print(f"NUM_ACTIONS : {NUM_ACTIONS}")

BATCH_SIZE  : 8
NUM_ACTIONS : 1


## Tensor Specs

**TODO:**
* explain relationship between Tensor Specs and their Tensor counterparts
* highlight the errors, lessons learned, and utility functions to address these

### Observation spec

**This observation spec allows the user to have a global observation of fixed dimension**, and an unspecified number of *per-arm* features (also of fixed dimension)
* The actions output by the policy are still integers as usual, and they indicate which row of the arm-features it has chosen 
* The action spec must be a single integer value without boundaries:

```python
global_spec = tensor_spec.TensorSpec([GLOBAL_DIM], tf.float32)
per_arm_spec = tensor_spec.TensorSpec([None, PER_ARM_DIM], tf.float32)
observation_spec = {'global': global_spec, 'per_arm': per_arm_spec}

action_spec = tensor_spec.TensorSpec((), tf.int32)
```
> Here the only difference compared to the action spec with global features only is that the tensor spec is not bounded, as we don’t know how many arms there will be at any time step

**XLA compatibility:**
* Since dynamic tensor shapes are not compatible with XLA, the number of arm features (and consequently, number of arms for a step) cannot be dynamic. 
* One workaround is to fix the maximum number of arms for a problem, then pad the arm features in steps with fewer arms, and use action masking to indicate how many arms are actually active.

```python
per_arm_spec = tensor_spec.TensorSpec([NUM_ACTIONS, PER_ARM_DIM], tf.float32)

action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum = 0, maximum = NUM_ACTIONS - 1
)
```

In [34]:
# global_spec = tensor_spec.TensorSpec(shape=[GLOBAL_DIM], dtype=tf.float32)
# per_arm_spec = tensor_spec.TensorSpec(shape=[NUM_ACTIONS, PER_ARM_DIM], dtype=tf.float32)

# add outer nested dim
# global_spec = tensor_spec.add_outer_dims_nest(  # add_outer_dim
#     specs=global_spec,
#     outer_dims=[HPARAMS['batch_size']]
# )
# per_arm_spec = tensor_spec.add_outer_dims_nest( # add_outer_dim
#     specs=per_arm_spec,
#     outer_dims=[HPARAMS['batch_size']]
# )

# observation_spec = {'global': global_spec, 'per_arm': per_arm_spec}
# observation_spec

In [77]:
# TF-Agents has many helper and utility functions
observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
    GLOBAL_DIM, PER_ARM_DIM, NUM_ACTIONS, 
    add_num_actions_feature=True
) # 2,3,4

observation_spec

{'global': TensorSpec(shape=(16,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(1, 16), dtype=tf.float32, name=None),
 'num_actions': BoundedTensorSpec(shape=(), dtype=tf.int32, name=None, minimum=array(1, dtype=int32), maximum=array(1, dtype=int32))}

### Action spec

> The time_step_spec and action_spec are specifications for the input time step and the output action

```python
    if (
        not tensor_spec.is_bounded(action_spec)
        or not tensor_spec.is_discrete(action_spec)
        or action_spec.shape.rank > 1
        or action_spec.shape.num_elements() != 1
    ):
      raise NotImplementedError(
          'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
          'Found {}.'.format(action_spec)
      )
```

* [src](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/policies/reward_prediction_base_policy.py#L97)

In [38]:
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), 
    dtype=tf.int32,
    minimum=tf.constant(0),             # 0 
    maximum=NUM_ACTIONS-tf.constant(1), # -1
    name="action_spec"
)

# # len() should not be > 1
# flat_action_spec = tf.nest.flatten(action_spec)
# print(f"len(flat_action_spec): {len(flat_action_spec)}")
# print(f"flat_action_spec     : {flat_action_spec}")

# action_spec = flat_action_spec[0]
action_spec

len(flat_action_spec): 1
flat_action_spec     : [BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(0, dtype=int32))]


BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(0, dtype=int32))

In [39]:
expected_num_actions = action_spec.maximum - action_spec.minimum + 1
print(f"expected_num_actions: {expected_num_actions}")

predicted_rewards_mean = tensor_spec.TensorSpec([expected_num_actions])
print(f"predicted_rewards_mean: {predicted_rewards_mean}")

expected_num_actions: 1
predicted_rewards_mean: TensorSpec(shape=(1,), dtype=tf.float32, name=None)


### TimeStep spec

In [41]:
time_step_spec = ts.time_step_spec(
    observation_spec = observation_spec, 
    # reward_spec = _reward_spec
)
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(16,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(1, 16), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

### Inspect chosen arm features spec

In [42]:
time_step_spec.observation

{'global': TensorSpec(shape=(16,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(1, 16), dtype=tf.float32, name=None)}

In [45]:
chosen_arm_features_info = (
  policy_utilities.create_chosen_arm_features_info_spec(
      time_step_spec.observation,
  )
)
chosen_arm_features_info

TensorSpec(shape=(16,), dtype=tf.float32, name=None)

In [47]:
# TODO - dont understand this
bandit_policy_type = (
    policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1])
)
bandit_policy_type

BoundedTensorSpec(shape=(1,), dtype=tf.int32, name=None, minimum=array(0, dtype=int32), maximum=array(4, dtype=int32))

In [48]:
info_spec = policy_utilities.PerArmPolicyInfo(
  predicted_rewards_mean=predicted_rewards_mean,
  bandit_policy_type=bandit_policy_type,
  chosen_arm_features=chosen_arm_features_info,
)
info_spec

PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=TensorSpec(shape=(1,), dtype=tf.float32, name=None), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=BoundedTensorSpec(shape=(1,), dtype=tf.int32, name=None, minimum=array(0, dtype=int32), maximum=array(4, dtype=int32)), chosen_arm_features=TensorSpec(shape=(16,), dtype=tf.float32, name=None))

## The Agent

**Note** that contextual bandits form a special case of RL, where the actions taken by the agent do not alter the state of the environment 

> “Contextual” refers to the fact that the agent chooses among a set of actions while having knowledge of the context (environment observation)

### Agent types

**Possible Agent Types:**

```
AGENT_TYPE = ['LinUCB', 'LinTS', 'epsGreedy', 'NeuralLinUCB']
```

**LinearUCBAgent:** (`LinUCB`)
* An agent implementing the Linear UCB bandit algorithm
* (whitepaper) [A contextual bandit approach to personalized news recommendation](https://arxiv.org/abs/1003.0146)
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/lin_ucb_agent/LinearUCBAgent)

**LinearThompsonSamplingAgent:** (`LinTS`)
* Implements the Linear Thompson Sampling Agent from the paper: [Thompson Sampling for Contextual Bandits with Linear Payoffs](https://arxiv.org/abs/1209.3352)
* the agent maintains two parameters `weight_covariances` and `parameter_estimators`, and updates them based on experience.
* The inverse of the weight covariance parameters are updated with the outer product of the observations using the Woodbury inverse matrix update, while the parameter estimators are updated by the reward-weighted observation vectors for every action
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/linear_thompson_sampling_agent/LinearThompsonSamplingAgent)

**NeuralEpsilonGreedyAgent:** (`epsGreedy`) 
* A neural network based epsilon greedy agent
* This agent receives a neural network that it trains to predict rewards
* The action is chosen greedily with respect to the prediction with probability `1 - epsilon`, and uniformly randomly with probability epsilon
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/neural_epsilon_greedy_agent/NeuralEpsilonGreedyAgent)

**NeuralLinUCBAgent:** (`NeuralLinUCB`)
* An agent implementing the LinUCB algorithm on top of a neural network
* `ENCODING_DIM` is the output dimension of the encoding network 
> * This output will be used by either a linear reward layer and epsilon greedy exploration, or by a LinUCB logic, depending on the number of training steps executed so far
* `EPS_PHASE_STEPS` is the number training steps to run for training the encoding network before switching to `LinUCB`
> * If negative, the encoding network is assumed to be already trained
> * If the number of steps is less than or equal to `EPS_PHASE_STEPS`, `epsilon greedy` is used, otherwise `LinUCB`
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/neural_linucb_agent/NeuralLinUCBAgent)

### network types

Which network architecture to use for the `epsGreedy` or `NeuralLinUCB` agents

```
NETWORK_TYPE = ['commontower', 'dotproduct']
```

**GlobalAndArmCommonTowerNetwork:** (`commontower`)
* This network takes the output of the global and per-arm networks, and leads them through a common network, that in turn outputs reward estimates
> * `GLOBAL_LAYERS` - Iterable of ints. Specifies the layers of the global tower
> * `ARM_LAYERS` - Iterable of ints. Specifies the layers of the arm tower
> * `COMMON_LAYERS` - Iterable of ints. Specifies the layers of the common tower
* The network produced by this function can be used either in `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`
> * In the former case, the network must have `output_dim=1`, it is going to be an instance of `QNetwork`, and used in the policy as a reward prediction network
> * In the latter case, the network will be an encoding network with its output consumed by a reward layer or a `LinUCB` method. The specified `output_dim` will be the encoding dimension
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/networks/global_and_arm_feature_network/GlobalAndArmCommonTowerNetwork)

**GlobalAndArmDotProductNetwork:** (`dotproduct`)
* This network calculates the **dot product** of the output of the global and per-arm networks and returns them as reward estimates
> * `GLOBAL_LAYERS` - Iterable of ints. Specifies the layers of the global tower
> * `ARM_LAYERS` - Iterable of ints. Specifies the layers of the arm tower
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/networks/global_and_arm_feature_network/GlobalAndArmDotProductNetwork)

### define agent and network

In [49]:
# ================================
# Agents
# ================================
AGENT_TYPE      = 'epsGreedy' # 'LinUCB' | 'LinTS |, 'epsGreedy' | 'NeuralLinUCB'

# Parameters for linear agents (LinUCB and LinTS).
AGENT_ALPHA     = 0.1

# Parameters for neural agents (NeuralEpsGreedy and NerualLinUCB).
EPSILON         = 0.01
LR              = 0.05

# Parameters for NeuralLinUCB
ENCODING_DIM    = 1
EPS_PHASE_STEPS = 1000

# ================================
# Agent's Preprocess Network
# ================================
NETWORK_TYPE    = "dotproduct" # 'commontower' | 'dotproduct'

if AGENT_TYPE == 'NeuralLinUCB':
    NETWORK_TYPE = 'commontower'
    

GLOBAL_LAYERS   = [16, 4]
ARM_LAYERS      = [16, 4]
COMMON_LAYERS   = [4]

observation_and_action_constraint_splitter = None

HPARAMS = {  # TODO - streamline and consolidate
    "batch_size": BATCH_SIZE,
    "num_actions": NUM_ACTIONS,
    "model_type": AGENT_TYPE,
    "network_type": NETWORK_TYPE,
    "global_layers": GLOBAL_LAYERS,
    "per_arm_layers": ARM_LAYERS,
    "common_layers": COMMON_LAYERS,
    "learning_rate": LR,
    "epsilon": EPSILON,
}
pprint(HPARAMS)

{'batch_size': 8,
 'common_layers': [4],
 'epsilon': 0.01,
 'global_layers': [16, 4],
 'learning_rate': 0.05,
 'model_type': 'epsGreedy',
 'network_type': 'dotproduct',
 'num_actions': 1,
 'per_arm_layers': [16, 4]}


### Agent Factory

**TODO:**
* consolidate agent, network, and hparams

In [None]:
# from tf_agents.bandits.agents import greedy_reward_prediction_agent

# network = None
# observation_and_action_constraint_splitter = None

# # global_step = tf.Variable(0)
# global_step = tf.compat.v1.train.get_or_create_global_step()

# if HPARAMS['network_type'] == 'commontower':
#     network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
#         observation_spec = observation_spec, 
#         global_layers = HPARAMS['global_layers'], 
#         arm_layers = HPARAMS['per_arm_layers'], 
#         common_layers = HPARAMS['common_layers'],
#         # output_dim = 1
#     )
# elif HPARAMS['network_type'] == 'dotproduct':
#     network = global_and_arm_feature_network.create_feed_forward_dot_product_network(
#         observation_spec = observation_spec, 
#         global_layers = HPARAMS['global_layers'], 
#         arm_layers = HPARAMS['per_arm_layers']
#     )
    
# # agent = greedy_reward_prediction_agent.GreedyRewardPredictionAgent()
    
# agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
#     time_step_spec=time_step_spec,
#     action_spec=action_spec,
#     reward_network=network,
#     optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=HPARAMS['learning_rate']),
#     epsilon=HPARAMS['epsilon'],
#     observation_and_action_constraint_splitter=(
#         observation_and_action_constraint_splitter
#     ),
#     accepts_per_arm_features=True,
#     emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
#     train_step_counter=global_step,
#     info_fields_to_inherit_from_greedy=[
#         policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
#     ],
#     name='OffpolicyNeuralEpsGreedyAgent'
# )
# agent.initialize()

# print(f"Agent: {agent.name}\n")
# if network:
#     print(f"Network: {network.name}")

In [51]:
# from tf_agents.bandits.policies import policy_utilities
# from tf_agents.bandits.agents import greedy_reward_prediction_agent

network = None
observation_and_action_constraint_splitter = None
global_step = tf.compat.v1.train.get_or_create_global_step()

if AGENT_TYPE == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        alpha=AGENT_ALPHA,
        accepts_per_arm_features=True,
        dtype=tf.float32,
    )
elif AGENT_TYPE == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        alpha=AGENT_ALPHA,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter
        ),
        accepts_per_arm_features=True,
        dtype=tf.float32,
    )
elif AGENT_TYPE == 'epsGreedy':
    # obs_spec = per_arm_tf_env.observation_spec()
    if NETWORK_TYPE == 'commontower':
        network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec = observation_spec, 
            global_layers = GLOBAL_LAYERS, 
            arm_layers = ARM_LAYERS, 
            common_layers = COMMON_LAYERS,
            # output_dim = 1
        )
    elif NETWORK_TYPE == 'dotproduct':
        network = global_and_arm_feature_network.create_feed_forward_dot_product_network(
            observation_spec = observation_spec, 
            global_layers = GLOBAL_LAYERS, 
            arm_layers = ARM_LAYERS
        )
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=HPARAMS['learning_rate']),
        epsilon=HPARAMS['epsilon'],
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter
        ),
        accepts_per_arm_features=True,
        emit_policy_info=(policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN),
        train_step_counter=global_step,
        # info_fields_to_inherit_from_greedy=[
        #     policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        # ],
        name='OffpolicyNeuralEpsGreedyAgent'
    )

elif AGENT_TYPE == 'NeuralLinUCB':
    # obs_spec = per_arm_tf_env.observation_spec()
    network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec = observation_spec, 
            global_layers = GLOBAL_LAYERS, 
            arm_layers = ARM_LAYERS, 
            common_layers = COMMON_LAYERS,
            output_dim = ENCODING_DIM
        )
    )
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=per_arm_tf_env.time_step_spec(),
        action_spec=per_arm_tf_env.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
    )
    
agent.initialize() # TODO - does this go here?
    
print(f"Agent: {agent.name}\n")
if network:
    print(f"Network: {network.name}")

Agent: OffpolicyNeuralEpsGreedyAgent

Network: GlobalAndArmDotProductNetwork


In [52]:
pprint(agent.policy.trajectory_spec)

_TupleWrapper(Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(0, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'global': TensorSpec(shape=(16,), dtype=tf.float32, name=None), 'per_arm': TensorSpec(shape=(1, 16), dtype=tf.float32, name=None)}),
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=TensorSpec(shape=(1,), dtype=tf.float32, name=None), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=(), chosen_arm_features=TensorSpec(shape=(16,), dtype=tf.float32, name=None)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.i

In [53]:
print('training data spec: ', agent.training_data_spec)

training data spec:  Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(0, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'global': TensorSpec(shape=(16,), dtype=tf.float32, name=None)}),
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=TensorSpec(shape=(1,), dtype=tf.float32, name=None), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=(), chosen_arm_features=TensorSpec(shape=(16,), dtype=tf.float32, name=None)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})


In [54]:
print('observation spec in training: ', agent.training_data_spec.observation)

observation spec in training:  {'global': TensorSpec(shape=(16,), dtype=tf.float32, name=None)}


In [55]:
print('chosen arm features: ', agent.training_data_spec.policy_info.chosen_arm_features)

chosen arm features:  TensorSpec(shape=(16,), dtype=tf.float32, name=None)


In [56]:
print("TimeStep Spec (for each batch):\n", agent.time_step_spec, "\n")

TimeStep Spec (for each batch):
 TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': DictWrapper({'global': TensorSpec(shape=(16,), dtype=tf.float32, name=None), 'per_arm': TensorSpec(shape=(1, 16), dtype=tf.float32, name=None)}),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}) 



In [57]:
print("Action Spec (for each batch):\n", agent.action_spec, "\n")

Action Spec (for each batch):
 BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(0, dtype=int32)) 



In [59]:
agent.collect_policy

<tf_agents.policies.epsilon_greedy_policy.EpsilonGreedyPolicy at 0x7fb738520c70>

## Reward function

**TODO:**
* explain how to translate reward to this common recommendation objectives

In [60]:
def _get_rewards(element):
    """Calculates reward for the actions."""

    def _calc_reward(x):
        """Calculates reward for a single action."""
        r0 = lambda: tf.constant(0.0)
        r1 = lambda: tf.constant(-10.0)
        r2 = lambda: tf.constant(2.0)
        r3 = lambda: tf.constant(3.0)
        r4 = lambda: tf.constant(4.0)
        r5 = lambda: tf.constant(10.0)
        c1 = tf.equal(x, 1.0)
        c2 = tf.equal(x, 2.0)
        c3 = tf.equal(x, 3.0)
        c4 = tf.equal(x, 4.0)
        c5 = tf.equal(x, 5.0)
        return tf.case(
            [(c1, r1), (c2, r2), (c3, r3),(c4, r4),(c5, r5)], 
            default=r0, exclusive=True
        )

    return tf.map_fn(
        fn=_calc_reward, 
        elems=element['user_rating'], 
        dtype=tf.float32
    )

## Trajectory function

**parking lot**
* does trajectory fn need concept of `dummy_chosen_arm_features`, similar to [this](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/policies/reward_prediction_base_policy.py#L297)

```python
      dummy_chosen_arm_features = tf.nest.map_structure(
          lambda obs: tf.zeros_like(obs[:, 0, ...]),
          time_step.observation[bandit_spec_utils.PER_ARM_FEATURE_KEY],
      )
```

In [62]:
# # specs
# obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
#     GLOBAL_DIM, PER_ARM_DIM, NUM_ACTIONS, add_num_actions_feature=False
# )
# time_step_spec = ts.time_step_spec(obs_spec)
# action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, num_actions-tf.constant(1))

# agent.policy.trajectory_spec

```python
def sample_spec_nest(
    structure,
    seed=None,
    outer_dims=(),
    minimum=None,
    maximum=None
):
  """Samples the given nest of specs.

  Args:
    structure: A nest of `TensorSpec`.
    seed: A seed used for sampling ops
    outer_dims: An optional `Tensor` specifying outer dimensions to add to the
      spec shape before sampling.
    minimum: An optional numeric value. If set, numeric specs within the nest
      (both bounded and unbounded) will be restricted to this minimum.
    maximum: Similar to the above but with maximums.

  Returns:
    A nest of sampled values following the ArraySpec definition.
```

In [64]:
observation_sample = tensor_spec.sample_spec_nest(
    observation_spec, outer_dims=[HPARAMS['batch_size']], minimum=0, maximum=HPARAMS['num_actions'] -1
)
observation_sample

{'global': <tf.Tensor: shape=(8, 16), dtype=float32, numpy=
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       dtype=float32)>,
 'per_arm': <tf.Tensor: shape=(8, 1, 16), dtype=float32, numpy=
 array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 
        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 
        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 
        [[0., 0., 0., 0.,

In [65]:
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.trajectories import trajectory

def _trajectory_fn(element): # hparams
    """Converts a dataset element into a trajectory."""
    global_features = _get_global_context_features(element)
    arm_features = _get_per_arm_features(element)
    
    # # tmp 
    # print(f"global_features: {global_features}")
    # print(f"arm_features: {arm_features}")
    
    # Adds a time dimension.
    # arm_features = _add_outer_dimension(arm_features)
    # arm_features = tensor_spec.add_outer_dim(arm_features)
    
    # obs spec
    observation = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            global_features,
            # _add_outer_dimension(global_features)
        # bandit_spec_utils.PER_ARM_FEATURE_KEY:
        #     _add_outer_dimension(arm_features),
    }
    # print("after adding extra dim...")
    # print(f"observation: {observation}")
    # print(f"arm_features: {arm_features}")
    
    # reward = tensor_spec.add_outer_dim(_get_rewards(element))
    reward = _get_rewards(element)
    # print(f"reward: {reward}")
    
    # To emit the predicted rewards in policy_info, we need to create dummy
    # rewards to match the definition in TensorSpec for the ones specified in
    # emit_policy_info set.
    dummy_rewards = tf.zeros([HPARAMS['batch_size'], 1, HPARAMS['num_actions']])
    # dummy_rewards = tf.zeros([HPARAMS['batch_size'], HPARAMS['num_actions']])
    # dummy_rewards = tf.zeros([HPARAMS['num_actions']])
    policy_info = policy_utilities.PerArmPolicyInfo(
        chosen_arm_features=arm_features,
        # Pass dummy mean rewards here to match the model_spec for emitting
        # mean rewards in policy info
        predicted_rewards_mean=dummy_rewards
    )
    
# tf_agents.policies.utils.create_chosen_arm_features_info_spec(
#     observation_spec: tf_agents.typing.types.NestedTensorSpec
# ) -> tf_agents.typing.types.NestedTensorSpec
    
    if HPARAMS['model_type'] == 'neural_ucb':
        policy_info = policy_info._replace(
            predicted_rewards_optimistic=dummy_rewards
        )
        
    # print(f"observation: {observation}")
    # print(f"reward: {reward}")
    # print(f"policy_info: {policy_info}")
    # print(f"dummy_rewards: {dummy_rewards}")
    
    return trajectory.single_step(
        observation=observation,
        action=tf.zeros_like(
            reward, dtype=tf.int32
        ),  # Arm features are copied from policy info, put dummy zeros here
        policy_info=policy_info,
        reward=reward,
        discount=tf.zeros_like(reward)
    )


In [66]:
# TODO

# def build_dict_from_trajectory(
#     step: int,
#     next_step: int,
#     trajectory: trajectories.Trajectory) -> Dict[str, Any]:
#     """Builds a dict from `trajectory` data.

#     Args:
#     trajectory: A `trajectories.Trajectory` object.

#     Returns:
#     A dict holding the same data as `trajectory`.
#     """
#     trajectory_dict = {
#         "step_type": [step].numpy(),
#         "observation": [{
#             "observation_batch": batch
#         } for batch in trajectory.observation.numpy().tolist()],
#         "action": trajectory.action.numpy().tolist(),
#         "policy_info": trajectory.policy_info,
#         "next_step_type": [next_step],
#         "reward": trajectory.reward.numpy().tolist(),
#         "discount": trajectory.discount.numpy().tolist(),
#     }
#     return trajectory_dict

### write trajectories to file

In [None]:
# VERSION = 'v1'

# DATASET_FILE = f'{VERSION}-off-policy-trajectories.json'
# !touch $DATASET_FILE

In [None]:
# dataset_size = len(list(train_dataset))
# print(f"dataset_size: {dataset_size}")

# small_count = dataset_size/100
# print(f"small_count: {small_count}")

In [None]:
# import time
# import json

# def write_trajectories_to_file(
#     dataset_size: int,
#     data_file: str,
#     batch_size: int,
# ):
#     batched_dataset = train_dataset.batch(batch_size)
#     print(f"writting file...")
    
#     data_list = []
    
#     start_time = time.time()
#     step = 1
#     with open(data_file, "w") as f:
#         for x in batched_dataset.take(count=dataset_size):
#             # print(f"step: {step}")
#             nexx_step = step + 1
#             # print(f"nexx_step: {nexx_step}")

#             single_traj = get_trajectory_from_environment(x)
#             print(single_traj)
            
#             _trajectory_dict = build_dict_from_trajectory(step=step, next_step=nexx_step, trajectory=single_traj)
#             # print(type(trajectory_dict))
#             decoded = _trajectory_dict.decode('utf-8')
#             print(f"decoded: {decoded}")
#             data_list.append(_trajectory_dict)

#             step+=1
            
#             break
            
#         for entry in data_list:
#             traj_dict_tmp = {}
#             traj_dict_tmp['step_type'] = entry['step_type']
#             traj_dict_tmp['observation'] = entry['observation']
#             traj_dict_tmp['action'] = entry['action']
#             traj_dict_tmp['policy_info'] = entry['policy_info']
#             traj_dict_tmp['next_step_type'] = entry['next_step_type']
#             traj_dict_tmp['reward'] = entry['reward']
#             traj_dict_tmp['discount'] = entry['discount']
            
#             # f.write(json.dumps(traj_dict_tmp) + "\n")
            
#         print(f"writting to file complete...")

#     end_time = time.time()
#     runtime_mins = int((end_time - start_time) / 60)
#     print(f"runtime_mins: {runtime_mins}")

#     return data_list

In [None]:
# sample_data_list = write_trajectories_to_file(
#     dataset_size=int(small_count),
#     data_file=DATASET_FILE,
#     batch_size=2
# )

# sample_data_list[0]
# sample_data_list[0]['observation']

In [None]:
# train_utils.upload_blob(
#     bucket_name='',
#     source_file_name=,
#     destination_blob_name=f'{RUN_NAME}/candidates/xxxx.json'
# )

### validate shapes and dims

**TODO:** add auto test for these

In [69]:
for x in train_dataset.batch(HPARAMS['batch_size']).take(1):
    sample_trajectory = _trajectory_fn(x)
    
sample_trajectory

Trajectory(
{'action': <tf.Tensor: shape=(8,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>,
 'discount': <tf.Tensor: shape=(8,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(8,), dtype=int32, numpy=array([2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)>,
 'observation': {'global': <tf.Tensor: shape=(16,), dtype=float32, numpy=
array([-0.04257477, -0.0204685 , -0.03652425, -0.0370932 , -0.01470114,
        0.03408908, -0.01681744,  0.04299467, -0.04057591, -0.01261982,
        0.04479137, -0.04452629, -0.04684504,  0.03435652, -0.02922553,
        0.01416435], dtype=float32)>},
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=<tf.Tensor: shape=(8, 1, 1), dtype=float32, numpy=
array([[[0.]],

       [[0.]],

       [[0.]],

       [[0.]],

       [[0.]],

       [[0.]],

       [[0.]],

       [[0.]]], dtype=float32)>, multiobjective_scalarized_predicted_rewards_mean=(), predicted

In [74]:
from tf_agents.utils import nest_utils

# nest_utils.is_batched_nested_tensors(
#     tensors=sample_trajectory.policy_info.chosen_arm_features,
#     specs=agent.training_data_spec.policy_info.chosen_arm_features,
#     num_outer_dims=1,  # 2
#     allow_extra_fields=False,
#     check_dtypes=True
# )

# nest_utils.is_batched_nested_tensors(
#     tensors=sample_trajectory.observation['global'],
#     specs=agent.training_data_spec.observation['global'],
#     num_outer_dims=0,
#     allow_extra_fields=False,
#     check_dtypes=True
# )

nest_utils.is_batched_nested_tensors(
    tensors=sample_trajectory.action,
    specs=agent.training_data_spec.action,
    num_outer_dims=1,
    allow_extra_fields=False,
    check_dtypes=True
)

True

In [None]:
# arm_observations = per_arm_context_sampling_fn()
# print(arm_observations)

# outer_rank = nest_utils.get_outer_rank(tensors = arm_observations, specs = observation_spec['per_arm'])
# outer_rank

### replay buffer

> replay buffer and observers keep track of Trajectory data

**TODO** - is this needed?

In [None]:
# driver_steps = 2                           # number of steps to run per batch
# data_spec = agent.policy.trajectory_spec

# replay_buffer = trainer.get_replay_buffer(
#     data_spec, HPARAMS['batch_size'], driver_steps
# )
# observers = [replay_buffer.add_batch]

In [None]:
# # Read the replay buffer as a Dataset,
# # read batches of 4 elements, each with 2 timesteps:
# dataset = replay_buffer.as_dataset(
#     sample_batch_size=4,
#     num_steps=2
# )

# DRIVER_STEPS = 3

# replay_buffer = generate_simulation_data(
#     raw_data_path=DATA_PATH,
#     batch_size=BATCH_SIZE,
#     rank_k=RANK_K,
#     num_actions=NUM_ACTIONS,
#     driver_steps=DRIVER_STEPS
# )
# replay_buffer

# dataset = replay_buffer.as_dataset(
#     sample_batch_size=BATCH_SIZE,
#     num_steps=DRIVER_STEPS
# )
# dataset

## Train loop

`agent.train(experience=...)`

where `experience` is a batch of trajectories data in the form of a Trajectory. 
* The structure of experience must match that of `self.training_data_spec`. 
* All tensors in experience must be shaped [batch, time, ...] where time must be equal to self.train_step_length if that property is not None.

In [76]:
import collections
from tf_agents.utils import common

NUM_EPOCHS = 1

# global_step = tf.compat.v1.train.get_global_step()

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

train_loss = collections.defaultdict(list)

for epoch in range(NUM_EPOCHS):
    
    iterator = iter(train_dataset.batch(HPARAMS['batch_size']))
    data = next(iterator)
    print(f"print data: {data}")
    
    trajectories = _trajectory_fn(data)
    print(f"print trajectories: {trajectories}")
    
    # All tensors in experience must be shaped [batch, time, ...] 
    step = agent.train_step_counter.numpy()
    loss = agent.train(experience=trajectories)
    
    # break

print data: {'bucketized_user_age': <tf.Tensor: shape=(8,), dtype=float32, numpy=array([25., 45., 18., 25., 35., 45., 50., 25.], dtype=float32)>, 'movie_genres': <tf.Tensor: shape=(8, 1), dtype=int64, numpy=
array([[4],
       [7],
       [7],
       [1],
       [0],
       [4],
       [7],
       [0]])>, 'movie_id': <tf.Tensor: shape=(8,), dtype=string, numpy=
array([b'211', b'678', b'135', b'97', b'568', b'150', b'483', b'121'],
      dtype=object)>, 'timestamp': <tf.Tensor: shape=(8,), dtype=int64, numpy=
array([874948475, 888638193, 887747108, 882475618, 875350485, 875946055,
       879453933, 880149166])>, 'user_id': <tf.Tensor: shape=(8,), dtype=string, numpy=
array([b'346', b'602', b'393', b'152', b'738', b'382', b'85', b'152'],
      dtype=object)>, 'user_occupation_text': <tf.Tensor: shape=(8,), dtype=string, numpy=
array([b'other', b'other', b'student', b'educator', b'technician',
       b'engineer', b'educator', b'educator'], dtype=object)>, 'user_rating': <tf.Tensor: shape=

ValueError: Received a mix of batched and unbatched Tensors, or Tensors are not compatible with Specs.  num_outer_dims: 2.
Saw tensor_shapes:
   Trajectory(
{'action': TensorShape([8]),
 'discount': TensorShape([8]),
 'next_step_type': TensorShape([8]),
 'observation': DictWrapper({'global': TensorShape([16])}),
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=TensorShape([8, 1, 1]), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=(), chosen_arm_features=TensorShape([16])),
 'reward': TensorShape([8]),
 'step_type': TensorShape([8])})
And spec_shapes:
   Trajectory(
{'action': TensorShape([]),
 'discount': TensorShape([]),
 'next_step_type': TensorShape([]),
 'observation': DictWrapper({'global': TensorShape([16])}),
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=TensorShape([1]), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=(), chosen_arm_features=TensorShape([16])),
 'reward': TensorShape([]),
 'step_type': TensorShape([])})

## debugging notes

* say you have a global observation spec of [17]. And you have two batch dimensions [4, 5]. Then your observation has to have the shape [4, 5, 17]
* and then if you have arm_obs_spec with shape [9, 13], then the arm obs shape has to be exactly [4, 5, 9, 13]
* and this has to be true for every single tensor in your tensor nest
* the first 2 dims are the outer dims that are the same for all tensors, the rest of the dimensions have to follow the spec for each tensor