# Train Bandits with per-arm features

**Exploring linear and nonlinear** (e.g., those with neural network-based value functions) bandit methods for recommendations using TF-Agents

> Neural linear bandits provide a nice way to leverage the representation power of deep learning and the bandit approach for uncertainty measure and efficient exploration

## Load env config

* use the prefix from `00-env-setup`

In [1]:
PREFIX = 'mabv1'

In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "wortz-project-352116"
PROJECT_NUM              = "679926387543"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "679926387543-compute@developer.gserviceaccount.com"

PREFIX                   = "mabv1"
VERSION                  = "v1"

BUCKET_NAME              = "mabv1-wortz-project-352116-bucket"
BUCKET_URI               = "gs://mabv1-wortz-project-352116-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://mabv1-wortz-project-352116-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/679926387543/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "wortz-project-352116.movielens_dataset_mabv1"
BIGQUERY_TABLE_ID        = "wortz-project-352116.movielens_dataset_mabv1.training_dataset"

REPO_D

## imports

In [3]:
## need to fix protobuffs I upgraded to tf 2.12
# !pip install protobuf==3.20.3 --user
# !pip install tensorflow==2.12.0 --user

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [5]:
import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar
from datetime import datetime
import time
from pprint import pprint
import pickle as pkl
import sys
import os
sys.path.append("..")

# logging
import logging
logging.disable(logging.WARNING)

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform, storage

# tensorflow
import tensorflow as tf
from tf_agents.agents import TFAgent

from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.networks import global_and_arm_feature_network

# GPU
from numba import cuda 
import gc

# my project
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config

# tf exceptions and vars
if tf.__version__[0] != "2":
    raise Exception("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")

caused by: ['/opt/conda/envs/tensorflow/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/envs/tensorflow/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
# gpus = tf.config.experimental.list_physical_devices('GPU')
# for gpu in gpus:
#     tf.config.experimental.set_memory_growth(gpu, True)
    
# gpus

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [7]:
# device = cuda.get_current_device()
# device.reset()
# gc.collect()

In [8]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)
# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Data prep

### Read TF Records

In [9]:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [10]:
train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/val'): # tmp TODO - "train"
    if '.tfrecord' in blob.name:
        train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
train_files

['gs://mabv1-wortz-project-352116-bucket/data/val/ml-ratings-100k-val.tfrecord']

In [11]:
train_dataset = tf.data.TFRecordDataset(train_files)
train_dataset = train_dataset.map(data_utils.parse_tfrecord)

for x in train_dataset.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([25.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[4]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'211'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([874948475])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'other'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}


In [81]:
# NEW Sorting by user_id, grabbing most recent 5 movies

NUM_ACTIONS = 5

from tensorflow.data.experimental import group_by_window

def key_f(row):
    return tf.strings.to_number(row['user_id']
                               ,out_type=tf.int64)  

def reduce_func(key, ds):
    
    ds=ds\
    .batch(1000000)\
    .map(
        lambda x: dict([(k, tf.gather(x[k], tf.argsort(x["timestamp"]))) for k, v in x.items()])
    )\
    .unbatch()

    return ds.take(NUM_ACTIONS)

t = group_by_window(key_func = key_f, reduce_func = reduce_func, window_size=100)

sorted_top_ten = train_dataset.apply(t)


In [84]:
for x in sorted_top_ten.batch(NUM_ACTIONS).take(3):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([18., 18., 18., 18., 18.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[7],
       [7],
       [0],
       [0],
       [4]])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'468', b'193', b'174', b'187', b'204'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([885544698, 885544698, 885544739, 885544739, 885544769])>,
 'user_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'405', b'405', b'405', b'405', b'405'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'healthcare', b'healthcare', b'healthcare', b'healthcare',
       b'healthcare'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([3., 4., 5., 5., 5.], dtype=float32)>}
{'bucketized_user_age': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([50., 50., 50., 50., 50.], dtype=fl

### get vocab

**TODO:** 
* streamline vocab calls

In [12]:
GENERATE_VOCABS = False
print(f"GENERATE_VOCABS: {GENERATE_VOCABS}")

VOCAB_SUBDIR   = "vocabs"
VOCAB_FILENAME = "vocab_dict.pkl"

GENERATE_VOCABS: False


In [17]:
if not GENERATE_VOCABS:

    EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{VOCAB_SUBDIR}/{VOCAB_FILENAME}'
    print(f"Downloading vocab...")
    
    os.system(f'gsutil -q cp {EXISTING_VOCAB_FILE} .')
    print(f"Downloaded vocab from: {EXISTING_VOCAB_FILE}\n")

    filehandler = open(VOCAB_FILENAME, 'rb')
    vocab_dict = pkl.load(filehandler)
    filehandler.close()
    
    for key in vocab_dict.keys():
        pprint(key)

Downloading vocab...
Downloaded vocab from: gs://mabv1-wortz-project-352116-bucket/vocabs/vocab_dict.pkl

'movie_id'
'user_id'
'user_occupation_text'
'movie_genres'
'bucketized_user_age'
'max_timestamp'
'min_timestamp'
'timestamp_buckets'


# Per-Arm Bandits

In [18]:
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

nest = tf.nest

## Preprocessing layers for global and arm features

The preproccesing layers will ultimately feed the two functions described below, both of which will ultimately feed the `Environment`

`global_context_sampling_fn`: 
* A function that outputs a random 1d array or list of ints or floats
* This output is the global context. Its shape and type must be consistent across calls.

`arm_context_sampling_fn`: 
* A function that outputs a random 1 array or list of ints or floats (same type as the output of `global_context_sampling_fn`). * This output is the per-arm context. Its shape must be consistent across calls.

In [19]:
NUM_OOV_BUCKETS        = 1
GLOBAL_EMBEDDING_SIZE  = 4
MV_EMBEDDING_SIZE      = 8 #32

HPARAMS = {
    "batch_size":8,
    "num_docs_to_rank":3,
    "model_type": "neural_epsilon_greedy",
    "network_type": 'commontower',
    "global_layers": [16,4],
    "per_arm_layers": [16,4],
    "common_layers": [4],
    "learning_rate": 0.05,
    "epsilon":0.01,
}

### global context (user) features

#### user ID

In [20]:
user_id_input_layer = tf.keras.Input(
    name="user_id",
    shape=(1,),
    dtype=tf.string
)

user_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['user_id'],
)(user_id_input_layer)

user_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_id_lookup)

user_id_embedding = tf.reduce_sum(user_id_embedding, axis=-2)

# global_inputs.append(user_id_input_layer)
# global_features.append(user_id_embedding)

In [21]:
test_user_id_model = tf.keras.Model(inputs=user_id_input_layer, outputs=user_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["user_id"])
    print(test_user_id_model(x["user_id"]))

tf.Tensor([b'346'], shape=(1,), dtype=string)
tf.Tensor([[-0.02258484 -0.00642679 -0.01963132 -0.04176483]], shape=(1, 4), dtype=float32)


#### user AGE

In [22]:
user_age_input_layer = tf.keras.Input(
    name="bucketized_user_age",
    shape=(1,),
    dtype=tf.float32
)

user_age_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab_dict['bucketized_user_age'],
    num_oov_indices=NUM_OOV_BUCKETS,
    oov_value=0,
)(user_age_input_layer)

user_age_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['bucketized_user_age']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_age_lookup)

user_age_embedding = tf.reduce_sum(user_age_embedding, axis=-2)

# global_inputs.append(user_age_input_layer)
# global_features.append(user_age_embedding)

In [23]:
test_user_age_model = tf.keras.Model(inputs=user_age_input_layer, outputs=user_age_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["bucketized_user_age"])
    print(test_user_age_model(x["bucketized_user_age"]))

tf.Tensor([25.], shape=(1,), dtype=float32)
tf.Tensor([[-0.03874453  0.00775563 -0.04652616  0.04724056]], shape=(1, 4), dtype=float32)


#### user OCC

In [24]:
user_occ_input_layer = tf.keras.Input(
    name="user_occupation_text",
    shape=(1,),
    dtype=tf.string
)

user_occ_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['user_occupation_text']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['user_occupation_text'],
)(user_occ_input_layer)

user_occ_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['user_occupation_text']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_occ_lookup)

user_occ_embedding = tf.reduce_sum(user_occ_embedding, axis=-2)

# global_inputs.append(user_occ_input_layer)
# global_features.append(user_occ_embedding)

In [25]:
test_user_occ_model = tf.keras.Model(inputs=user_occ_input_layer, outputs=user_occ_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["user_occupation_text"])
    print(test_user_occ_model(x["user_occupation_text"]))

tf.Tensor([b'other'], shape=(1,), dtype=string)
tf.Tensor([[ 0.00721186 -0.0224213  -0.01734282 -0.0030665 ]], shape=(1, 4), dtype=float32)


#### user Timestamp

In [26]:
user_ts_input_layer = tf.keras.Input(
    name="timestamp",
    shape=(1,),
    dtype=tf.int64
)

user_ts_lookup = tf.keras.layers.Discretization(
    vocab_dict['timestamp_buckets'].tolist()
)(user_ts_input_layer)

user_ts_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['timestamp_buckets'].tolist()) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_ts_lookup)

user_ts_embedding = tf.reduce_sum(user_ts_embedding, axis=-2)

# global_inputs.append(user_ts_input_layer)
# global_features.append(user_ts_embedding)

In [27]:
test_user_ts_model = tf.keras.Model(inputs=user_ts_input_layer, outputs=user_ts_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["timestamp"])
    print(test_user_ts_model(x["timestamp"]))

tf.Tensor([874948475], shape=(1,), dtype=int64)
tf.Tensor([[-0.0196561  -0.04081204 -0.0125438  -0.00011444]], shape=(1, 4), dtype=float32)


#### define sampling function

In [28]:
def global_context_sampling_fn():
    """
    This function generates a single global observation vector.
    """
    # for x in train_dataset.take(1).as_numpy_iterator():
    for x in train_dataset.batch(1).take(1):
        user_id_value = x['user_id']
        user_age_value = x['bucketized_user_age']
        user_occ_value = x['user_occupation_text']
        user_ts_value = x['timestamp']
        
        _id = test_user_id_model(user_id_value)
        _age = test_user_age_model(user_age_value)
        _occ = test_user_occ_model(user_occ_value)
        _ts = test_user_ts_model(user_ts_value)
        
        # # tmp - insepct numpy() values
        # print(_id.numpy()) #[0])
        # print(_age.numpy()) #[0])
        # print(_occ.numpy()) #[0])
        # print(_ts.numpy()) #[0])
        
        # to numpy array
        _id = np.array(_id.numpy()[0])
        _age = np.array(_age.numpy()[0])
        _occ = np.array(_occ.numpy()[0])
        _ts = np.array(_ts.numpy()[0])
        
        concat = np.concatenate(
            [_id, _age, _occ, _ts], axis=-1
        ).astype(np.float32)
        
        return concat

In [29]:
global_context_sampling_fn()

array([-0.02258484, -0.00642679, -0.01963132, -0.04176483, -0.03874453,
        0.00775563, -0.04652616,  0.04724056,  0.00721186, -0.0224213 ,
       -0.01734282, -0.0030665 , -0.0196561 , -0.04081204, -0.0125438 ,
       -0.00011444], dtype=float32)

### arm preprocessing layers

#### movie ID

In [30]:
mv_id_input_layer = tf.keras.Input(
    name="movie_id",
    shape=(1,),
    dtype=tf.string
)

mv_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['movie_id'],
)(mv_id_input_layer)

mv_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_id_lookup)

mv_id_embedding = tf.reduce_sum(mv_id_embedding, axis=-2)

# arm_inputs.append(mv_id_input_layer)
# arm_features.append(mv_id_embedding)

In [31]:
test_mv_id_model = tf.keras.Model(inputs=mv_id_input_layer, outputs=mv_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_id"])
    print(test_mv_id_model(x["movie_id"]))

tf.Tensor([b'211'], shape=(1,), dtype=string)
tf.Tensor(
[[ 0.04284109 -0.04365351  0.02602072  0.011352   -0.00629252 -0.00146711
  -0.00050433  0.04409957]], shape=(1, 8), dtype=float32)


#### movie genre

In [32]:
mv_genre_input_layer = tf.keras.Input(
    name="movie_genres",
    shape=(1,),
    dtype=tf.float32
)

mv_genre_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab_dict['movie_genres'],
    num_oov_indices=NUM_OOV_BUCKETS,
    oov_value=0,
)(mv_genre_input_layer)

mv_genre_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_genres']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_genre_lookup)

mv_genre_embedding = tf.reduce_sum(mv_genre_embedding, axis=-2)

# arm_inputs.append(mv_genre_input_layer)
# arm_features.append(mv_genre_embedding)

In [33]:
test_mv_gen_model = tf.keras.Model(inputs=mv_genre_input_layer, outputs=mv_genre_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_genres"])
    print(test_mv_gen_model(x["movie_genres"]))

tf.Tensor([[4]], shape=(1, 1), dtype=int64)
tf.Tensor(
[[-0.04975596 -0.01630256 -0.00955644  0.03126628  0.04718431  0.04732894
   0.03216967 -0.02492697]], shape=(1, 8), dtype=float32)


#### define sampling function

In [34]:
def _get_per_arm_features(x):
    """
    This function generates a single per-arm observation vector
    """
    mv_id_value = x['movie_id']
    mv_gen_value = x['movie_genres'][0]

    _mid = test_mv_id_model(mv_id_value)
    _mgen = test_mv_gen_model(mv_gen_value)

    # to numpy array
    _mid = np.array(_mid.numpy()[0])
    _mgen = np.array(_mgen.numpy()[0])

    # print(_mid)
    # print(_mgen)

    concat = np.concatenate(
        [_mid, _mgen], axis=-1 # -1
    ).astype(np.float32)
    # concat = tf.concat([_mid, _mgen], axis=-1).astype(np.float32)

    return concat

In [35]:
def per_arm_context_sampling_fn():
    """
    This function generates a single per-arm observation vector
    """
    

NameError: name 'per_arm_context_sampling_fn' is not defined

In [62]:
GLOBAL_DIM = global_context_sampling_fn()
GLOBAL_DIM = GLOBAL_DIM.shape[0]
print(GLOBAL_DIM)

PER_ARM_DIM = per_arm_context_sampling_fn()
PER_ARM_DIM = PER_ARM_DIM.shape[0]
print(PER_ARM_DIM)

16


NameError: name 'per_arm_context_sampling_fn' is not defined

## define reward function

In [37]:
def _get_rewards(element):
    """Calculates reward for the actions."""

    def _calc_reward(x):
        """Calculates reward for a single action."""
        r0 = lambda: tf.constant(0.0)
        r1 = lambda: tf.constant(-10.0)
        r2 = lambda: tf.constant(2.0)
        r3 = lambda: tf.constant(3.0)
        r4 = lambda: tf.constant(4.0)
        r5 = lambda: tf.constant(10.0)
        c1 = tf.equal(x, 1.0)
        c2 = tf.equal(x, 2.0)
        c3 = tf.equal(x, 3.0)
        c4 = tf.equal(x, 4.0)
        c5 = tf.equal(x, 5.0)
        return tf.case([(c1, r1), (c2, r2), (c3, r3),(c4, r4),(c5, r5)], default=r0, exclusive=True)

    return tf.map_fn(
        fn=_calc_reward, 
        elems=element['user_rating'], 
        dtype=tf.float32
    )

### helper functions

**TODO:**
* modularize in a train_utils or similar

In [38]:
def _get_global_feature_list(input_features):
    """Return list of global features."""
    global_feature_names = ['user_id', 'bucketized_user_age', 'user_occupation_text', 'timestamp']
    global_features = []
    for global_feature in global_feature_names:
        if global_feature in input_features:
            global_features.append(input_features[global_feature])
        else:
            logging.error('Missing global feature %s', global_feature)
    return global_features

def _get_per_arm_feature_dict(input_features):
    """Returns a dictionary mapping feature key to per arm features."""
    per_arm_feature_names = ['movie_id', 'movie_genres']
    arm_features = {}
    for per_arm_feature in per_arm_feature_names:
        if per_arm_feature in inpbut_features:
            arm_features[per_arm_feature] = input_features[per_arm_feature]
        else:
            logging.error('Missing per arm feature %s', per_arm_feature)
    return arm_features

def _add_outer_dimension(x):
    """Adds an extra outer dimension."""
    if isinstance(x, dict):
        for key, value in x.items():
            x[key] = tf.expand_dims(value, 1)
        return x
    return tf.expand_dims(x, 1)

# NEW JW 

#### Creating an environment that iterates through one movie at a time



In [None]:
"""Class implementation of the per-arm MovieLens Bandit environment."""
from __future__ import absolute_import

import random
from typing import Optional, Text
import gin
import numpy as np

from tf_agents.bandits.environments import bandit_py_environment
from tf_agents.bandits.environments import dataset_utilities
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


GLOBAL_KEY = bandit_spec_utils.GLOBAL_FEATURE_KEY
PER_ARM_KEY = bandit_spec_utils.PER_ARM_FEATURE_KEY


# @gin.configurable
class MovieLensPerArmPyEnvironment(bandit_py_environment.BanditPyEnvironment):
    """Implements the per-arm version of the MovieLens Bandit environment.

    """

    def __init__(self,
               batch_size,
               num_actions,
               dataset = train_dataset, 
               name: Optional[Text] = 'movielens_per_arm'):
        """Initializes the Per-arm MovieLens Bandit environment.

        Args:
          data_dir: (string) Directory where the data lies (in text form).
          batch_size: (int) Number of observations generated per call.
          num_actions: (int) How many movies to choose from per round.
        """
        self._batch_size = batch_size
        self._num_actions = num_actions


        self._action_spec = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.int32,
            minimum=0,
            maximum=num_actions - 1,
            name='action')
        observation_spec = {
            GLOBAL_KEY:
                array_spec.ArraySpec(shape=[16], dtype=np.float32), #creating +space for user age and occupation
            PER_ARM_KEY:
                array_spec.ArraySpec(
                    shape=[8], dtype=np.float32), #creating +1 space for movie genre
        }
        self._time_step_spec = ts.time_step_spec(observation_spec)

        self._current_user_indices = np.zeros(batch_size, dtype=np.int32)
        self._previous_user_indices = np.zeros(batch_size, dtype=np.int32)

        self._current_movie_indices = np.zeros([batch_size, num_actions],
                                               dtype=np.int32)
        self._previous_movie_indices = np.zeros([batch_size, num_actions],
                                                dtype=np.int32)

        self._observation = {
            GLOBAL_KEY:
                np.zeros([batch_size, rank_k+2], dtype=np.int32), #making space like above for dimensions
            PER_ARM_KEY:
                np.zeros([batch_size, num_actions, rank_k+1], dtype=np.int32),
        }

        super(MovieLensPerArmPyEnvironment, self).__init__(
            observation_spec, self._action_spec, name=name)

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def batched(self):
        return True
    
    def _observe(self):
        for x in dataset.batch(1).take(1):
            user_id_value = x['user_id']
            user_age_value = x['bucketized_user_age']
            user_occ_value = x['user_occupation_text']
            user_ts_value = x['timestamp']

            _id = test_user_id_model(user_id_value)
            _age = test_user_age_model(user_age_value)
            _occ = test_user_occ_model(user_occ_value)
            _ts = test_user_ts_model(user_ts_value)

            # # tmp - insepct numpy() values
            # print(_id.numpy()) #[0])
            # print(_age.numpy()) #[0])
            # print(_occ.numpy()) #[0])
            # print(_ts.numpy()) #[0])

            # to numpy array
            _id = np.array(_id.numpy()[0])
            _age = np.array(_age.numpy()[0])
            _occ = np.array(_occ.numpy()[0])
            _ts = np.array(_ts.numpy()[0])

            combined_user_features = np.concatenate(
                [_id, _age, _occ, _ts], axis=-1
            ).astype(np.float32)

#         sampled_user_indices = np.random.randint(
#             self._num_users, size=self._batch_size)
#         self._previous_user_indices = self._current_user_indices
#         self._current_user_indices = sampled_user_indices

#         sampled_movie_indices = np.array([
#             random.sample(range(self._num_movies), self._num_actions)
#             for _ in range(self._batch_size)
#         ])
#         sampled_user_ages = self._user_age_int[sampled_user_indices]
#         sampled_user_occ = self._user_occ_int[sampled_user_indices]
#         combined_user_features = np.concatenate((self._u_hat[sampled_user_indices]
#                                                  , sampled_user_ages.reshape(-1,1)
#                                                  , sampled_user_occ.reshape(-1,1)), axis=1)
        # current_users = combined_user_features.reshape([self._batch_size, self.rank_k+2])
        
        movie_index_vector = sampled_movie_indices.reshape(-1)
        print(movie_index_vector.shape)
        flat_genre_list = self._mov_gen_int[movie_index_vector] #shape of 1
        flat_movie_list = self._v_hat[movie_index_vector] #shape of 2
        combined_movie_features = np.concatenate((flat_movie_list,flat_genre_list.reshape(-1,1)), axis=1)
        current_movies = combined_movie_features.reshape(
            [self._batch_size, self._num_actions, self.rank_k+1])

        self._previous_movie_indices = self._current_movie_indices
        self._current_movie_indices = sampled_movie_indices

        batched_observations = {
            GLOBAL_KEY:
                tf.convert_to_tensor(combined_user_features, dtype=tf.float32),
            PER_ARM_KEY:
                tf.convert_to_tensor(current_movies, dtype=tf.float32),
        }
        return batched_observations

    def _apply_action(self, action):
        chosen_arm_indices = self._current_movie_indices[range(self._batch_size),
                                                         action]
        return self._approx_ratings_matrix[self._current_user_indices,
                                           chosen_arm_indices]

    def _rewards_for_all_actions(self):
        rewards_matrix = self._approx_ratings_matrix[
            np.expand_dims(self._previous_user_indices, axis=-1),
            self._previous_movie_indices]
        return rewards_matrix

    def compute_optimal_action(self):
        return np.argmax(self._rewards_for_all_actions(), axis=-1)

    def compute_optimal_reward(self):
        return np.max(self._rewards_for_all_actions(), axis=-1)

# End New JW

### test sampling functions

In [39]:
# for x in train_dataset.batch(1).take(1):
#     test_globals = _get_global_context_features(x)

# print(test_globals.shape)
# test_globals

In [40]:
# for x in train_dataset.batch(1).take(1):
#     test_arms = _get_global_context_features(x)

# print(test_arms.shape)
# test_arms

## Observation Spec

**XLA compatibility:**
* Since dynamic tensor shapes are not compatible with XLA, the number of arm features (and consequently, number of arms for a step) cannot be dynamic. 
* One workaround is to fix the maximum number of arms for a problem, then pad the arm features in steps with fewer arms, and use action masking to indicate how many arms are actually active. 

In this case, the specs change as follows:

In [61]:
num_actions = HPARAMS['num_docs_to_rank']
# num_actions=tf.convert_to_tensor(num_actions, dtype=tf.int32)
print(num_actions)

global_spec = tensor_spec.TensorSpec(shape=[GLOBAL_DIM], dtype=tf.float32)
per_arm_spec = tensor_spec.TensorSpec(shape=[num_actions, PER_ARM_DIM], dtype=tf.float32)

# add outer nested dim
# global_spec = tensor_spec.add_outer_dims_nest(      # add_outer_dim
#     specs=global_spec,
#     outer_dims=[HPARAMS['batch_size']]
# )
# per_arm_spec = tensor_spec.add_outer_dims_nest( # add_outer_dim
#     specs=per_arm_spec,
#     outer_dims=[HPARAMS['batch_size']]
# )


observation_spec = {'global': global_spec, 'per_arm': per_arm_spec}
observation_spec

3


NameError: name 'PER_ARM_DIM' is not defined

In [None]:
# import tf_agents

# test_chosen_arm_feats = tf_agents.policies.utils.create_chosen_arm_features_info_spec(observation_spec=observation_spec)
# test_chosen_arm_feats

### Action Spec

> The time_step_spec and action_spec are specifications for the input time step and the output action

In [42]:
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), 
    dtype=tf.int32,
    minimum=tf.Variable(0), # 0 
    maximum=num_actions-tf.Variable(1), # -1
    name="action_spec"
)
action_spec

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(2, dtype=int32))

### TimeStep spec

In [43]:
time_step_spec = ts.time_step_spec(
    observation_spec = observation_spec, 
    # reward_spec = _reward_spec
)
# time_step_spec.discount

NameError: name 'observation_spec' is not defined

In [44]:
time_step_spec#['discount']

NameError: name 'time_step_spec' is not defined

## The Agent

**Note** that contextual bandits form a special case of RL, where the actions taken by the agent do not alter the state of the environment 

> “Contextual” refers to the fact that the agent chooses among a set of actions while having knowledge of the context (environment observation)

**Possible Agent Types:**

```
AGENT_TYPE = ['LinUCB', 'LinTS', 'epsGreedy', 'NeuralLinUCB']
```

**LinearUCBAgent:** (`LinUCB`)
* An agent implementing the Linear UCB bandit algorithm
* (whitepaper) [A contextual bandit approach to personalized news recommendation](https://arxiv.org/abs/1003.0146)
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/lin_ucb_agent/LinearUCBAgent)

**LinearThompsonSamplingAgent:** (`LinTS`)
* Implements the Linear Thompson Sampling Agent from the paper: [Thompson Sampling for Contextual Bandits with Linear Payoffs](https://arxiv.org/abs/1209.3352)
* the agent maintains two parameters `weight_covariances` and `parameter_estimators`, and updates them based on experience.
* The inverse of the weight covariance parameters are updated with the outer product of the observations using the Woodbury inverse matrix update, while the parameter estimators are updated by the reward-weighted observation vectors for every action
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/linear_thompson_sampling_agent/LinearThompsonSamplingAgent)

**NeuralEpsilonGreedyAgent:** (`epsGreedy`) 
* A neural network based epsilon greedy agent
* This agent receives a neural network that it trains to predict rewards
* The action is chosen greedily with respect to the prediction with probability `1 - epsilon`, and uniformly randomly with probability epsilon
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/neural_epsilon_greedy_agent/NeuralEpsilonGreedyAgent)

**NeuralLinUCBAgent:** (`NeuralLinUCB`)
* An agent implementing the LinUCB algorithm on top of a neural network
* `ENCODING_DIM` is the output dimension of the encoding network 
> * This output will be used by either a linear reward layer and epsilon greedy exploration, or by a LinUCB logic, depending on the number of training steps executed so far
* `EPS_PHASE_STEPS` is the number training steps to run for training the encoding network before switching to `LinUCB`
> * If negative, the encoding network is assumed to be already trained
> * If the number of steps is less than or equal to `EPS_PHASE_STEPS`, `epsilon greedy` is used, otherwise `LinUCB`
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/agents/neural_linucb_agent/NeuralLinUCBAgent)

### network types

Which network architecture to use for the `epsGreedy` or `NeuralLinUCB` agents

```
NETWORK_TYPE = ['commontower', 'dotproduct']
```

**GlobalAndArmCommonTowerNetwork:** (`commontower`)
* This network takes the output of the global and per-arm networks, and leads them through a common network, that in turn outputs reward estimates
> * `GLOBAL_LAYERS` - Iterable of ints. Specifies the layers of the global tower
> * `ARM_LAYERS` - Iterable of ints. Specifies the layers of the arm tower
> * `COMMON_LAYERS` - Iterable of ints. Specifies the layers of the common tower
* The network produced by this function can be used either in `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`
> * In the former case, the network must have `output_dim=1`, it is going to be an instance of `QNetwork`, and used in the policy as a reward prediction network
> * In the latter case, the network will be an encoding network with its output consumed by a reward layer or a `LinUCB` method. The specified `output_dim` will be the encoding dimension
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/networks/global_and_arm_feature_network/GlobalAndArmCommonTowerNetwork)

**GlobalAndArmDotProductNetwork:** (`dotproduct`)
* This network calculates the **dot product** of the output of the global and per-arm networks and returns them as reward estimates
> * `GLOBAL_LAYERS` - Iterable of ints. Specifies the layers of the global tower
> * `ARM_LAYERS` - Iterable of ints. Specifies the layers of the arm tower
* [docs](https://www.tensorflow.org/agents/api_docs/python/tf_agents/bandits/networks/global_and_arm_feature_network/GlobalAndArmDotProductNetwork)

### define agent and network

In [45]:
# ================================
# Agents
# ================================
AGENT_TYPE      = 'epsGreedy' # 'LinUCB' | 'LinTS |, 'epsGreedy' | 'NeuralLinUCB'

# Parameters for linear agents (LinUCB and LinTS).
AGENT_ALPHA     = 0.1

# Parameters for neural agents (NeuralEpsGreedy and NerualLinUCB).
EPSILON         = 0.01
LR              = 0.05

# Parameters for NeuralLinUCB
ENCODING_DIM    = 5
EPS_PHASE_STEPS = 1000

# ================================
# Agent's Preprocess Network
# ================================
NETWORK_TYPE    = "dotproduct" # 'commontower' | 'dotproduct'

if AGENT_TYPE == 'NeuralLinUCB':
    NETWORK_TYPE = 'commontower'
    

GLOBAL_LAYERS   = [16, 4]
ARM_LAYERS      = [16, 4]
COMMON_LAYERS   = [4]

observation_and_action_constraint_splitter = None

print(f"AGENT_TYPE      : {AGENT_TYPE}")
print(f"NETWORK_TYPE    : {NETWORK_TYPE}")
print(f"AGENT_ALPHA     : {AGENT_ALPHA}")
print(f"EPSILON         : {EPSILON}")
print(f"LR              : {LR}")
print(f"ENCODING_DIM    : {ENCODING_DIM}")
print(f"EPS_PHASE_STEPS : {EPS_PHASE_STEPS}")
print(f"GLOBAL_LAYERS   : {GLOBAL_LAYERS}")
print(f"ARM_LAYERS      : {ARM_LAYERS}")
print(f"COMMON_LAYERS   : {COMMON_LAYERS}")

AGENT_TYPE      : epsGreedy
NETWORK_TYPE    : dotproduct
AGENT_ALPHA     : 0.1
EPSILON         : 0.01
LR              : 0.05
ENCODING_DIM    : 5
EPS_PHASE_STEPS : 1000
GLOBAL_LAYERS   : [16, 4]
ARM_LAYERS      : [16, 4]
COMMON_LAYERS   : [4]


### Agent Factory

**TODO:**
* consolidate agent, network, and hparams

In [46]:
# from tf_agents.bandits.agents import greedy_reward_prediction_agent

# network = None
# observation_and_action_constraint_splitter = None

# # global_step = tf.Variable(0)
# global_step = tf.compat.v1.train.get_or_create_global_step()

# if HPARAMS['network_type'] == 'commontower':
#     network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
#         observation_spec = observation_spec, 
#         global_layers = HPARAMS['global_layers'], 
#         arm_layers = HPARAMS['per_arm_layers'], 
#         common_layers = HPARAMS['common_layers'],
#         # output_dim = 1
#     )
# elif HPARAMS['network_type'] == 'dotproduct':
#     network = global_and_arm_feature_network.create_feed_forward_dot_product_network(
#         observation_spec = observation_spec, 
#         global_layers = HPARAMS['global_layers'], 
#         arm_layers = HPARAMS['per_arm_layers']
#     )
    
# # agent = greedy_reward_prediction_agent.GreedyRewardPredictionAgent()
    
# agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
#     time_step_spec=time_step_spec,
#     action_spec=action_spec,
#     reward_network=network,
#     optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=HPARAMS['learning_rate']),
#     epsilon=HPARAMS['epsilon'],
#     observation_and_action_constraint_splitter=(
#         observation_and_action_constraint_splitter
#     ),
#     accepts_per_arm_features=True,
#     emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
#     train_step_counter=global_step,
#     info_fields_to_inherit_from_greedy=[
#         policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
#     ],
#     name='OffpolicyNeuralEpsGreedyAgent'
# )
# agent.initialize()

# print(f"Agent: {agent.name}\n")
# if network:
#     print(f"Network: {network.name}")

In [63]:
from tf_agents.bandits.policies import policy_utilities

network = None

if AGENT_TYPE == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=per_arm_tf_env.time_step_spec(),
        action_spec=per_arm_tf_env.action_spec(),
        alpha=AGENT_ALPHA,
        accepts_per_arm_features=True,
        dtype=tf.float32,
    )
elif AGENT_TYPE == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=per_arm_tf_env.time_step_spec(),
        action_spec=per_arm_tf_env.action_spec(),
        alpha=AGENT_ALPHA,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter
        ),
        accepts_per_arm_features=True,
        dtype=tf.float32,
    )
elif AGENT_TYPE == 'epsGreedy':
    obs_spec = per_arm_tf_env.observation_spec()
    if NETWORK_TYPE == 'commontower':
        network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec = obs_spec, 
            global_layers = GLOBAL_LAYERS, 
            arm_layers = ARM_LAYERS, 
            common_layers = COMMON_LAYERS,
            # output_dim = 1
        )
    elif NETWORK_TYPE == 'dotproduct':
        network = global_and_arm_feature_network.create_feed_forward_dot_product_network(
            observation_spec = obs_spec, 
            global_layers = GLOBAL_LAYERS, 
            arm_layers = ARM_LAYERS
        )
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=per_arm_tf_env.time_step_spec(),
        action_spec=per_arm_tf_env.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter
        ),
        accepts_per_arm_features=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
    )

elif AGENT_TYPE == 'NeuralLinUCB':
    obs_spec = per_arm_tf_env.observation_spec()
    network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec = obs_spec, 
            global_layers = GLOBAL_LAYERS, 
            arm_layers = ARM_LAYERS, 
            common_layers = COMMON_LAYERS,
            output_dim = ENCODING_DIM
        )
    )
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=per_arm_tf_env.time_step_spec(),
        action_spec=per_arm_tf_env.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
    )
    
print(f"Agent: {agent.name}\n")

if network:
    print(f"Network: {network.name}")

NameError: name 'per_arm_tf_env' is not defined

In [64]:
pprint(agent.policy.trajectory_spec)

NameError: name 'agent' is not defined

In [49]:
print('training data spec: ', agent.training_data_spec)

NameError: name 'agent' is not defined

In [50]:
print('observation spec in training: ', agent.training_data_spec.observation)

NameError: name 'agent' is not defined

In [51]:
print('chosen arm features: ', agent.training_data_spec.policy_info.chosen_arm_features)

NameError: name 'agent' is not defined

In [None]:
print("TimeStep Spec (for each batch):\n", agent.time_step_spec, "\n")

In [None]:
print("Action Spec (for each batch):\n", agent.action_spec, "\n")

In [None]:
pprint(agent.policy.trajectory_spec) # TODO check observation between this and next cell

### replay buffers (wip)

* Note that when the replay buffer object is initialized, it requires the `data_spec` of the elements that it will store. * This spec corresponds to the TensorSpec of trajectory elements that will be added to the buffer
* This spec is usually acquired by looking at an agent's `agent.collect_data_spec` which defines the shapes, types, and structures expected by the agent when training

In [None]:
import reverb
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils

In [None]:
agent.collect_data_spec

In [None]:
replay_buffer_signature = tensor_spec.from_spec(
      agent.collect_data_spec
)
replay_buffer_signature

In [None]:
replay_buffer_signature = tensor_spec.add_outer_dim(
      replay_buffer_signature
)
replay_buffer_signature

In [None]:
# table_name = 'uniform_table'
# replay_buffer_capacity = 2000 # @param {type:"integer"}

# table = reverb.Table(
#     table_name,
#     max_size=replay_buffer_capacity,
#     sampler=reverb.selectors.Uniform(),
#     remover=reverb.selectors.Fifo(),
#     rate_limiter=reverb.rate_limiters.MinSize(1),
#     signature=replay_buffer_signature
# )
# reverb_server = reverb.Server([table])

# replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
#     agent.collect_data_spec,
#     table_name=table_name,
#     sequence_length=None,
#     local_server=reverb_server
# )

# rb_observer = reverb_utils.ReverbAddEpisodeObserver(
#     replay_buffer.py_client,
#     table_name,
#     replay_buffer_capacity
# )

# replay_buffer

In [None]:
num_iterations = 15                 # @param {type:"integer"}
collect_episodes_per_iteration = 2  # @param {type:"integer"}

fc_layer_params = (100,)

learning_rate = 1e-3   # @param {type:"number"}
log_interval = 25      # @param {type:"integer"}
num_eval_episodes = 10 # @param {type:"integer"}
eval_interval = 50     # @param {type:"integer"}

def collect_episode(
    environment, policy, num_episodes
):    
    driver = py_driver.PyDriver(
        environment,
        py_tf_eager_policy.PyTFEagerPolicy(
            policy, 
            use_tf_function=True
        ),
        [rb_observer],
        max_episodes=num_episodes
    )
    initial_time_step = environment.reset()
    driver.run(initial_time_step)

## trajectory function

In [52]:
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.trajectories import trajectory

# replay_buffer_signature = tensor_spec.add_outer_dim(
#       replay_buffer_signature
# )


def _trajectory_fn(element): # hparams
    """Converts a dataset element into a trajectory."""
    global_features = _get_global_context_features(element)
    arm_features = _get_per_arm_features(element)
    
    # # tmp 
    # print(f"global_features: {global_features}")
    # print(f"arm_features: {arm_features}")
    
    # Adds a time dimension.
    # arm_features = _add_outer_dimension(arm_features)
    # arm_features = tensor_spec.add_outer_dim(arm_features)
    
    # obs spec
    observation = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            global_features,
            # _add_outer_dimension(global_features)
        # bandit_spec_utils.PER_ARM_FEATURE_KEY:
        #     _add_outer_dimension(arm_features),
    }
    # print("after adding extra dim...")
    # print(f"observation: {observation}")
    # print(f"arm_features: {arm_features}")
    
    # reward = tensor_spec.add_outer_dim(_get_rewards(element))
    reward = _get_rewards(element)
    # print(f"reward: {reward}")
    
    # To emit the predicted rewards in policy_info, we need to create dummy
    # rewards to match the definition in TensorSpec for the ones specified in
    # emit_policy_info set.
    # dummy_rewards = tf.zeros([HPARAMS['batch_size'], 1, HPARAMS['num_docs_to_rank']])
    # dummy_rewards = tf.zeros([HPARAMS['batch_size'], HPARAMS['num_docs_to_rank']])
    dummy_rewards = tf.zeros([HPARAMS['num_docs_to_rank']])
    policy_info = policy_utilities.PerArmPolicyInfo(
        chosen_arm_features=arm_features,
        # Pass dummy mean rewards here to match the model_spec for emitting
        # mean rewards in policy info
        predicted_rewards_mean=dummy_rewards
    )
    
# tf_agents.policies.utils.create_chosen_arm_features_info_spec(
#     observation_spec: tf_agents.typing.types.NestedTensorSpec
# ) -> tf_agents.typing.types.NestedTensorSpec
    
    if HPARAMS['model_type'] == 'neural_ucb':
        policy_info = policy_info._replace(
            predicted_rewards_optimistic=dummy_rewards
        )
        
    # print(f"observation: {observation}")
    # print(f"reward: {reward}")
    # print(f"policy_info: {policy_info}")
    # print(f"dummy_rewards: {dummy_rewards}")
    
    return trajectory.single_step(
        observation=observation,
        action=tf.zeros_like(
            reward, dtype=tf.int32
        ),  # Arm features are copied from policy info, put dummy zeros here
        policy_info=policy_info,
        reward=reward,
        discount=tf.zeros_like(reward)
    )


In [53]:
# def get_trajectory_from_environment(element): # hparams
#     """Converts a dataset element into a trajectory."""
    
#     orig_trajectory = agent.policy.trajectory_spec
#     # print(f"orig_trajectory.step_type: {orig_trajectory.step_type}")
#     # print(f"orig_trajectory.next_step_type: {orig_trajectory.next_step_type}")
    
#     global_features = _get_global_context_features(element)
#     arm_features = _get_per_arm_features(element)
    
#     # Adds a time dimension.
#     # arm_features = _add_outer_dimension(arm_features)
#     observation = {
#         bandit_spec_utils.GLOBAL_FEATURE_KEY:
#             global_features,
#             # _add_outer_dimension(global_features)
#         # bandit_spec_utils.PER_ARM_FEATURE_KEY:
#         #     _add_outer_dimension(arm_features),
#     }
    
#     # reward = _add_outer_dimension(_get_rewards(element))
#     reward = _get_rewards(element)
#     # print(f"reward:  {reward}")
#     # print(f"reward shape:  {tf.shape(reward).numpy()}")
    
#     reward_2 = tf.expand_dims(reward, 0)
#     # print(f"reward_2:  {reward_2}")
#     # print(f"reward_2 shape:  {tf.shape(reward_2).numpy()}")
    
    
#     dummy_rewards = tf.zeros([HPARAMS['num_docs_to_rank']])
#     policy_info = policy_utilities.PerArmPolicyInfo(
#         chosen_arm_features=arm_features,
#         predicted_rewards_mean=dummy_rewards
#     )
#     if HPARAMS['model_type'] == 'neural_ucb':
#         policy_info = policy_info._replace(
#             predicted_rewards_optimistic=dummy_rewards
#         )
        
#     # observation
#     obs = observation['global']
#     # print(f"obs:  {obs}")
#     # print(f"obs shape:  {tf.shape(obs).numpy()}")
    
#     obs_2 = tf.expand_dims(obs, 0)
#     # print(f"obs_2:  {obs_2}")
#     # print(f"obs_2 shape:  {tf.shape(obs_2).numpy()}")
    
#     return trajectory.Trajectory(
#         observation=obs_2,
#         action=tf.zeros_like(
#             reward_2, dtype=tf.int32
#         ),  # Arm features are copied from policy info, put dummy zeros here
#         policy_info=policy_info,
#         reward=reward_2,
#         discount=tf.zeros_like(reward_2),
#         step_type=orig_trajectory.step_type,
#         next_step_type=orig_trajectory.next_step_type
#     )

In [54]:
def build_dict_from_trajectory(
    step: int,
    next_step: int,
    trajectory: trajectories.Trajectory) -> Dict[str, Any]:
    """Builds a dict from `trajectory` data.

    Args:
    trajectory: A `trajectories.Trajectory` object.

    Returns:
    A dict holding the same data as `trajectory`.
    """
    trajectory_dict = {
        "step_type": [step].numpy(),
        "observation": [{
            "observation_batch": batch
        } for batch in trajectory.observation.numpy().tolist()],
        "action": trajectory.action.numpy().tolist(),
        "policy_info": trajectory.policy_info,
        "next_step_type": [next_step],
        "reward": trajectory.reward.numpy().tolist(),
        "discount": trajectory.discount.numpy().tolist(),
    }
    return trajectory_dict

NameError: name 'trajectories' is not defined

### write trajectories to file

In [55]:
VERSION = 'v1'

DATASET_FILE = f'{VERSION}-off-policy-trajectories.json'
!touch $DATASET_FILE

In [56]:
dataset_size = len(list(train_dataset))
print(f"dataset_size: {dataset_size}")

small_count = dataset_size/100
print(f"small_count: {small_count}")

dataset_size: 20000
small_count: 200.0


In [57]:
import time
import json

def write_trajectories_to_file(
    dataset_size: int,
    data_file: str,
    batch_size: int,
):
    batched_dataset = train_dataset.batch(batch_size)
    print(f"writting file...")
    
    data_list = []
    
    start_time = time.time()
    step = 1
    with open(data_file, "w") as f:
        for x in batched_dataset.take(count=dataset_size):
            # print(f"step: {step}")
            nexx_step = step + 1
            # print(f"nexx_step: {nexx_step}")

            single_traj = get_trajectory_from_environment(x)
            print(single_traj)
            
            _trajectory_dict = build_dict_from_trajectory(step=step, next_step=nexx_step, trajectory=single_traj)
            # print(type(trajectory_dict))
            decoded = _trajectory_dict.decode('utf-8')
            print(f"decoded: {decoded}")
            data_list.append(_trajectory_dict)

            step+=1
            
            break
            
        for entry in data_list:
            traj_dict_tmp = {}
            traj_dict_tmp['step_type'] = entry['step_type']
            traj_dict_tmp['observation'] = entry['observation']
            traj_dict_tmp['action'] = entry['action']
            traj_dict_tmp['policy_info'] = entry['policy_info']
            traj_dict_tmp['next_step_type'] = entry['next_step_type']
            traj_dict_tmp['reward'] = entry['reward']
            traj_dict_tmp['discount'] = entry['discount']
            
            # f.write(json.dumps(traj_dict_tmp) + "\n")
            
        print(f"writting to file complete...")

    end_time = time.time()
    runtime_mins = int((end_time - start_time) / 60)
    print(f"runtime_mins: {runtime_mins}")

    return data_list

In [58]:
sample_data_list = write_trajectories_to_file(
    dataset_size=int(small_count),
    data_file=DATASET_FILE,
    batch_size=2
)

# sample_data_list[0]
# sample_data_list[0]['observation']

writting file...


NameError: name 'get_trajectory_from_environment' is not defined

In [None]:
# train_utils.upload_blob(
#     bucket_name='',
#     source_file_name=,
#     destination_blob_name=f'{RUN_NAME}/candidates/xxxx.json'
# )

### validate shapes and dims

In [None]:
from tf_agents.utils import nest_utils

nest_utils.is_batched_nested_tensors(
    tensors=single_traj.policy_info.chosen_arm_features,
    specs=agent.training_data_spec.policy_info.chosen_arm_features,
    num_outer_dims=1,
    allow_extra_fields=False,
    check_dtypes=True
)

# nest_utils.is_batched_nested_tensors(
#     tensors=single_traj.observation['global'],
#     specs=agent.training_data_spec.observation['global'],
#     num_outer_dims=0,
#     allow_extra_fields=False,
#     check_dtypes=True
# )

# nest_utils.is_batched_nested_tensors(
#     tensors=single_traj.action,
#     specs=agent.training_data_spec.action,
#     num_outer_dims=1,
#     allow_extra_fields=False,
#     check_dtypes=True
# )

In [59]:
# arm_observations = per_arm_context_sampling_fn()
# print(arm_observations)

# outer_rank = nest_utils.get_outer_rank(tensors = arm_observations, specs = observation_spec['per_arm'])
# outer_rank

In [60]:
# per_arm_spec = tensor_spec.add_outer_dims_nest( # add_outer_dim
#     specs=per_arm_spec,
#     outer_dims=[HPARAMS['batch_size']]
# )

In [None]:
# len(thing['global'].numpy().tolist())
test = thing['global'].numpy().tolist()
print(f"test: {len(test)}")
print(tf.shape(test).numpy())

thingy = tf.expand_dims(test, axis=0)
print(f"thingy: {thingy}")
print(tf.shape(thingy).numpy())

## train loop

`agent.train(experience=...)`

where `experience` is a batch of trajectories data in the form of a Trajectory. 
* The structure of experience must match that of `self.training_data_spec`. 
* All tensors in experience must be shaped [batch, time, ...] where time must be equal to self.train_step_length if that property is not None.

In [None]:
import collections
from tf_agents.utils import common

NUM_EPOCHS = 1

# global_step = tf.compat.v1.train.get_global_step()

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

train_loss = collections.defaultdict(list)

for epoch in range(NUM_EPOCHS):
    
    iterator = iter(train_dataset.batch(2))
    data = next(iterator)
    pprint(f"print data: {data}")
    
    trajectories = _trajectory_fn(data)
    pprint(f"print trajectories: {trajectories}")
    
    # All tensors in experience must be shaped [batch, time, ...] 
    step = agent.train_step_counter.numpy()
    loss = agent.train(experience=trajectories)
    
    # break
    
    
#     for x in train_dataset.batch(1).take(1): #HPARAMS['batch_size']).take(1):
#         # print(f"print X: {len(x)}")
#         # break
#         step = agent.train_step_counter.numpy()
#         print(f"step X: {step}")
#         trajectories = _trajectory_fn(x)
#         # print(f"print trajectories: {trajectories}")
#         # break
    
#         loss = agent.train(experience=trajectories)

#         train_loss[f"epoch:{epoch + 1}"].append(loss.numpy())
    
# train_outputs = collections.namedtuple(
#     "TrainOutputs",["policy", "train_loss"]
# )

# train_outputs(agent.policy, train_loss)