# Contextual Bandits for Ranking with TF-Agents

> see [ranking tutorial](https://www.tensorflow.org/agents/tutorials/ranking_tutorial)

### Overview

* The contextual bandits approach is classified as an extension of multi-armed bandits
* A contextual multi-armed bandit problem is a simplified reinforcement learning problem in which the agent observes a context and then takes an action from a set of possible actions

> **TODO**

The **Bandit Ranking** agent will be similar to the `NeuralEpsilonGreedy` agent. Main differences:

* The item features are stored in the `per_arm` part of the observation, in the order of how they are recommended
* Since this ordered list of items expresses what action was taken by the policy,
the `action` value of the trajectory is not used by the agent.

> Note: difference between the "per-arm" observation received by the policy vs the agent:

While the agent receives the items in the recommendation slots, the policy receives the items that are available for recommendation. The user is responsible for converting the observation to the
syntax required by the agent.


The training observation contains the global features and the features of the items in the recommendation slots 
* The item features are stored in the `per_arm` part of the observation, in the order of how they are recommended
* Note: since this ordered list of items expresses what action was taken by the policy, the action value of the trajectory is not used by the agent

## Load notebook config

* use the prefix defined in `00-env-setup`

In [1]:
VERSION        = "v1"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

# PREFIX = 'mabv1'

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v1


In [2]:
# staging GCS
# The active gcloud project is read from the shell; the `!` capture returns a
# list of output lines, so the project id is taken from the first line.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared notebook configuration from the bucket, echo it, and execute
# it so its assignments become notebook globals (`.n` is the captured output
# joined into a single newline-separated string).
# NOTE(review): `exec` runs whatever code is stored in the bucket object with
# the notebook's privileges -- only use this with a bucket you control.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v1"
VERSION                  = "v1"

BUCKET_NAME              = "rec-bandits-v1-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v1-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v1-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid_vertex.movielens_ds_rec_bandits_v1"
BIGQUERY_TABLE_ID        = "hybrid_vertex.movielens_ds_rec_bandits_v1.training_dataset"

REPO

## imports

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [4]:
!pwd

/home/jupyter/tf_vertex_agents/03-ranking


In [54]:
import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar, Iterable
from datetime import datetime
import time
from pprint import pprint
import pickle as pkl
import pandas as pd

from tqdm import tqdm

# logging
import logging
logging.disable(logging.WARNING)

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform, storage

# tensorflow
import tensorflow as tf
# from tf_agents.agents import TFAgent

# from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
# from tf_agents.drivers import dynamic_step_driver
# from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# from tf_agents.bandits.agents import lin_ucb_agent
# from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities

from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.trajectories import trajectory
from tf_agents.bandits.environments import ranking_environment
from tf_agents.bandits.agents import ranking_agent

# GPU
from numba import cuda 
import gc

import sys
sys.path.append("..")

# this repo
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config
from src.per_arm_rl import train_utils as train_utils

# Fail fast on unsupported TensorFlow versions. The major version is parsed
# from the dotted version string instead of looking at its first character, so
# the check does not rely on the major version being a single digit.
if tf.__version__.split(".")[0] != "2":
    raise RuntimeError("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [7]:
device = cuda.get_current_device()
device.reset()
gc.collect()

23

In [8]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Data prep

In [9]:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [10]:
NUM_EXAMPLES_PER_LIST = 5 # 3 | 5

# SPLIT = "val"
# SPLIT = "listwise-val"
SPLIT = f"listwise-{NUM_EXAMPLES_PER_LIST}n-val"

print(f"SPLIT: {SPLIT}")

SPLIT: listwise-5n-val


In [11]:
! gsutil ls $BUCKET_URI/$DATA_GCS_PREFIX/$SPLIT/

gs://rec-bandits-v1-hybrid-vertex-bucket/data/listwise-5n-val/ml-100k-listwise-5n-val.tfrecord


### Val

In [12]:
# Validation split: listwise examples with NUM_EXAMPLES_PER_LIST items each.
SPLIT = f"listwise-{NUM_EXAMPLES_PER_LIST}n-val"

# Collect the gs:// URI of every TFRecord file under the split prefix. The URI
# is built from the bucket and object name directly: `blob.public_url`
# percent-encodes the object name, so rewriting it into a gs:// path can yield
# a path that does not match the real object.
val_files = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in storage_client.list_blobs(BUCKET_NAME, prefix=f"{DATA_GCS_PREFIX}/{SPLIT}")
    if ".tfrecord" in blob.name
]

# An empty file list would silently produce an empty dataset and only fail
# later, far from the cause.
if not val_files:
    raise FileNotFoundError(
        f"No .tfrecord files found under gs://{BUCKET_NAME}/{DATA_GCS_PREFIX}/{SPLIT}"
    )

val_dataset = tf.data.TFRecordDataset(val_files)
val_dataset = val_dataset.map(data_utils.parse_lw_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)

# Show one parsed example as a sanity check of the schema.
for example in val_dataset.take(1):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 3,  0,  0,  0, 19])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'94', b'245', b'403', b'50', b'470'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'346'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([3., 4., 3., 5., 3.], dtype=float32)>}


### Train

In [12]:
# SPLIT = "train"
# SPLIT = "listwise-train"
# SPLIT = f"listwise-{NUM_EXAMPLES_PER_LIST}n-train"

# train_files = []
# for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/{SPLIT}'):
#     if '.tfrecord' in blob.name:
#         train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
# train_dataset = tf.data.TFRecordDataset(train_files)
# train_dataset = train_dataset.map(data_utils.parse_lw_tfrecord)

# for x in train_dataset.batch(1).take(1):
#     pprint(x)

In [14]:
train_dataset = val_dataset

### Vocab

In [15]:
GENERATE_VOCABS = False

print(f"GENERATE_VOCABS: {GENERATE_VOCABS}")

GENERATE_VOCABS: False


In [16]:
EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{VOCAB_SUBDIR}/{VOCAB_FILENAME}'

!gsutil cp gs://mabv1-hybrid-vertex-bucket/vocabs/vocab_dict.pkl $EXISTING_VOCAB_FILE

Copying gs://mabv1-hybrid-vertex-bucket/vocabs/vocab_dict.pkl [Content-Type=application/octet-stream]...
/ [1 files][142.9 KiB/142.9 KiB]                                                
Operation completed over 1 objects/142.9 KiB.                                    


In [17]:
# Download the pickled vocabulary dictionary from GCS to the working directory.
data_utils.download_blob(
    project_id = PROJECT_ID,
    bucket_name = BUCKET_NAME, 
    source_blob_name = f"{VOCAB_SUBDIR}/{VOCAB_FILENAME}", 
    destination_file_name= VOCAB_FILENAME
)

# The context manager closes the file even if unpickling fails.
# NOTE(review): pickle can execute arbitrary code while loading; only load
# vocab files that come from a trusted bucket.
with open(VOCAB_FILENAME, 'rb') as filehandler:
    vocab_dict = pkl.load(filehandler)

# List the features for which a vocabulary is available.
for key in vocab_dict.keys():
    pprint(key)

Downloaded storage object vocabs/vocab_dict.pkl from bucket rec-bandits-v1-hybrid-vertex-bucket to local file vocab_dict.pkl.
'movie_id'
'user_id'
'user_occupation_text'
'movie_genres'
'bucketized_user_age'
'max_timestamp'
'min_timestamp'
'timestamp_buckets'


# Global & Per-Arm feature embedding models 

In [18]:
NUM_OOV_BUCKETS       = 2
GLOBAL_EMBEDDING_SIZE = 32
MV_EMBEDDING_SIZE     = 32 #32

BATCH_SIZE            = 5 #128
EVAL_BATCH_SIZE       = 1

NUM_ITEMS             = NUM_EXAMPLES_PER_LIST # 3 | 5 
NUM_SLOTS             = 2

DISTANCE_THRESHOLD    = 0.5

print(f"NUM_OOV_BUCKETS    : {NUM_OOV_BUCKETS}")
print(f"GLOBAL_EMB_SIZE    : {GLOBAL_EMBEDDING_SIZE}")
print(f"MV_EMB_SIZE        : {MV_EMBEDDING_SIZE}")
print(f"BATCH_SIZE         : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE    : {EVAL_BATCH_SIZE}")
print(f"NUM_ITEMS          : {NUM_ITEMS}")
print(f"NUM_SLOTS          : {NUM_SLOTS}")
print(f"DISTANCE_THRESHOLD : {DISTANCE_THRESHOLD}")

NUM_OOV_BUCKETS    : 2
GLOBAL_EMB_SIZE    : 32
MV_EMB_SIZE        : 32
BATCH_SIZE         : 5
EVAL_BATCH_SIZE    : 1
NUM_ITEMS          : 5
NUM_SLOTS          : 2
DISTANCE_THRESHOLD : 0.5


In [19]:
# Pull one example (batch size 1) to use as a fixture for the cells below.
# The previous single-iteration `for` loop added nothing, so it is removed.
iterator = iter(train_dataset.batch(1))
data = next(iterator)

data

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 3,  0,  0,  0, 19]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'94', b'245', b'403', b'50', b'470']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[3., 4., 3., 5., 3.]], dtype=float32)>}

## Embedding layers

#### User ID

In [20]:
# Symbolic input for the raw user id string (one id per example).
user_id_input_layer = tf.keras.Input(
    name="user_id",
    shape=(1,),
    dtype=tf.string
)

# Map each user id string to an integer index; NUM_OOV_BUCKETS indices are
# reserved for ids that are not in the vocabulary.
user_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['user_id'],
)(user_id_input_layer)

# Learnable embedding table sized to the vocabulary plus the OOV indices.
user_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_id_lookup)

# Sum over the second-to-last axis so the model emits a single
# GLOBAL_EMBEDDING_SIZE-wide vector per example.
user_id_embedding = tf.reduce_sum(user_id_embedding, axis=-2)

In [21]:
test_user_id_model = tf.keras.Model(inputs=user_id_input_layer, outputs=user_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["user_id"])
    print(test_user_id_model(x["user_id"]))

tf.Tensor([b'346'], shape=(1,), dtype=string)
tf.Tensor(
[[-0.02037551  0.00986262 -0.01213836  0.00730572 -0.01425978  0.03866805
   0.03549664 -0.00837642 -0.03165653 -0.01890605  0.00869458 -0.0490423
   0.03900332 -0.03679205 -0.01939126  0.01251271 -0.04736058 -0.01772742
   0.01514887  0.04953028 -0.01093968  0.04061886  0.01655659 -0.0499462
  -0.01355903 -0.0124302  -0.0064685  -0.04333378 -0.00015298  0.01025945
  -0.03613396  0.049801  ]], shape=(1, 32), dtype=float32)


#### Movie ID

In [22]:
len(vocab_dict['movie_id'])

1683

In [23]:
# Symbolic input for raw movie id strings.
mv_id_input_layer = tf.keras.Input(
    name="movie_id",
    shape=(1,),
    dtype=tf.string
)

# Map each movie id string to an integer index; NUM_OOV_BUCKETS indices are
# reserved for ids that are not in the vocabulary.
mv_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['movie_id'],
)(mv_id_input_layer)

# Learnable embedding table sized to the vocabulary plus the OOV indices.
mv_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_id_lookup)

# Unlike the user id embedding, no reduction is applied here, so every movie id
# keeps its own embedding vector.
# mv_id_embedding = tf.reduce_sum(mv_id_embedding, axis=-2)

In [24]:
test_mv_id_model = tf.keras.Model(inputs=mv_id_input_layer, outputs=mv_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_id"])
    list_length = x["movie_id"].shape[1]
    print(test_mv_id_model(x["movie_id"]))

tf.Tensor([[b'94' b'245' b'403' b'50' b'470']], shape=(1, 5), dtype=string)
tf.Tensor(
[[[-0.0389544   0.03675831 -0.01776552  0.00942166 -0.01629441
   -0.00985629  0.00080961 -0.02068306  0.0052003  -0.0448141
    0.04415883 -0.03431238  0.0175052  -0.03762475  0.01078951
    0.01346042  0.04789348  0.0481027   0.04688222 -0.03028274
    0.03416605  0.02807817  0.01592631  0.02392964 -0.019668
    0.04940886  0.03796076 -0.02723338 -0.04419798  0.04680282
    0.03091855 -0.02986611]
  [-0.02931207  0.03181832 -0.02282451 -0.04951935 -0.01029684
    0.01362442 -0.02827499 -0.03442618 -0.02571412  0.04570082
   -0.03878728 -0.01451142 -0.02110769  0.02098748 -0.02328435
    0.02699569 -0.02113044 -0.04129428 -0.01271294  0.04279878
   -0.04521221 -0.00175774  0.03274662 -0.04742727 -0.03279878
    0.00193717  0.03231787  0.04362129 -0.04288712 -0.03266909
    0.00267907  0.02123124]
  [-0.04136787  0.01924491 -0.0117949  -0.02062864  0.04856871
   -0.04096658 -0.02402866  0.00342511 -0

In [25]:
test_mv_id_model = tf.keras.Model(inputs=mv_id_input_layer, outputs=mv_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_id"])
    list_length = x["movie_id"].shape[1]
    print(test_mv_id_model(x["movie_id"]))

tf.Tensor([[b'94' b'245' b'403' b'50' b'470']], shape=(1, 5), dtype=string)
tf.Tensor(
[[[-0.0389544   0.03675831 -0.01776552  0.00942166 -0.01629441
   -0.00985629  0.00080961 -0.02068306  0.0052003  -0.0448141
    0.04415883 -0.03431238  0.0175052  -0.03762475  0.01078951
    0.01346042  0.04789348  0.0481027   0.04688222 -0.03028274
    0.03416605  0.02807817  0.01592631  0.02392964 -0.019668
    0.04940886  0.03796076 -0.02723338 -0.04419798  0.04680282
    0.03091855 -0.02986611]
  [-0.02931207  0.03181832 -0.02282451 -0.04951935 -0.01029684
    0.01362442 -0.02827499 -0.03442618 -0.02571412  0.04570082
   -0.03878728 -0.01451142 -0.02110769  0.02098748 -0.02328435
    0.02699569 -0.02113044 -0.04129428 -0.01271294  0.04279878
   -0.04521221 -0.00175774  0.03274662 -0.04742727 -0.03279878
    0.00193717  0.03231787  0.04362129 -0.04288712 -0.03266909
    0.00267907  0.02123124]
  [-0.04136787  0.01924491 -0.0117949  -0.02062864  0.04856871
   -0.04096658 -0.02402866  0.00342511 -0

In [26]:
list_length

5

In [27]:
for x in train_dataset.batch(1).take(1):
    print(x["movie_id"])
    single_sample = x["movie_id"][0]
    item_1 = tf.gather(x["movie_id"][0], 0)
    print(item_1)

tf.Tensor([[b'94' b'245' b'403' b'50' b'470']], shape=(1, 5), dtype=string)
tf.Tensor(b'94', shape=(), dtype=string)


In [28]:
single_sample[0]

<tf.Tensor: shape=(), dtype=string, numpy=b'94'>

#### Movie Genres

In [29]:
len(vocab_dict['movie_genres'])

20

In [30]:
# Symbolic input for movie genre ids. Genre ids are integer categorical values
# (the dataset stores them as int64), so the input is declared as int64 rather
# than float32: this matches the data and the IntegerLookup layer below, and
# avoids a round trip through floating point.
mv_genre_input_layer = tf.keras.Input(
    name="movie_genres",
    shape=(1,),
    dtype=tf.int64
)

# Map each genre id to an integer index; NUM_OOV_BUCKETS indices are reserved
# for ids that are not in the vocabulary.
# NOTE(review): `oov_value` looks like the legacy spelling of `oov_token` --
# confirm against the installed Keras version before renaming.
mv_genre_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab_dict['movie_genres'],
    num_oov_indices=NUM_OOV_BUCKETS,
    oov_value=0,
)(mv_genre_input_layer)

# Learnable embedding table sized to the vocabulary plus the OOV indices.
mv_genre_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_genres']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_genre_lookup)

# No reduction is applied, so every genre id keeps its own embedding vector.
# mv_genre_embedding = tf.reduce_sum(mv_genre_embedding, axis=-2)

In [31]:
test_mv_gen_model = tf.keras.Model(inputs=mv_genre_input_layer, outputs=mv_genre_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_genres"])
    print(test_mv_gen_model(x["movie_genres"]))

tf.Tensor([[ 3  0  0  0 19]], shape=(1, 5), dtype=int64)
tf.Tensor(
[[[ 4.2853285e-02  1.8181492e-02 -1.4315654e-02 -1.6139604e-02
   -1.6651712e-02  4.9005236e-02 -3.1097233e-02 -3.1315077e-02
    1.4289271e-02 -2.6588297e-02  2.6090298e-02 -4.6074580e-02
    4.2879704e-02 -4.7649909e-02 -1.0367822e-02  3.7677322e-02
   -2.9790236e-02 -3.5468124e-02  3.4085635e-02 -4.1464724e-02
   -2.2671569e-02  1.5857939e-02  1.3166379e-02 -2.0465899e-02
    3.4682453e-06  4.8607323e-02  4.4099465e-03  7.7720173e-03
    1.0728013e-02 -3.7845731e-02 -4.4549312e-02 -4.3600656e-02]
  [-2.0621419e-02  3.5770312e-03  2.3640063e-02 -2.5104785e-02
   -1.4601719e-02  1.2637090e-02 -2.0740962e-02  3.7907671e-02
    2.2053827e-02 -2.7528429e-02  3.2271352e-02  2.5504831e-02
    1.4312867e-02  3.3271078e-02 -4.9614873e-02  1.3418306e-02
    5.2506700e-03 -1.8708408e-02 -4.1839767e-02  1.7626408e-02
   -2.5064290e-02 -4.1479096e-03 -3.1266391e-02 -4.9992859e-02
   -3.0914938e-02 -2.9711951e-02  4.4089805e-02 -

In [32]:
for x in train_dataset.batch(1).take(1):
    print(x["movie_genres"])
    mv_gen_sample = x["movie_genres"]
    item_gen = tf.gather(x["movie_genres"], 0)
    print(item_gen)

tf.Tensor([[ 3  0  0  0 19]], shape=(1, 5), dtype=int64)
tf.Tensor([ 3  0  0  0 19], shape=(5,), dtype=int64)


In [33]:
for x in train_dataset.batch(1).take(1):
    print(x["user_rating"])
    # mv_gen_sample = x["movie_genres"]
    # item_gen = tf.gather(x["movie_genres"], 0)
    # print(item_gen)

tf.Tensor([[3. 4. 3. 5. 3.]], shape=(1, 5), dtype=float32)


## Sampling Functions

#### item sampling

In [None]:
# def _get_per_arm_features(x):
#     """
#     This function generates a single per-arm observation vector.
#     """
    
#     id_1 = tf.gather(x["movie_id"][0], 0)
#     id_2 = tf.gather(x["movie_id"][0], 1)
#     id_3 = tf.gather(x["movie_id"][0], 2)
#     # print(f"id_1 shape: {id_1.shape}")
    
#     stacked_ids = tf.stack(
#         [
#             test_mv_id_model(id_1), 
#             test_mv_id_model(id_2), 
#             test_mv_id_model(id_3)
#         ], axis = 0
#     )
#     # print(f"stacked_ids shape: {stacked_ids.shape}")
#     # print(f"stacked_ids: {stacked_ids}")
        
    
#     gen_1 = tf.gather(x["movie_genres"][0], 0)
#     gen_2 = tf.gather(x["movie_genres"][0], 1)
#     gen_3 = tf.gather(x["movie_genres"][0], 2)
#     # print(f"gen_1 shape: {gen_1.shape}")
    
#     stacked_gens = tf.stack(
#         [
#             test_mv_gen_model(gen_1), 
#             test_mv_gen_model(gen_2), 
#             test_mv_gen_model(gen_3)
#         ], axis = 0
#     )
#     # print(f"stacked_gens shape: {stacked_gens.shape}")
#     # print(f"stacked_gens: {stacked_gens}")
    
#     ratings_list = []

    
#     concat_embeddings = tf.concat(
#         [stacked_ids, stacked_gens], axis=1
#     )
#     # print(f"concat_embeddings shape: {concat_embeddings.shape}")
    
#     return concat_embeddings

In [141]:
def _get_per_arm_features(x):
    """Build the per-arm feature matrix for one example, highest-rated item first.

    Only the first element of the batch is read (`x[...][0]`), so `x` should be
    a batch of size 1. Each of the `NUM_EXAMPLES_PER_LIST` items is embedded
    with `test_mv_id_model` and `test_mv_gen_model`, the two embeddings are
    concatenated along the feature axis, and the rows are reordered by
    descending `user_rating` -- the same ordering `return_ranking_rewards`
    applies to the ratings, so row i of the result lines up with reward i.

    Args:
      x: dict of batched features containing "movie_id", "movie_genres" and
        "user_rating".

    Returns:
      A 2-D float tensor with one row per item, shaped
      [NUM_EXAMPLES_PER_LIST, id_embedding_dim + genre_embedding_dim].
    """
    movie_ids = x["movie_id"][0]
    movie_genres = x["movie_genres"][0]
    item_positions = range(NUM_EXAMPLES_PER_LIST)

    # Items are embedded one at a time (one id in, one embedding vector out);
    # feeding the whole list in a single call would change the output rank.
    stacked_ids = tf.stack(
        [test_mv_id_model(tf.gather(movie_ids, i)) for i in item_positions],
        axis=0,
    )
    stacked_gens = tf.stack(
        [test_mv_gen_model(tf.gather(movie_genres, i)) for i in item_positions],
        axis=0,
    )

    # Join id and genre embeddings side by side: one row per item.
    concat_embeddings = tf.concat([stacked_ids, stacked_gens], axis=1)

    # stable=True keeps items with tied ratings in their original relative
    # order, so the ordering is reproducible.
    indices = tf.argsort(x["user_rating"][0], direction="DESCENDING", stable=True)

    # `tf.gather` already returns the reordered 2-D tensor; re-stacking it along
    # axis 0 (as was done before) is a no-op.
    return tf.gather(concat_embeddings, indices)

In [142]:
def return_ranking_rewards(x):
    """Return the first example's user ratings sorted from highest to lowest.

    The raw ratings, the sorting indices and the sorted ratings are printed
    along the way for inspection.

    Args:
      x: dict of batched features; only `x["user_rating"][0]` is read.

    Returns:
      The ratings of the first example in the batch, in descending order.
    """
    ratings = x["user_rating"][0]
    print(f"ratings_list: {ratings}")

    # Positions of the ratings from best to worst.
    descending_order = tf.argsort(ratings, direction="DESCENDING")
    print(f"indices: {descending_order}")

    # Reorder the ratings with those positions.
    sorted_ratings = tf.gather(ratings, descending_order, batch_dims=-1)
    print(f"ordered_list: {sorted_ratings}")

    return sorted_ratings

    
ratings_list = return_ranking_rewards(data)
ratings_list

ratings_list: [3. 4. 3. 5. 3.]
indices: [3 1 0 2 4]
ordered_list: [5. 4. 3. 3. 3.]


<tf.Tensor: shape=(5,), dtype=float32, numpy=array([5., 4., 3., 3., 3.], dtype=float32)>

In [143]:
test_arms = _get_per_arm_features(data)

PER_ARM_DIM = test_arms.shape[1]            
print(f"PER_ARM_DIM: {PER_ARM_DIM}")

test_arms

concat_embeddings shape: (5, 64)
PER_ARM_DIM: 64


<tf.Tensor: shape=(5, 64), dtype=float32, numpy=
array([[ 4.98176925e-02,  4.28270437e-02,  1.54910944e-02,
        -4.94843982e-02, -2.98609380e-02,  8.16265494e-03,
         3.23163904e-02, -1.76907070e-02, -2.73703225e-02,
        -2.78761983e-02, -8.61145183e-03,  2.37810276e-02,
         4.18272503e-02, -3.04570552e-02,  8.59882683e-03,
         3.04696672e-02, -1.34807937e-02,  4.67852838e-02,
        -1.81840882e-02,  3.22087444e-02, -3.49641554e-02,
         4.20926102e-02,  4.93626855e-02,  3.21062915e-02,
         1.10036619e-02, -2.24628579e-02,  1.69007666e-02,
        -4.72860709e-02, -3.34315449e-02, -2.40819696e-02,
        -3.73691432e-02, -2.64766570e-02, -2.06214190e-02,
         3.57703120e-03,  2.36400627e-02, -2.51047853e-02,
        -1.46017186e-02,  1.26370899e-02, -2.07409617e-02,
         3.79076712e-02,  2.20538266e-02, -2.75284294e-02,
         3.22713517e-02,  2.55048312e-02,  1.43128671e-02,
         3.32710780e-02, -4.96148728e-02,  1.34183057e-02,
       

In [144]:
test_arms = _get_per_arm_features(data)

PER_ARM_DIM = test_arms.shape[1]            
print(f"PER_ARM_DIM: {PER_ARM_DIM}")

test_arms

concat_embeddings shape: (5, 64)
PER_ARM_DIM: 64


<tf.Tensor: shape=(5, 64), dtype=float32, numpy=
array([[ 4.98176925e-02,  4.28270437e-02,  1.54910944e-02,
        -4.94843982e-02, -2.98609380e-02,  8.16265494e-03,
         3.23163904e-02, -1.76907070e-02, -2.73703225e-02,
        -2.78761983e-02, -8.61145183e-03,  2.37810276e-02,
         4.18272503e-02, -3.04570552e-02,  8.59882683e-03,
         3.04696672e-02, -1.34807937e-02,  4.67852838e-02,
        -1.81840882e-02,  3.22087444e-02, -3.49641554e-02,
         4.20926102e-02,  4.93626855e-02,  3.21062915e-02,
         1.10036619e-02, -2.24628579e-02,  1.69007666e-02,
        -4.72860709e-02, -3.34315449e-02, -2.40819696e-02,
        -3.73691432e-02, -2.64766570e-02, -2.06214190e-02,
         3.57703120e-03,  2.36400627e-02, -2.51047853e-02,
        -1.46017186e-02,  1.26370899e-02, -2.07409617e-02,
         3.79076712e-02,  2.20538266e-02, -2.75284294e-02,
         3.22713517e-02,  2.55048312e-02,  1.43128671e-02,
         3.32710780e-02, -4.96148728e-02,  1.34183057e-02,
       

#### global sampling

In [145]:
def _get_global_context_features(x):
    """Embed the example's user id to form the global (context) observation.

    Args:
      x: dict of features; only the "user_id" entry is read.

    Returns:
      The output of `test_user_id_model` applied to `x["user_id"]`.
    """
    user_ids = x['user_id']
    global_features = test_user_id_model(user_ids)
    return global_features

In [146]:
test_globals = _get_global_context_features(data)

GLOBAL_DIM = test_globals.shape[1] 
print(f"GLOBAL_DIM: {GLOBAL_DIM}")

test_globals

GLOBAL_DIM: 32


<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[-0.02037551,  0.00986262, -0.01213836,  0.00730572, -0.01425978,
         0.03866805,  0.03549664, -0.00837642, -0.03165653, -0.01890605,
         0.00869458, -0.0490423 ,  0.03900332, -0.03679205, -0.01939126,
         0.01251271, -0.04736058, -0.01772742,  0.01514887,  0.04953028,
        -0.01093968,  0.04061886,  0.01655659, -0.0499462 , -0.01355903,
        -0.0124302 , -0.0064685 , -0.04333378, -0.00015298,  0.01025945,
        -0.03613396,  0.049801  ]], dtype=float32)>

# Ranking Agent

## Feedback type

Ranking agents assume either a `score_vector` or `cascading feedback` framework for the feedback signal (reward). 

* `score_vector`: feedback is a vector of scores for every item in the slots. 
* `cascading feedback`: if the kth item was clicked, then the items up to k-1 receive a score of -1, the kth item receives a score based on a feedback value, while the rest of the items receive feedback of 0. 

Ranking agent objective: train the scoring network to be able to estimate the above scores

In [77]:
# feedback_model = ranking_environment.FeedbackModel.CASCADING
feedback_model = ranking_environment.FeedbackModel.SCORE_VECTOR

# feedback_model

## Tensor Specs

example Tensor Spec structures...

`observation_spec()`

```python
{'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)}
```

`action_spec()`

```python
BoundedTensorSpec(shape=(3,), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(49, dtype=int32))
```

`reward_spec()`

```python
{'chosen_index': BoundedTensorSpec(shape=(), dtype=tf.int32, name='chosen_index', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32)),
 'chosen_value': TensorSpec(shape=(), dtype=tf.float32, name='chosen_value')}
```

`time_step_spec()`

```python
TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)},
 'reward': {'chosen_index': BoundedTensorSpec(shape=(), dtype=tf.int32, name='chosen_index', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32)),
            'chosen_value': TensorSpec(shape=(), dtype=tf.float32, name='chosen_value')},
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})
```

set vars

In [78]:
from tf_agents.specs import array_spec

print(f"BATCH_SIZE         : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE    : {EVAL_BATCH_SIZE}")
print(f"NUM_ITEMS          : {NUM_ITEMS}")
print(f"NUM_SLOTS          : {NUM_SLOTS}")
print(f"DISTANCE_THRESHOLD : {DISTANCE_THRESHOLD}")
print(f"GLOBAL_DIM         : {GLOBAL_DIM}")
print(f"PER_ARM_DIM        : {PER_ARM_DIM}")

BATCH_SIZE         : 5
EVAL_BATCH_SIZE    : 1
NUM_ITEMS          : 5
NUM_SLOTS          : 2
DISTANCE_THRESHOLD : 0.5
GLOBAL_DIM         : 32
PER_ARM_DIM        : 64


### Observation spec

In [147]:
observation_spec = {
    'global': tf.TensorSpec([GLOBAL_DIM], tf.float32),
    'per_arm': tf.TensorSpec([NUM_ITEMS, PER_ARM_DIM], tf.float32) #excluding action dim here
}
observation_spec

{'global': TensorSpec(shape=(32,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(5, 64), dtype=tf.float32, name=None)}

### Action spec

> Action spec for ranking models must have rank 1

In [148]:
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(NUM_SLOTS,),
    dtype=tf.int32,
    minimum=tf.constant(0),            
    maximum=NUM_ITEMS-1, # n degrees of freedom and will dictate the expected mean reward spec shape
    name="action_spec"
)

print(f"action_spec rank: {action_spec.shape.rank}")

action_spec

action_spec rank: 1


BoundedTensorSpec(shape=(2,), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(4, dtype=int32))

### Reward spec

In [149]:
# Build the reward spec that matches the configured feedback model.
if feedback_model == ranking_environment.FeedbackModel.CASCADING:
    # `chosen_index == num_slots` means no recommended item was clicked.
    reward_spec = {
        'chosen_index': array_spec.BoundedArraySpec(
            shape=[],
            minimum=0,
            maximum=NUM_SLOTS,
            dtype=np.int32,
            name='chosen_index',
        ),
        'chosen_value': array_spec.ArraySpec(
            shape=[], dtype=np.float32, name='chosen_value'
        ),
    }
elif feedback_model == ranking_environment.FeedbackModel.SCORE_VECTOR:
    # One score per recommendation slot.
    reward_spec = array_spec.ArraySpec(
        shape=[NUM_SLOTS], dtype=np.float32, name='score_vector'
    )
else:
    # Fail loudly: storing an error string in `reward_spec` would only surface
    # later as a confusing type error wherever the spec is consumed.
    raise NotImplementedError(f"Feedback model: {feedback_model}, not implemented")
    
reward_spec

ArraySpec(shape=(2,), dtype=dtype('float32'), name='score_vector')

### TimeStep spec

In [150]:
# Build the TimeStep spec from the observation spec only. `reward_spec` is not
# passed, so the reward entry of the resulting spec is the default scalar
# float32 spec rather than the `reward_spec` defined above.
# NOTE(review): with SCORE_VECTOR feedback the reward is presumably a per-slot
# vector of shape (NUM_SLOTS,) -- confirm whether the reward spec should be
# passed here (as a tensor spec, to match the tensor observation spec).
time_step_spec = ts.time_step_spec(
    observation_spec = observation_spec, 
    # reward_spec = reward_spec
)
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(32,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(5, 64), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

Inspect chosen arm features spec

In [151]:
time_step_spec.observation

{'global': TensorSpec(shape=(32,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(5, 64), dtype=tf.float32, name=None)}

## Policy and Scoring Network

> all ranking agents train a network that estimates scores of item/user pairs

**Ranking Policies**
* `DESCENDING_SCORES` - Stack rank deterministically by scores
* `NO_PENALTY` - Sampling sequentially based on scores; no penalty applied
* `COSINE_DISTANCE` - Sampling sequentially and taking diversity into account

`penalty_mixture` parameter governs the balance between ranking based on scores and accounting for diversity
* low positive value --> ranking has less diversity
* higher value --> enforces more diversity

`logits_temperature` - temperature parameter for non-deterministic policies
* This value must be positive

In [152]:
# Agent, network and ranking-policy selection.
AGENT_TYPE = "Ranking"
NETWORK_TYPE = "dotproduct"
POLICY_TYPE = ranking_agent.RankingPolicyType.COSINE_DISTANCE # COSINE_DISTANCE | NO_PENALTY | DESCENDING_SCORES

# Diversity/score balance and sampling temperature (see the notes above).
PENALTY_MIXTURE = 1.0
LOGITS_TEMPERATURE = 1.0

LEARNING_RATE = 0.005

# Layer widths of the scoring network towers, derived from the feature dims.
GLOBAL_LAYERS   = [GLOBAL_DIM, int(GLOBAL_DIM / 2)]
ARM_LAYERS      = [PER_ARM_DIM, int(PER_ARM_DIM / 2), int(PER_ARM_DIM / 4)]
COMMON_LAYERS   = [16, 8]

# Single dictionary of hyperparameters used when building the agent.
# NOTE(review): the key "num_itmes" looks like a typo for "num_items"; it is
# left unchanged here because code outside this cell may read it -- rename it
# together with every consumer.
HPARAMS = {  # TODO - streamline and consolidate
    "batch_size": BATCH_SIZE,
    "eval_batch_size" : EVAL_BATCH_SIZE,
    "num_itmes": NUM_ITEMS,
    "num_slots": NUM_SLOTS,
    "model_type": AGENT_TYPE,
    "network_type": NETWORK_TYPE,
    "global_layers": GLOBAL_LAYERS,
    "per_arm_layers": ARM_LAYERS,
    "common_layers": COMMON_LAYERS,
    "learning_rate": LEARNING_RATE,
    "policy_type": POLICY_TYPE,
    "feedback_model" : feedback_model,
    "penalty_mixture": PENALTY_MIXTURE,
    "logits_temperature": LOGITS_TEMPERATURE,
}
pprint(HPARAMS)

{'batch_size': 5,
 'common_layers': [16, 8],
 'eval_batch_size': 1,
 'feedback_model': 2,
 'global_layers': [32, 16],
 'learning_rate': 0.005,
 'logits_temperature': 1.0,
 'model_type': 'Ranking',
 'network_type': 'dotproduct',
 'num_itmes': 5,
 'num_slots': 2,
 'penalty_mixture': 1.0,
 'per_arm_layers': [64, 32, 16],
 'policy_type': <RankingPolicyType.COSINE_DISTANCE: 1>}


In [153]:
# Build the scoring network that estimates a score for each item given the
# global (user) features and the per-arm (item) features.
if NETWORK_TYPE == 'commontower':
    scoring_network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
        observation_spec = observation_spec, 
        global_layers = GLOBAL_LAYERS, 
        arm_layers = ARM_LAYERS, 
        common_layers = COMMON_LAYERS,
        # output_dim = output_dim,
    )
    
elif NETWORK_TYPE == 'dotproduct':
    scoring_network = global_and_arm_feature_network.create_feed_forward_dot_product_network(
        observation_spec = observation_spec, 
        global_layers = GLOBAL_LAYERS, 
        arm_layers = ARM_LAYERS
    )

else:
    # Without this branch an unknown or empty NETWORK_TYPE leaves
    # `scoring_network` undefined (or stale from an earlier run) and the failure
    # only shows up later as a NameError or a silently wrong network.
    raise ValueError(
        f"Unsupported NETWORK_TYPE: {NETWORK_TYPE!r}; expected 'commontower' or 'dotproduct'"
    )
    
print(f"Network: {scoring_network.name}")

Network: GlobalAndArmDotProductNetwork


## Define Agent 

In [154]:
# Instantiate the ranking agent from the specs, the scoring network and the
# hyperparameters defined above. The optimizer learning rate, ranking policy
# type, logits temperature and penalty mixture are all read from HPARAMS.
rank_agent = ranking_agent.RankingAgent(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    scoring_network=scoring_network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=HPARAMS['learning_rate']),
    feedback_model=feedback_model,
    policy_type=HPARAMS['policy_type'],
    logits_temperature=HPARAMS['logits_temperature'],
    penalty_mixture_coefficient=HPARAMS['penalty_mixture'],
    summarize_grads_and_vars=True
)

rank_agent

<tf_agents.bandits.agents.ranking_agent.RankingAgent at 0x7f101041b670>

In [155]:
rank_agent.action_spec

BoundedTensorSpec(shape=(2,), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(4, dtype=int32))

In [156]:
rank_agent.time_step_spec

_TupleWrapper(TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': DictWrapper({'global': TensorSpec(shape=(32,), dtype=tf.float32, name=None), 'per_arm': TensorSpec(shape=(5, 64), dtype=tf.float32, name=None)}),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))

In [157]:
rank_agent.training_data_spec

_TupleWrapper(Trajectory(
{'action': (),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'global': TensorSpec(shape=(32,), dtype=tf.float32, name=None), 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}),
 'policy_info': (),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))

### Reward function

* `_create_ranking_reward_features` - [src](https://source.corp.google.com/piper///depot/google3/learning/smartchoices/training/models/agent_factory_test.py?q=%22ranking_model_factory%22&sq=package:piper%20file:%2F%2Fdepot%2Fgoogle3%20-file:google3%2Fexperimental&start=1)

* `_get_rewards_from_arm_features` 

#### tmp - debugging

In [104]:
# test_arms

In [99]:
# arm_feature_values = [feature_values.numpy() for feature_name, feature_values in data.items()]

# # data.items()
# arm_feature_values

[array([[ 3,  0,  0,  0, 19]]),
 array([[b'94', b'245', b'403', b'50', b'470']], dtype=object),
 array([b'346'], dtype=object),
 array([[3., 4., 3., 5., 3.]], dtype=float32)]

In [158]:
def _get_rewards_from_arm_features(
    arm_features: Iterable[tf.Tensor],
) -> tf.Tensor:
    """Derives per-arm rewards by multiplying the arm features together.

    Each tensor in `arm_features` is cast to float32, and the casted tensors
    are combined with a product taken across the leading (feature) axis.

    Args:
      arm_features: 2-dimensional tensors [t_0, t_1, ..., t_{n-1}], where
        t_j[batch_id, i] is the jth feature of the ith arm.

    Returns:
      A 2-dimensional tensor `r` of size (batch_size, num_arms), where
      r[batch_id, i] is the reward for selecting the ith arm in batch
      `batch_id`.
    """
    float_features = [tf.cast(feature, tf.float32) for feature in arm_features]
    return tf.math.reduce_prod(float_features, axis=0)

In [162]:
test_arms_reward = _get_rewards_from_arm_features(test_arms)
test_arms_reward

<tf.Tensor: shape=(64,), dtype=float32, numpy=
array([-3.3672174e-09, -3.6273156e-09,  2.5876428e-09,  1.1760295e-08,
        7.5950766e-09,  9.6893793e-10,  1.4476101e-10,  1.9960902e-09,
        7.7965645e-10,  6.3142149e-09, -6.7853860e-09, -7.7916354e-11,
       -3.1730438e-10,  2.5806667e-08, -8.9416592e-11, -6.7893757e-09,
       -2.8820466e-09, -4.2740123e-08,  4.4476725e-09,  4.6120671e-10,
       -7.3157459e-08, -5.0551802e-10, -2.0620032e-08, -1.3601258e-09,
        3.9365773e-09, -4.5898596e-10, -3.4506231e-09,  9.4172015e-10,
       -1.6425222e-08,  5.8425154e-10, -2.3376734e-09, -9.4473513e-09,
       -4.6019428e-09,  3.5111154e-11, -5.8178675e-09, -9.4127095e-10,
       -5.9904676e-10, -2.1665225e-09, -6.5387127e-09,  5.8688819e-08,
        5.7501865e-09, -2.5891046e-08, -2.9939841e-08,  3.7000568e-08,
       -4.6793307e-09, -1.5175756e-09, -1.8572743e-08,  4.2282795e-09,
       -2.1070240e-10,  1.0791393e-08,  9.6277070e-08, -5.1364673e-09,
        1.4981792e-08,  4.6242

In [161]:
test_arms

# tf.math.reduce_prod(
#         [tf.cast(arm_feature, tf.float32) for arm_feature in arm_features], 0
#     )

<tf.Tensor: shape=(5, 64), dtype=float32, numpy=
array([[ 4.98176925e-02,  4.28270437e-02,  1.54910944e-02,
        -4.94843982e-02, -2.98609380e-02,  8.16265494e-03,
         3.23163904e-02, -1.76907070e-02, -2.73703225e-02,
        -2.78761983e-02, -8.61145183e-03,  2.37810276e-02,
         4.18272503e-02, -3.04570552e-02,  8.59882683e-03,
         3.04696672e-02, -1.34807937e-02,  4.67852838e-02,
        -1.81840882e-02,  3.22087444e-02, -3.49641554e-02,
         4.20926102e-02,  4.93626855e-02,  3.21062915e-02,
         1.10036619e-02, -2.24628579e-02,  1.69007666e-02,
        -4.72860709e-02, -3.34315449e-02, -2.40819696e-02,
        -3.73691432e-02, -2.64766570e-02, -2.06214190e-02,
         3.57703120e-03,  2.36400627e-02, -2.51047853e-02,
        -1.46017186e-02,  1.26370899e-02, -2.07409617e-02,
         3.79076712e-02,  2.20538266e-02, -2.75284294e-02,
         3.22713517e-02,  2.55048312e-02,  1.43128671e-02,
         3.32710780e-02, -4.96148728e-02,  1.34183057e-02,
       

In [160]:
# test_arms_reward = _get_rewards_from_arm_features(test_arms)
# test_arms_reward

In [None]:
def _get_ranking_rewards_sv(element):
    """Calculates reward for the actions.

    NOTE(review): work in progress. The body only computes
    `num_valid_indices` and has no `return`, so calling this yields `None`.
    `element` is not read yet, and `num_allowed_values` / `num_ranked_items`
    are not defined inside this function -- they must exist as globals or the
    call raises `NameError`. TODO: finish or remove.
    """
    
    # NUM_EXAMPLES_PER_LIST
    # Element-wise minimum of the two counts.
    num_valid_indices = tf.math.minimum(num_allowed_values, num_ranked_items)

In [None]:
# def _get_rewards(element):
#     """Calculates reward for the actions."""

#     def _calc_reward(x):
#         """Calculates reward for a single action."""
#         r0 = lambda: tf.constant(0.0)
#         r1 = lambda: tf.constant(1.0)
#         r2 = lambda: tf.constant(2.0)
#         r3 = lambda: tf.constant(3.0)
#         r4 = lambda: tf.constant(4.0)
#         r5 = lambda: tf.constant(5.0)
#         c1 = tf.equal(x, 1.0)
#         c2 = tf.equal(x, 2.0)
#         c3 = tf.equal(x, 3.0)
#         c4 = tf.equal(x, 4.0)
#         c5 = tf.equal(x, 5.0)
#         return tf.case(
#             [(c1, r1), (c2, r2), (c3, r3),(c4, r4),(c5, r5)], 
#             default=r0, exclusive=True
#         )

#     return tf.map_fn(
#         fn=_calc_reward, 
#         elems=element['user_rating'], 
#         dtype=tf.float32
#     )

In [None]:
_get_rewards_from_arm_features

In [228]:
def _rank_trajectory_fn(element): # hparams
    """Builds a single-step trajectory from a dataset element.

    The observation is assembled from the global-context and per-arm features
    of `element`, each passed through `train_utils._add_outer_dimension`. The
    `user_rating` entry of `element` is used as the reward; the action and the
    discount are zero tensors shaped like that reward.
    """
    context_feats = _get_global_context_features(element)
    item_feats = _get_per_arm_features(element)

    # Observation dict keyed by the bandit-spec feature keys.
    obs = {}
    obs[bandit_spec_utils.GLOBAL_FEATURE_KEY] = (
        train_utils._add_outer_dimension(context_feats)
    )
    obs[bandit_spec_utils.PER_ARM_FEATURE_KEY] = (
        train_utils._add_outer_dimension(item_feats)
    )

    ratings = element['user_rating']

    # Ranking policies do not use the `bandit_policy_type` info field, so only
    # a zero-filled `predicted_rewards_mean` placeholder is supplied.
    mean_shape = [HPARAMS['batch_size'], 1, HPARAMS['num_slots']]
    info = policy_utilities.PolicyInfo(
        predicted_rewards_mean=tf.zeros(mean_shape),
    )

    placeholder_action = tf.zeros_like(ratings, dtype=tf.int32)
    zero_discount = tf.zeros_like(ratings)

    return trajectory.single_step(
        observation=obs,
        action=placeholder_action,
        policy_info=info,
        reward=ratings,
        discount=zero_discount,
    )


In [230]:
for x in train_dataset.batch(HPARAMS['batch_size']).take(1):
    test_traj = _rank_trajectory_fn(x)
    
# test_traj

In [231]:
print(f"test_traj.action.shape: {test_traj.action.shape}") 

test_traj.action.shape: (5, 3)


In [232]:
print(f"test_traj.discount.shape: {test_traj.discount.shape}") 

test_traj.discount.shape: (5, 3)


In [233]:
print(f"test_traj.reward.shape: {test_traj.reward.shape}") 

test_traj.reward.shape: (5, 3)


In [234]:
print(f"test_traj.observation.shape: {test_traj.observation['global'].shape}") 

test_traj.observation.shape: (5, 1, 32)


In [235]:
print(f"test_traj.observation.shape: {test_traj.observation['per_arm'].shape}") 

test_traj.observation.shape: (3, 1, 64)


In [217]:
observation_spec

{'global': TensorSpec(shape=(32,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(3, 64), dtype=tf.float32, name=None)}

In [236]:
expected_num_actions = action_spec.maximum - action_spec.minimum + 1
print(f"expected_num_actions: {expected_num_actions}")

predicted_rewards_mean = tensor_spec.TensorSpec([expected_num_actions])
print(f"predicted_rewards_mean: {predicted_rewards_mean}")

expected_num_actions: 3
predicted_rewards_mean: TensorSpec(shape=(3,), dtype=tf.float32, name=None)


# Train Ranking Agent

In [237]:
EXPERIMENT_NAME   = f'local-ranker-{PREFIX}'

# new experiment
invoke_time       = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME          = f'run-{invoke_time}'

BASE_OUTPUT_DIR   = f'{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}'
LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
ROOT_DIR          = f"{BASE_OUTPUT_DIR}/root"       # Root directory for writing logs/summaries/checkpoints.
ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"  # Where the trained model will be saved and restored.

aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    experiment=EXPERIMENT_NAME
)

print(f"EXPERIMENT_NAME   : {EXPERIMENT_NAME}")
print(f"RUN_NAME          : {RUN_NAME}\n")
print(f"BASE_OUTPUT_DIR   : {BASE_OUTPUT_DIR}")
print(f"LOG_DIR           : {LOG_DIR}")
print(f"ROOT_DIR          : {ROOT_DIR}")
print(f"ARTIFACTS_DIR     : {ARTIFACTS_DIR}")

EXPERIMENT_NAME   : local-ranker-rec-bandits-v1
RUN_NAME          : run-20230928-082819

BASE_OUTPUT_DIR   : gs://rec-bandits-v1-hybrid-vertex-bucket/local-ranker-rec-bandits-v1/run-20230928-082819
LOG_DIR           : gs://rec-bandits-v1-hybrid-vertex-bucket/local-ranker-rec-bandits-v1/run-20230928-082819/logs
ROOT_DIR          : gs://rec-bandits-v1-hybrid-vertex-bucket/local-ranker-rec-bandits-v1/run-20230928-082819/root
ARTIFACTS_DIR     : gs://rec-bandits-v1-hybrid-vertex-bucket/local-ranker-rec-bandits-v1/run-20230928-082819/artifacts


In [256]:
import collections

from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import tf_policy
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import policy_saver
from tf_agents.metrics import export_utils
from tf_agents.metrics import tf_metrics
from tf_agents.eval import metric_utils
from tf_agents.utils import common

import tf_agents

print(f"tf_agents version: {tf_agents.__version__}")
print(f"tensorflow version: {tf.__version__}")

tf_agents version: 0.17.0
tensorflow version: 2.13.0


### logs & checkpoints

In [247]:
# Create (or fetch) the global step variable.
global_step = tf.compat.v1.train.get_or_create_global_step()

# Initialize the agent, then report the agent name and its policy name.
rank_agent.initialize()
print(f'agent: {rank_agent.name}')
# The second value is the policy's name, so label it as such.
print(f'policy: {rank_agent.policy.name}')

agent: ranking_agent
agent: penalize_cosine_distance_ranking_policy


In [252]:
# ====================================================
# TB summary writer
# ====================================================
# Writer for training summaries under `{LOG_DIR}/train`, flushed every 10s.
train_summary_writer = tf.compat.v2.summary.create_file_writer(
    f"{LOG_DIR}/train", flush_millis=10 * 1000
)
# train_summary_writer.set_as_default()

# eval_summary_writer = tf.compat.v2.summary.create_file_writer(
#     f"{LOG_DIR}/eval", flush_millis=10 * 1000
# )
# ====================================================
# metrics
# ====================================================
# `step_metric` records the number of individual rounds of bandit interaction;
# that is, (number of trajectories) * batch_size
step_metric = tf_metrics.EnvironmentSteps()

# The reward metric depends on the feedback model: a single average return
# for SCORE_VECTOR, otherwise a multi-metric built from the environment's
# reward spec.
if feedback_model == ranking_environment.FeedbackModel.SCORE_VECTOR:
    reward_metric = tf_metrics.AverageReturnMetric(
        batch_size=HPARAMS['batch_size'],
        buffer_size=20
    )
else:
    reward_metric = tf_metrics.AverageReturnMultiMetric(
        reward_spec=environment.reward_spec(),
        batch_size=HPARAMS['batch_size'],
        buffer_size=20
    )
    
metrics = [reward_metric]

# `pprint` on an already-formatted string prints the string's repr (quoted
# and wrapped across lines); plain `print` shows the message as intended.
print(f"metrics: {metrics}")

# ====================================================
# get checkpoint manager
# ====================================================
CHKPOINT_DIR = f"{ROOT_DIR}/chkpoint"
print(f"setting checkpoint_manager: {CHKPOINT_DIR}")

checkpoint_manager = train_utils.restore_and_get_checkpoint_manager(
    root_dir=CHKPOINT_DIR, 
    agent=rank_agent, 
    metrics=metrics, 
    step_metric=step_metric
)

('metrics: [<tf_agents.metrics.tf_metrics.AverageReturnMetric object at '
 '0x7fd85c29c340>]')
setting checkpoint_manager: gs://rec-bandits-v1-hybrid-vertex-bucket/local-ranker-rec-bandits-v1/run-20230928-082819/root/chkpoint


In [258]:
# # ====================================================
# # policy saver
# # ====================================================
# saver = policy_saver.PolicySaver(
#     policy = rank_agent.policy, 
#     # train_step=global_step
# )

In [None]:
policy = rank_agent.policy
isinstance(policy, tf_policy.TFPolicy)

## Train config

In [262]:
TRAIN_DATA_SIZE = 900          # len(list(train_dataset))
NUM_TRAIN_STEPS = 180            # TRAIN_DATA_SIZE // HPARAMS['batch_size']

EVAL_DATA_SIZE  = 900          # len(list(val_dataset))
NUM_EVAL_STEPS  = 100           # EVAL_DATA_SIZE // HPARAMS['eval_batch_size']

CHKPT_INTERVAL  = NUM_TRAIN_STEPS # // 5
LOG_INTERVAL    = 10
# EVAL_INTERVAL = NUM_TRAIN_STEPS // 2

print(f"TRAIN_DATA_SIZE : {TRAIN_DATA_SIZE}")
print(f"NUM_TRAIN_STEPS : {NUM_TRAIN_STEPS}")
print(f"EVAL_DATA_SIZE : {EVAL_DATA_SIZE}")
print(f"NUM_EVAL_STEPS : {NUM_EVAL_STEPS}")
print(f"CHKPT_INTERVAL: {CHKPT_INTERVAL}")
print(f"LOG_INTERVAL : {LOG_INTERVAL}")
# print(f"EVAL_INTERVAL : {EVAL_INTERVAL}")

TRAIN_DATA_SIZE : 900
NUM_TRAIN_STEPS : 180
EVAL_DATA_SIZE : 900
NUM_EVAL_STEPS : 100
CHKPT_INTERVAL: 180
LOG_INTERVAL : 10


In [263]:
# train data
train_ds_iterator = iter(train_dataset.batch(HPARAMS['batch_size']).repeat())

# eval dataset
eval_ds = val_dataset.batch(HPARAMS["eval_batch_size"])

if NUM_EVAL_STEPS > 0:
    eval_ds = eval_ds.take(NUM_EVAL_STEPS)

# eval_ds = eval_ds.prefetch(tf.data.AUTOTUNE)

eval_ds

<_TakeDataset element_spec={'movie_genres': TensorSpec(shape=(None, 3), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(None, 3), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)}>

### pre-training eval loop

In [264]:
from src.perarm_features import eval_perarm as eval_perarm

In [266]:
dummy_arm = tf.zeros([HPARAMS['eval_batch_size'], PER_ARM_DIM], dtype=tf.float32)
dummy_arm

<tf.Tensor: shape=(1, 64), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)>

In [None]:
serving_trajectory_fn(experience.observation)

In [277]:
# Smoke test: build one eval trajectory step from the first `eval_ds` batch
# only (the loop exits via `break` after a single iteration).
for x in eval_ds:

    # NOTE(review): `filter_mask` is assigned but never read in this cell.
    filter_mask = None
    # get feature tensors

    global_feat_infer = _get_global_context_features(x)
    arm_feat_infer = _get_per_arm_features(x)
    
    feature = {'global': global_feat_infer, 'per_arm': arm_feat_infer}

    rewards = x['user_rating']
    
    # reshape arm features
    # arm_feat_infer = tf.reshape(arm_feat_infer, [HPARAMS['eval_batch_size'], PER_ARM_DIM])
    # concat_arm = tf.concat([arm_feat_infer, dummy_arm], axis=0)
    
    # NOTE(review): the TypeError recorded in this cell's output (a 3-element
    # value where a scalar shape `()` was expected) appears to come from this
    # call: `rewards.numpy()[0]` is a length-3 rating vector, not a scalar.
    # TODO: confirm what `_get_eval_step` expects for its reward argument.
    trajectory_step = train_utils._get_eval_step(feature, rewards.numpy()[0])
    
    break

TypeError: Eager execution of tf.constant with unsupported shape. Tensor [3. 4. 3.] (converted from [3. 4. 3.]) has 3 elements, but got `shape` () with 1 elements).

In [None]:
trajectory_step

In [276]:
rewards.numpy()[0]

array([3., 4., 3.], dtype=float32)

In [271]:
feature

{'global': <tf.Tensor: shape=(1, 32), dtype=float32, numpy=
 array([[-0.00393863, -0.01478862, -0.01024078,  0.0088415 , -0.04419738,
          0.00215956,  0.04627681,  0.00932204, -0.03822122,  0.01907711,
          0.04298531,  0.03985386, -0.02062283,  0.04563275, -0.04217682,
         -0.00670524, -0.00452264,  0.00888044, -0.00630587, -0.04180532,
          0.02169652, -0.03667367, -0.04886571, -0.02383516, -0.02696766,
         -0.01847826,  0.04542513,  0.03135038,  0.03640084, -0.00571319,
         -0.03599992,  0.04219032]], dtype=float32)>,
 'per_arm': <tf.Tensor: shape=(3, 64), dtype=float32, numpy=
 array([[ 0.03050664,  0.03452467,  0.01929859, -0.03016851, -0.04490181,
          0.04495033, -0.01774796,  0.04843504,  0.01318759,  0.04657273,
          0.02429909, -0.02757381, -0.00995559,  0.01978866,  0.01136253,
          0.04364896,  0.04103352,  0.0455398 , -0.01450343,  0.00424021,
         -0.00881096, -0.0438289 ,  0.00205433,  0.04403717,  0.00738447,
          0

In [268]:
arm_feat_infer

<tf.Tensor: shape=(3, 64), dtype=float32, numpy=
array([[ 0.03050664,  0.03452467,  0.01929859, -0.03016851, -0.04490181,
         0.04495033, -0.01774796,  0.04843504,  0.01318759,  0.04657273,
         0.02429909, -0.02757381, -0.00995559,  0.01978866,  0.01136253,
         0.04364896,  0.04103352,  0.0455398 , -0.01450343,  0.00424021,
        -0.00881096, -0.0438289 ,  0.00205433,  0.04403717,  0.00738447,
         0.03068563, -0.00179168, -0.00264094,  0.03268982, -0.02675745,
        -0.03784177,  0.01306761,  0.04654622, -0.02054758,  0.04463777,
         0.04845568,  0.04217536,  0.02371256, -0.00654475,  0.04740763,
         0.01182047,  0.01892675,  0.02788329,  0.01762689, -0.02070785,
         0.04260923, -0.01979904,  0.03063042, -0.00118067,  0.00089129,
         0.00308929,  0.02104905, -0.03149561,  0.02938651, -0.04440167,
         0.04309526,  0.02933786, -0.03683791, -0.03504019, -0.01172578,
        -0.04470687,  0.02467886, -0.01176022,  0.03032222],
       [ 0.005

In [None]:
# ====================================================
# Evaluate the agent's policy once before training
# ====================================================
# Reset the train step
rank_agent.train_step_counter.assign(0)

pre_policy_tf = py_tf_eager_policy.PyTFEagerPolicy(rank_agent.policy, use_tf_function=True)

print(f"evaluating pre-trained Agent...")
start_time = time.time()

pre_val_loss, pre_preds, pre_tr_rewards = eval_perarm._run_bandit_eval(
    policy = pre_policy_tf,
    data = eval_ds,
    eval_batch_size = HPARAMS['eval_batch_size'],
    per_arm_dim = PER_ARM_DIM,
    global_dim = GLOBAL_DIM,
    vocab_dict = vocab_dict,
    num_oov_buckets = NUM_OOV_BUCKETS,
    global_emb_size = GLOBAL_EMBEDDING_SIZE,
    mv_emb_size = MV_EMBEDDING_SIZE,
)

runtime_mins = int((time.time() - start_time) / 60)
print(f"pre-train val_loss     : {pre_val_loss}")
print(f"pre-train eval runtime : {runtime_mins}")
# ====================================================
# train loop
# ====================================================
print(f"starting train loop...")
start_time = time.time()

### Ranking Task

In [None]:
task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

In [21]:
from src.perarm_features import emb_features as emb_features

embs = emb_features.EmbeddingModel(
    vocab_dict = vocab_dict,
    num_oov_buckets = NUM_OOV_BUCKETS,
    global_emb_size = GLOBAL_EMBEDDING_SIZE,
    mv_emb_size = MV_EMBEDDING_SIZE,
)

embs

<src.perarm_features.emb_features.EmbeddingModel at 0x7efbaa8d6fe0>

In [23]:
test_globals = embs._get_global_context_features(data)
 
GLOBAL_DIM = test_globals.shape[1]   

# test_globals = embs._get_global_context_features_v2(data)
# GLOBAL_DIM, _ = test_globals #.shape[1]  
# GLOBAL_DIM = GLOBAL_DIM.shape[1]

print(f"GLOBAL_DIM: {GLOBAL_DIM}")
test_globals

GLOBAL_DIM: 64


array([[ 0.02324566,  0.03847075,  0.0009382 ,  0.01037087, -0.0168501 ,
         0.02229491, -0.00090174, -0.03733226,  0.04767699, -0.04324825,
         0.03009412, -0.00325273,  0.00937269,  0.01162167,  0.04858048,
        -0.03228705, -0.00880662,  0.00480435,  0.0266616 , -0.01046257,
        -0.01173887, -0.04642478,  0.0118764 , -0.03422302,  0.0312058 ,
        -0.02052242,  0.00459113, -0.0279305 , -0.00947496, -0.00711267,
         0.04388683, -0.02854126, -0.02312779, -0.04988689,  0.03593904,
        -0.02224693, -0.01872089,  0.0384705 ,  0.0432454 ,  0.01695739,
        -0.02577996, -0.01301978, -0.01603407,  0.02314747, -0.01779956,
         0.03170073,  0.00098064, -0.0470944 , -0.02522329,  0.01904691,
         0.00721593,  0.0179752 ,  0.00104992,  0.00527094, -0.04577625,
        -0.02623909, -0.04071148, -0.01163318, -0.02864906, -0.00920088,
        -0.00450635, -0.0427685 ,  0.04297205,  0.04952253]],
      dtype=float32)

In [24]:
test_arms = embs._get_per_arm_features(data)

PER_ARM_DIM = test_arms.shape[1]            
# shape checks out at batch_dim, nactions, arm feats
print(f"PER_ARM_DIM: {PER_ARM_DIM}")

test_arms

PER_ARM_DIM: 64


array([[ 0.02343505,  0.0119458 ,  0.02910166, -0.03252503,  0.00803228,
         0.03554758,  0.01453496, -0.01542882, -0.01640511, -0.03279047,
         0.01140917, -0.03903098,  0.03479561,  0.0006261 ,  0.01132431,
         0.0158616 , -0.0487823 ,  0.04276118,  0.01437578,  0.02662202,
         0.01141692,  0.03396836,  0.04457219,  0.01392839, -0.04691635,
         0.01952115,  0.00976838,  0.04489179,  0.04724428, -0.02655888,
         0.04153249, -0.00589923,  0.01623068,  0.00170443, -0.00777126,
        -0.0101654 ,  0.01170176, -0.01047592, -0.04902004,  0.01853789,
        -0.04450876,  0.0009967 , -0.01994429,  0.02006539, -0.02091893,
         0.01542037, -0.01819051, -0.03646524, -0.02382381,  0.01859686,
         0.03186891,  0.01153149, -0.03137907, -0.01640514,  0.01935514,
        -0.03642275, -0.02170839, -0.04142233,  0.01269498, -0.0153484 ,
        -0.01838543, -0.00584408, -0.01540409,  0.03383145]],
      dtype=float32)

### Create a movie lookup table

In [25]:
# Column-oriented accumulator: one list per column, filled batch by batch.
movie_lookup_table = {
    'id': [],
    'movie_features': [],
    # 'movie_title': [],
    'movie_genres': []
}
    
# Walk the training data in batches of 1000 elements.
iterator = iter(train_dataset.batch(1000))

for data in iterator:
    # Compute the per-arm features once per batch and reuse the result
    # (previously the same call was made twice and the first result dropped).
    arm_features = embs._get_per_arm_features(data)
    movie_lookup_table['id'].extend(data['movie_id'].numpy())
    # movie_lookup_table['movie_title'].extend(data['movie_title'].numpy())
    movie_lookup_table['movie_genres'].extend(data['movie_genres'].numpy())
    movie_lookup_table['movie_features'].extend(arm_features)
    
#fix string ids to integers for random lookup later
movie_lookup_table['id'] = [int(x) for x in movie_lookup_table['id']]

In [26]:
# Turn the column lists into a DataFrame. `id` stays a regular column:
# the earlier `set_index(['id'])` call discarded its result (it is not
# in-place by default), so it had no effect and has been dropped.
movie_lookup_table = pd.DataFrame(movie_lookup_table)

# Keep the first row seen for each movie id.
unique_table = movie_lookup_table.groupby(['id'])[
    [
        'movie_features', 
        # 'movie_title', 
        'movie_genres'
    ]
].first().reset_index() #resetting index to get consecutive counts from min-max (no gaps)
# unique_table = unique_table['movie_features']

# Inclusive bounds of the valid row positions in `unique_table`.
MAX_ARM_ID = len(unique_table)-1
MIN_ARM_ID = 0

# unique_table
# print(f"Max movie id is: {MAX_ARM_ID} \nMin movie id is: {MIN_ARM_ID}")

In [27]:
unique_table.iloc[2,:]['movie_features'] # example: fetch the features of the movie at row position 2

array([ 0.02495834, -0.01222584,  0.02913784,  0.00382224, -0.00345637,
       -0.02219919,  0.03300631,  0.02205736, -0.03309649, -0.02158695,
        0.04930801, -0.00045602, -0.00740384,  0.04815162,  0.00566232,
       -0.01206527, -0.04011983,  0.00283178,  0.03639659, -0.04393387,
        0.02940616,  0.02252943, -0.04776486, -0.00800429,  0.02306003,
        0.03619469, -0.02831327, -0.01686976,  0.04269502, -0.02647526,
        0.00538781,  0.04991165,  0.02267591,  0.01639385, -0.02517967,
        0.01744567, -0.03633409, -0.01866313,  0.01826422,  0.04594373,
        0.01793351, -0.04969008,  0.03802439, -0.00140975,  0.02682305,
       -0.0081509 , -0.0046325 ,  0.01475661,  0.03066139, -0.0255517 ,
        0.00646546,  0.00037592, -0.0198789 ,  0.0383074 , -0.04167994,
       -0.00177239, -0.01200641, -0.00897124,  0.03143324, -0.03409   ,
       -0.01962967, -0.0414306 , -0.04632906,  0.00900277], dtype=float32)

In [28]:
def get_random_arm_features(movie_id):
    """Looks up one movie in `unique_table` by row position.

    Args:
      movie_id: positional row index into `unique_table` (used with `.iloc`).

    Returns:
      A 2-tuple: the movie's `movie_features` as a float32 tensor reshaped to
      `[1, num_features]`, and a one-element list holding its `movie_genres`.
    """
    row = unique_table.iloc[movie_id]
    feature_vec = tf.constant(row['movie_features'], dtype=tf.float32)
    feature_row = tf.reshape(feature_vec, [1, feature_vec.shape[0]])
    genres = [row['movie_genres']]
    return feature_row, genres

get_random_arm_features(222)

(<tf.Tensor: shape=(1, 64), dtype=float32, numpy=
 array([[ 0.02758646, -0.02364726,  0.00326383, -0.00670724, -0.02581365,
          0.02811625, -0.02409014,  0.02277763,  0.03388413,  0.01355955,
         -0.04816724, -0.04114227,  0.01485746,  0.01675477,  0.02701057,
         -0.018884  ,  0.00393082, -0.01786947,  0.01814145,  0.00791826,
         -0.03671928,  0.02370023, -0.03795796,  0.00273353,  0.01422632,
          0.00815446,  0.03875173, -0.00820974,  0.03643649, -0.04565177,
         -0.01958307, -0.00754724,  0.01623068,  0.00170443, -0.00777126,
         -0.0101654 ,  0.01170176, -0.01047592, -0.04902004,  0.01853789,
         -0.04450876,  0.0009967 , -0.01994429,  0.02006539, -0.02091893,
          0.01542037, -0.01819051, -0.03646524, -0.02382381,  0.01859686,
          0.03186891,  0.01153149, -0.03137907, -0.01640514,  0.01935514,
         -0.03642275, -0.02170839, -0.04142233,  0.01269498, -0.0153484 ,
         -0.01838543, -0.00584408, -0.01540409,  0.03383145]],

In [29]:
def get_random_set_of_arm_features(n_actions):
    """Samples `n_actions` arms at random and returns their features.

    Row positions are drawn independently, so the same arm may appear more
    than once in the result.

    Args:
      n_actions: number of arms to sample.

    Returns:
      A 2-tuple: the sampled arms' feature rows concatenated along axis 0,
      and a list with the second element returned by
      `get_random_arm_features` for each sampled arm.
    """
    # `np.random.randint` excludes its upper bound, so add 1 to keep
    # `MAX_ARM_ID` itself selectable (it was previously never drawn).
    random_arm_ids = np.random.randint(MIN_ARM_ID, MAX_ARM_ID + 1, n_actions)
    sampled = [get_random_arm_features(arm_id) for arm_id in random_arm_ids]
    just_features = [features for features, _ in sampled]
    movie_info = [info for _, info in sampled]
    return tf.concat(just_features, axis=0), movie_info

In [30]:
#NEW - there's a tuple returned with the movies we will use for PALM!
get_random_set_of_arm_features(n_actions=2)[0]

<tf.Tensor: shape=(2, 64), dtype=float32, numpy=
array([[ 0.03378742,  0.01706399,  0.03097352, -0.03447683,  0.00684074,
        -0.02467006, -0.03587176, -0.00194854,  0.00817044,  0.00660542,
         0.02049113,  0.02787698, -0.01884164,  0.0250637 , -0.02671324,
         0.00575078,  0.00014221,  0.02990893, -0.01085119, -0.03606417,
         0.02069755, -0.02867526,  0.03644859,  0.02053731, -0.03130388,
         0.02352465,  0.04758057, -0.02958646, -0.02110904, -0.02467135,
        -0.04586595,  0.00947623,  0.03001981,  0.02514908,  0.03943818,
         0.03974951, -0.02048717,  0.00589327, -0.01928915, -0.01875315,
         0.03245391,  0.02953072, -0.01104504, -0.04738101, -0.01941533,
        -0.03232567, -0.04955465,  0.04904519, -0.01213044,  0.01257041,
         0.0311668 , -0.03899008, -0.0284349 ,  0.02622536,  0.02824564,
         0.00210474,  0.03348925, -0.00880535,  0.04751078,  0.03034684,
         0.02982635, -0.00188724, -0.0043234 , -0.0364937 ],
       [ 0.001

In [31]:
# ### Look at the raw input features to format a good prompt for ranking movies
# NUM_ACTIONS = 5
# batch_size = 8
# iterator = iter(train_dataset.batch(batch_size))
# data = next(iterator)

# _, user_info = embs._get_global_context_features_v2(data)
# ###NEW - we are getting the arm features here
# _, movie_info = get_random_set_of_arm_features(n_actions=NUM_ACTIONS)

# print(user_info, movie_info)

In [32]:
from datetime import datetime, timezone

# Render an epoch-seconds timestamp as a UTC calendar date.
# `datetime.utcfromtimestamp` is deprecated and returns a naive datetime;
# `fromtimestamp(..., tz=timezone.utc)` gives the same wall-clock value as a
# timezone-aware object.
dt = datetime.fromtimestamp(885409515, tz=timezone.utc)
dt.ctime()

'Wed Jan 21 19:05:15 1998'

In [33]:
# movie_info

## Ranking Bandit

In [32]:
BATCH_SIZE      = 128
EVAL_BATCH_SIZE = 1

NUM_ACTIONS = 2
NUM_ITEMS  = 50
NUM_SLOTS  = 3 

print(f"BATCH_SIZE      : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE : {EVAL_BATCH_SIZE}")
print(f"NUM_ACTIONS     : {NUM_ACTIONS}")
print(f"GLOBAL_DIM      : {GLOBAL_DIM}")
print(f"PER_ARM_DIM     : {PER_ARM_DIM}")

BATCH_SIZE      : 128
EVAL_BATCH_SIZE : 1
NUM_ACTIONS     : 2
GLOBAL_DIM      : 64
PER_ARM_DIM     : 64


## specs

### Observation Spec

**example config:**
```
global_dim = 9  #@param{ type: "integer"}
item_dim   = 11  #@param{ type: "integer"}
num_items  = 50 #@param{ type: "integer"}
num_slots  = 3  #@param{ type: "integer"}
```
**example obs spec:**

```
{'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)}
 ```

In [33]:
# Observation spec: a `global` vector of length GLOBAL_DIM plus a `per_arm`
# matrix of shape [NUM_ACTIONS, PER_ARM_DIM], both float32.
observation_spec = {
    'global': tf.TensorSpec([GLOBAL_DIM], tf.float32),
    'per_arm': tf.TensorSpec([NUM_ACTIONS, PER_ARM_DIM], tf.float32) # NOTE(review): the arm (action) dim IS part of this shape; presumably only the batch dim is excluded -- confirm
}
observation_spec

{'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}

### Action Spec

```
BoundedTensorSpec(shape=(3,), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(49, dtype=int32))

```

In [34]:
action_spec = tensor_spec.BoundedTensorSpec(
    shape=[], 
    dtype=tf.int32,
    minimum=tf.constant(0),            
    maximum=NUM_SLOTS-1, # n degrees of freedom and will dictate the expected mean reward spec shape
    name="action_spec"
)

action_spec

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(2, dtype=int32))

### TimeStep Spec

```
TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)},
 'reward': {'chosen_index': BoundedTensorSpec(shape=(), dtype=tf.int32, name='chosen_index', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32)),
            'chosen_value': TensorSpec(shape=(), dtype=tf.float32, name='chosen_value')},
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})
 ```

In [35]:
time_step_spec = ts.time_step_spec(observation_spec)#, reward_spec=tf.TensorSpec([1, NUM_ACTIONS]))
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

In [None]:
# NOTE(review): this binds the `FeedbackModel` class itself, not one of its
# members. The metrics cell compares `feedback_model` against
# `ranking_environment.FeedbackModel.SCORE_VECTOR`, which would not match
# this value -- confirm which member is intended here.
feedback_model = ranking_environment.FeedbackModel

In [None]:
from src.perarm_features import ranking_bandit_policy
# policy = ranking_bandit_policy.GenLinearRankingBanditPolicy(

In [None]:
from src.perarm_features import agent_factory as agent_factory
ranking_bandit_agent = agent_factory.GenLinearRankingBanditAgent(XXX)