# Contextual Bandits for Ranking with TF-Agents

> see [ranking tutorial](https://www.tensorflow.org/agents/tutorials/ranking_tutorial)

### Overview

* The contextual bandits approach is classified as an extension of multi-armed bandits
* a contextual multi-armed banded problem is a simplified reinforcement learning algorithm where the agent takes an action from a set of possible actions 

> **TODO**

The **Bandit Ranking** agent will be similar to the `NeuralEpsilonGreedy` agent. Main differences:

* The item features are stored in the `per_arm` part of the observation, in the order of how they are recommended
* Since this ordered list of items expresses what action was taken by the policy,
the `action` value of the trajectory is not used by the agent.

> Note: difference between the "per-arm" observation recieved by the policy vs the agent:

While the agent receives the items in the recommendation slots, the policy receives the items that are available for recommendation. The user is responsible for converting the observation to the
syntax required by the agent.


The training observation contains the global features and the features of the items in the recommendation slots 
* The item features are stored in the `per_arm` part of the observation, in the order of how they are recommended
* Note: since this ordered list of items expresses what action was taken by the policy, the action value of the trajectory is not used by the agent

## Load notebook config

* use the prefix defined in `00-env-setup`

In [1]:
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

# PREFIX = 'mabv1'

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid_vertex.movielens_ds_rec_bandits_v2"
BIGQUERY_TABLE_ID        = "hybrid_vertex.movielens_ds_rec_bandits_v2.training_dataset"

REPO

## imports

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [4]:
!pwd

/home/jupyter/tf_vertex_agents/03-ranking


In [5]:
import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar, Iterable
from datetime import datetime
import time
from pprint import pprint
import pickle as pkl
import pandas as pd

from tqdm import tqdm

# logging
import logging
logging.disable(logging.WARNING)

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform, storage

# tensorflow
import tensorflow as tf
# from tf_agents.agents import TFAgent

# from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.environments.ranking_environment import FeedbackModel
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
# from tf_agents.drivers import dynamic_step_driver
# from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# from tf_agents.bandits.agents import lin_ucb_agent
# from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents import neural_linucb_agent
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities

from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.trajectories import trajectory
from tf_agents.bandits.environments import ranking_environment
from tf_agents.bandits.agents import ranking_agent

from tf_agents.utils import nest_utils
from tf_agents.specs import array_spec

# GPU
from numba import cuda 
import gc

import sys
sys.path.append("..")

# this repo
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config
from src.per_arm_rl import train_utils as train_utils

# tf exceptions and vars
if tf.__version__[0] != "2":
    raise Exception("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [8]:
device = cuda.get_current_device()
device.reset()
gc.collect()

14

In [9]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Data prep

In [10]:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [11]:
NUM_EXAMPLES_PER_LIST = 3 # 3 | 5

# SPLIT = "val"
# SPLIT = "listwise-val"
SPLIT = f"listwise-{NUM_EXAMPLES_PER_LIST}n-val"

print(f"SPLIT: {SPLIT}")

SPLIT: listwise-3n-val


In [12]:
! gsutil ls $BUCKET_URI/$DATA_GCS_PREFIX/$SPLIT/

gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-val/ml-100k-listwise-3n-val.tfrecord


### Val

In [13]:
SPLIT = "val"
# SPLIT = "listwise-val"
SPLIT = f"listwise-{NUM_EXAMPLES_PER_LIST}n-val"

val_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/{SPLIT}'):
    if '.tfrecord' in blob.name:
        val_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
val_dataset = tf.data.TFRecordDataset(val_files)
val_dataset = val_dataset.map(data_utils.parse_lw_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)

for example in val_dataset.take(1):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([3, 0, 0])>,
 'movie_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'94', b'245', b'403'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'346'>,
 'user_rating': <tf.Tensor: shape=(3,), dtype=float32, numpy=array([3., 4., 3.], dtype=float32)>}


### Train

In [14]:
# SPLIT = "train"
# SPLIT = "listwise-train"
# SPLIT = f"listwise-{NUM_EXAMPLES_PER_LIST}n-train"

# train_files = []
# for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/{SPLIT}'):
#     if '.tfrecord' in blob.name:
#         train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
# train_dataset = tf.data.TFRecordDataset(train_files)
# train_dataset = train_dataset.map(data_utils.parse_lw_tfrecord)

# for x in train_dataset.batch(1).take(1):
#     pprint(x)

In [15]:
train_dataset = val_dataset

### Vocab

In [16]:
GENERATE_VOCABS = False

print(f"GENERATE_VOCABS: {GENERATE_VOCABS}")

GENERATE_VOCABS: False


In [17]:
EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{VOCAB_SUBDIR}/{VOCAB_FILENAME}'

!gsutil cp gs://mabv1-hybrid-vertex-bucket/vocabs/vocab_dict.pkl $EXISTING_VOCAB_FILE

Copying gs://mabv1-hybrid-vertex-bucket/vocabs/vocab_dict.pkl [Content-Type=application/octet-stream]...
/ [1 files][142.9 KiB/142.9 KiB]                                                
Operation completed over 1 objects/142.9 KiB.                                    


In [18]:
data_utils.download_blob(
    project_id = PROJECT_ID,
    bucket_name = BUCKET_NAME, 
    source_blob_name = f"{VOCAB_SUBDIR}/{VOCAB_FILENAME}", 
    destination_file_name= VOCAB_FILENAME
)

filehandler = open(VOCAB_FILENAME, 'rb')
vocab_dict = pkl.load(filehandler)
filehandler.close()

for key in vocab_dict.keys():
    pprint(key)

Downloaded storage object vocabs/vocab_dict.pkl from bucket rec-bandits-v2-hybrid-vertex-bucket to local file vocab_dict.pkl.
'movie_id'
'user_id'
'user_occupation_text'
'movie_genres'
'bucketized_user_age'
'max_timestamp'
'min_timestamp'
'timestamp_buckets'


# Global & Per-Arm feature embedding models 

In [19]:
NUM_OOV_BUCKETS       = 2
GLOBAL_EMBEDDING_SIZE = 64 #64
MV_EMBEDDING_SIZE     = 32 #32

BATCH_SIZE            = 5 #128
EVAL_BATCH_SIZE       = 1

NUM_ITEMS             = NUM_EXAMPLES_PER_LIST # 3 | 5 
NUM_SLOTS             = 2

DISTANCE_THRESHOLD    = 0.5

print(f"NUM_OOV_BUCKETS    : {NUM_OOV_BUCKETS}")
print(f"GLOBAL_EMB_SIZE    : {GLOBAL_EMBEDDING_SIZE}")
print(f"MV_EMB_SIZE        : {MV_EMBEDDING_SIZE}")
print(f"BATCH_SIZE         : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE    : {EVAL_BATCH_SIZE}")
print(f"NUM_ITEMS          : {NUM_ITEMS}")
print(f"NUM_SLOTS          : {NUM_SLOTS}")
print(f"DISTANCE_THRESHOLD : {DISTANCE_THRESHOLD}")

NUM_OOV_BUCKETS    : 2
GLOBAL_EMB_SIZE    : 64
MV_EMB_SIZE        : 32
BATCH_SIZE         : 5
EVAL_BATCH_SIZE    : 1
NUM_ITEMS          : 3
NUM_SLOTS          : 2
DISTANCE_THRESHOLD : 0.5


In [20]:
for i in range(1):
    
    iterator = iter(train_dataset.batch(2))
    data = next(iterator)

data

{'movie_genres': <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
 array([[3, 0, 0],
        [7, 0, 0]])>,
 'movie_id': <tf.Tensor: shape=(2, 3), dtype=string, numpy=
 array([[b'94', b'245', b'403'],
        [b'678', b'127', b'343']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'346', b'602'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(2, 3), dtype=float32, numpy=
 array([[3., 4., 3.],
        [4., 5., 2.]], dtype=float32)>}

## Embedding layers

#### User ID

In [21]:
user_id_input_layer = tf.keras.Input(
    name="user_id",
    shape=(1,),
    dtype=tf.string
)

user_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['user_id'],
)(user_id_input_layer)

user_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['user_id']) + NUM_OOV_BUCKETS,
    output_dim=GLOBAL_EMBEDDING_SIZE
)(user_id_lookup)

user_id_embedding = tf.reduce_sum(user_id_embedding, axis=-2)

In [22]:
test_user_id_model = tf.keras.Model(inputs=user_id_input_layer, outputs=user_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["user_id"])
    print(test_user_id_model(x["user_id"]))

tf.Tensor([b'346'], shape=(1,), dtype=string)
tf.Tensor(
[[ 0.04370818 -0.01135105  0.0266356   0.00445057  0.01794897 -0.01123155
   0.01820037 -0.01236316 -0.04714687  0.01180333  0.03817843  0.00682211
   0.04256009 -0.0283603  -0.00242324  0.00948932  0.01037207 -0.00822685
   0.04582613 -0.01500129 -0.03047838  0.04197461  0.03291135  0.04426822
   0.02072633 -0.00618589 -0.04319811  0.0090512  -0.04558665  0.02924452
  -0.01990993  0.04883667  0.00903035  0.0130672   0.0298072  -0.00774354
   0.03854993  0.03709947  0.00772595  0.04914529  0.02790226 -0.03761049
   0.02432003 -0.00191277 -0.01236521 -0.04426353 -0.03791174 -0.04519391
  -0.00229297 -0.01184074 -0.04209878 -0.01481508 -0.03921328 -0.04272251
   0.01458052  0.03762467  0.03184244 -0.0199193  -0.03657435  0.02252989
   0.01776868 -0.0383962  -0.03479626  0.04221696]], shape=(1, 64), dtype=float32)


#### Movie ID

In [23]:
# len(vocab_dict['movie_id'])

In [24]:
mv_id_input_layer = tf.keras.Input(
    name="movie_id",
    shape=(1,),
    dtype=tf.string
)

mv_id_lookup = tf.keras.layers.StringLookup(
    max_tokens=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    num_oov_indices=NUM_OOV_BUCKETS,
    mask_token=None,
    vocabulary=vocab_dict['movie_id'],
)(mv_id_input_layer)

mv_id_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_id']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_id_lookup)

# mv_id_embedding = tf.reduce_sum(mv_id_embedding, axis=-2)

In [25]:
test_mv_id_model = tf.keras.Model(inputs=mv_id_input_layer, outputs=mv_id_embedding)

for x in train_dataset.batch(1).take(1):
    print(x["movie_id"])
    list_length = x["movie_id"].shape[1]
    print(test_mv_id_model(x["movie_id"]))

tf.Tensor([[b'94' b'245' b'403']], shape=(1, 3), dtype=string)
tf.Tensor(
[[[ 1.26819871e-02  1.47836246e-02  1.43100135e-02 -1.93941947e-02
    7.98638910e-03  3.71932723e-02  7.65300915e-03  2.07233690e-02
    4.74291481e-02  2.48486884e-02 -3.65972407e-02  1.32403634e-02
   -3.67226116e-02  1.88477673e-02 -2.69936454e-02 -2.82921642e-03
   -1.29413232e-02 -8.74693319e-03 -2.30226871e-02 -3.13136131e-02
    4.13347743e-02 -1.94737315e-02 -3.58124264e-02  6.30936772e-03
    6.69068098e-03 -1.13114938e-02  1.99737214e-02  1.63485445e-02
   -2.07304601e-02  3.55627425e-02 -3.32656391e-02 -1.68435574e-02]
  [ 3.02490108e-02  4.40416485e-03  2.69909836e-02  2.99401172e-02
   -6.63027167e-05  2.27944739e-02 -3.80651578e-02  1.48499273e-02
    1.49717443e-02 -4.35329564e-02  1.66729204e-02  4.75808717e-02
    2.64542177e-03  2.24997886e-02  2.01900862e-02 -2.46952064e-02
   -4.22429703e-02  2.19405405e-02 -4.79789376e-02  4.59348001e-02
    3.43324058e-02  9.05132294e-03  4.80602272e-02  9.

In [26]:
# list_length

In [27]:
# for x in train_dataset.batch(1).take(1):
#     print(x["movie_id"])
#     single_sample = x["movie_id"][0]
#     item_1 = tf.gather(x["movie_id"][0], 0)
#     print(item_1)

In [28]:
# single_sample[0]

#### Movie Genres

In [29]:
# len(vocab_dict['movie_genres'])

In [30]:
mv_genre_input_layer = tf.keras.Input(
    name="movie_genres",
    shape=(1,),
    dtype=tf.float32
)

mv_genre_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=vocab_dict['movie_genres'],
    num_oov_indices=NUM_OOV_BUCKETS,
    oov_value=0,
)(mv_genre_input_layer)

mv_genre_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=len(vocab_dict['movie_genres']) + NUM_OOV_BUCKETS,
    output_dim=MV_EMBEDDING_SIZE
)(mv_genre_lookup)

# mv_genre_embedding = tf.reduce_sum(mv_genre_embedding, axis=-2)

In [31]:
test_mv_gen_model = tf.keras.Model(inputs=mv_genre_input_layer, outputs=mv_genre_embedding)

for x in train_dataset.batch(2).take(1):
    print(x["movie_genres"])
    print(test_mv_gen_model(x["movie_genres"]))

tf.Tensor(
[[3 0 0]
 [7 0 0]], shape=(2, 3), dtype=int64)
tf.Tensor(
[[[-0.00905252  0.01898391 -0.00024579 -0.020698    0.01857164
    0.00074836  0.01230341 -0.02427216  0.03277064 -0.02594687
   -0.03635303 -0.03331405  0.03208944  0.03972979  0.03509669
   -0.00665893  0.0238544   0.0359375   0.04644355 -0.04636688
   -0.01557895  0.02767905  0.0390648   0.01242439 -0.03174337
    0.02715689 -0.0487331  -0.0014302  -0.03177358  0.00691553
   -0.0137447   0.01497729]
  [ 0.02349536  0.04710582  0.02036366 -0.04065933 -0.01117162
    0.00973912  0.00879743 -0.04954852  0.03227177  0.0417361
    0.04533937 -0.04180925  0.00312942 -0.02556071 -0.01017413
   -0.04456032 -0.02658897  0.01132615  0.04729686 -0.04301446
    0.0021451  -0.04593721 -0.03231512  0.03024571 -0.0459285
   -0.0285256   0.04047512 -0.02652644  0.04642197 -0.00502851
   -0.01014877  0.0341229 ]
  [ 0.02349536  0.04710582  0.02036366 -0.04065933 -0.01117162
    0.00973912  0.00879743 -0.04954852  0.03227177  0.0417

In [32]:
# for x in train_dataset.batch(1).take(1):
#     print(x["movie_genres"])
#     mv_gen_sample = x["movie_genres"]
#     item_gen = tf.gather(x["movie_genres"], 0)
#     print(item_gen)

In [33]:
# for x in train_dataset.batch(2).take(1):
#     print(x["user_rating"])
#     # mv_gen_sample = x["movie_genres"]
#     # item_gen = tf.gather(x["movie_genres"], 0)
#     # print(item_gen)

## Sampling Functions

#### item sampling

In [34]:
# data

In [35]:
# for x in train_dataset.batch(2).take(1):
#     ratings_list = x["user_rating"] #[0]
#     indices = tf.argsort(ratings_list, direction="DESCENDING")
    
#     mv_ids = test_mv_id_model(x["movie_id"])
#     # mv_ids_sliced = tf.slice(mv_ids, begin=[0,0,0], size=[HPARAMS['num_slots']])
    
#     mv_gens = test_mv_gen_model(x["movie_genres"])
#     # mv_gens_sliced = tf.slice(mv_gens, begin=[0,0,0], size=[HPARAMS['num_slots']])
    
# #     concat_embeddings = tf.concat(
# #         [mv_ids, mv_gens], axis=-1
# #     )
    
# #     ordered_concat = tf.gather(concat_embeddings, indices, batch_dims=1)
    
# #     stacked_mvs = tf.stack(concat_embeddings, axis = 1)

In [36]:
# len(ratings_list)

In [37]:
def _get_per_arm_features(x):
    """
    This function generates a single per-arm observation vector.
    """
    ratings_list = x["user_rating"] #[0]
    indices = tf.argsort(ratings_list, direction="DESCENDING")
    _batch_size = len(ratings_list)
    
    mv_ids = test_mv_id_model(x["movie_id"])
    mv_gens = test_mv_gen_model(x["movie_genres"])
    
    concat_embeddings = tf.concat(
        [mv_ids, mv_gens], axis=-1
    )
    
    ordered_concat = tf.gather(concat_embeddings, indices, batch_dims=1)
    # feedback = tf.gather_nd(concat_embeddings, indices)
    # print(f"ordered_concat_embeddings: {ordered_concat_embeddings}")
    # ordered_concat = tf.reduce_sum(ordered_concat, axis=0)
    
    slotted_ordered_concat = tf.slice(
        ordered_concat, begin=[0, 0, 0], size=[_batch_size, NUM_SLOTS, MV_EMBEDDING_SIZE * NUM_SLOTS]
    )
    
    return slotted_ordered_concat
    # return slotted_ordered_concat

In [38]:
test_arms = _get_per_arm_features(data)
test_arms #.shape

<tf.Tensor: shape=(2, 2, 64), dtype=float32, numpy=
array([[[ 3.02490108e-02,  4.40416485e-03,  2.69909836e-02,
          2.99401172e-02, -6.63027167e-05,  2.27944739e-02,
         -3.80651578e-02,  1.48499273e-02,  1.49717443e-02,
         -4.35329564e-02,  1.66729204e-02,  4.75808717e-02,
          2.64542177e-03,  2.24997886e-02,  2.01900862e-02,
         -2.46952064e-02, -4.22429703e-02,  2.19405405e-02,
         -4.79789376e-02,  4.59348001e-02,  3.43324058e-02,
          9.05132294e-03,  4.80602272e-02,  9.03313234e-03,
          5.63108921e-03, -3.80748734e-02,  2.88737305e-02,
          2.72508375e-02, -4.00614515e-02, -4.62130085e-02,
          9.86828655e-03,  8.28745216e-03,  2.34953649e-02,
          4.71058227e-02,  2.03636624e-02, -4.06593308e-02,
         -1.11716166e-02,  9.73912328e-03,  8.79742950e-03,
         -4.95485179e-02,  3.22717689e-02,  4.17360999e-02,
          4.53393720e-02, -4.18092497e-02,  3.12942266e-03,
         -2.55607124e-02, -1.01741329e-02, -4.45

In [39]:
test_arms = _get_per_arm_features(data)

PER_ARM_DIM = test_arms.shape[2]            
print(f"PER_ARM_DIM: {PER_ARM_DIM}")

# test_arms

PER_ARM_DIM: 64


#### global sampling

In [40]:
def _get_global_context_features(x):
    """
    This function generates a single global observation vector.
    """
    
    return test_user_id_model(x['user_id'])

In [41]:
test_globals = _get_global_context_features(data)

GLOBAL_DIM = test_globals.shape[1] 
print(f"GLOBAL_DIM: {GLOBAL_DIM}")

# test_globals

GLOBAL_DIM: 64


# Ranking Agent

## Feedback type

Ranking agents assume either a `score_vector` or `cascading feedback` framework for the feedback signal (reward). 

* `score_vector`: feedback is a vector of scores for every item in the slots. 
* `cascading feedback`: if the kth item was clicked, then the items up to k-1 receive a score of -1, the kth item receives a score based on a feedback value, while the rest of the items receive feedback of 0. 

Ranking agent objective: train the scoring network to be able to estimate the above scores

In [42]:
# feedback_model = ranking_environment.FeedbackModel.CASCADING
feedback_model = FeedbackModel.SCORE_VECTOR

## Tensor Specs

example Tensor Spec structures...

`observation_spec()`

```python
{'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)}
```

`action_spec()`

```python
BoundedTensorSpec(shape=(3,), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(49, dtype=int32))
```

`reward_spec()`

```python
{'chosen_index': BoundedTensorSpec(shape=(), dtype=tf.int32, name='chosen_index', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32)),
 'chosen_value': TensorSpec(shape=(), dtype=tf.float32, name='chosen_value')}
```

`time_step_spec()`

```python
TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)},
 'reward': {'chosen_index': BoundedTensorSpec(shape=(), dtype=tf.int32, name='chosen_index', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32)),
            'chosen_value': TensorSpec(shape=(), dtype=tf.float32, name='chosen_value')},
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})
```

In [43]:
# for i in range(1):
    
#     iterator = iter(train_dataset.batch(2))
#     data_1 = next(iterator)

# data_1

In [44]:
# global_spec = array_spec.ArraySpec.from_array(_get_global_context_features(data_1).numpy())
# global_spec

In [45]:
# _get_per_arm_features(data_1)

In [46]:
# item_spec = array_spec.add_outer_dims_nest(
#     array_spec.ArraySpec.from_array(_get_per_arm_features(data_1).numpy()), (NUM_ITEMS,)
# )
# item_spec

In [47]:
# GLOBAL_KEY = bandit_spec_utils.GLOBAL_FEATURE_KEY
# PER_ARM_KEY = bandit_spec_utils.PER_ARM_FEATURE_KEY

# observation_spec_test = {GLOBAL_KEY: global_spec, PER_ARM_KEY: item_spec}
# observation_spec_test

In [48]:
# _global_dim = global_spec.shape[0]
# _global_dim

In [49]:
# _item_dim = item_spec.shape[-1]
# _item_dim

**from [ranking_environment.py](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/environments/ranking_environment.py#L152C1-L166C6)**

```
    global_spec = array_spec.ArraySpec.from_array(global_sampling_fn())
    item_spec = array_spec.add_outer_dims_nest(
        array_spec.ArraySpec.from_array(item_sampling_fn()), (num_items,)
    )
    observation_spec = {GLOBAL_KEY: global_spec, PER_ARM_KEY: item_spec}
    self._global_dim = global_spec.shape[0]
    self._item_dim = item_spec.shape[-1]

    action_spec = array_spec.BoundedArraySpec(
        shape=(num_slots,),
        dtype=np.int32,
        minimum=0,
        maximum=num_items - 1,
        name='action',
    )
```

set vars

In [50]:
from tf_agents.specs import array_spec

print(f"BATCH_SIZE         : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE    : {EVAL_BATCH_SIZE}")
print(f"NUM_ITEMS          : {NUM_ITEMS}")
print(f"NUM_SLOTS          : {NUM_SLOTS}")
print(f"DISTANCE_THRESHOLD : {DISTANCE_THRESHOLD}")
print(f"GLOBAL_DIM         : {GLOBAL_DIM}")
print(f"PER_ARM_DIM        : {PER_ARM_DIM}")

BATCH_SIZE         : 5
EVAL_BATCH_SIZE    : 1
NUM_ITEMS          : 3
NUM_SLOTS          : 2
DISTANCE_THRESHOLD : 0.5
GLOBAL_DIM         : 64
PER_ARM_DIM        : 64


### Observation spec

* The observation the agent ingests contains the global features and the features
of the items in the recommendation slots. 
* The item features are stored in the `per_arm` part of the observation, in the order of how they are recommended.
* Since this ordered list of items expresses what action was taken by the policy, the `action` value of the trajectory is not used by the agent.

In [51]:
observation_spec = {
    'global': tf.TensorSpec([GLOBAL_DIM], tf.float32),
    # 'per_arm': tf.TensorSpec([NUM_ITEMS, PER_ARM_DIM], tf.float32)
    'per_arm': tf.TensorSpec([NUM_SLOTS, PER_ARM_DIM], tf.float32)
}
observation_spec

{'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}

### Action spec

> Action spec for ranking models must have rank 1

In [52]:
# action_spec = tensor_spec.BoundedTensorSpec(
#     shape=(NUM_SLOTS,),
#     dtype=tf.int32,
#     minimum=tf.constant(0),            
#     maximum=NUM_ITEMS - 1, # n degrees of freedom and will dictate the expected mean reward spec shape
#     name="action_spec"
# )

action_spec = array_spec.BoundedArraySpec(
    shape=(NUM_SLOTS,),
    dtype=np.int32,
    minimum=0,
    maximum=NUM_ITEMS - 1,
    name='action',
)

print(f"action_spec rank: {action_spec.shape}")

action_spec

action_spec rank: (2,)


BoundedArraySpec(shape=(2,), dtype=dtype('int32'), name='action', minimum=0, maximum=2)

### Reward spec

In [53]:
if feedback_model == ranking_environment.FeedbackModel.CASCADING:
    # `chosen_index == num_slots` means no recommended item was clicked.
    reward_spec = {
        'chosen_index': array_spec.BoundedArraySpec(
            shape=[],
            minimum=0,
            maximum=NUM_SLOTS,
            dtype=np.int32,
            name='chosen_index',
        ),
        'chosen_value': array_spec.ArraySpec(
            shape=[], dtype=np.float32, name='chosen_value'
        ),
    }
elif feedback_model == ranking_environment.FeedbackModel.SCORE_VECTOR:
    reward_spec = tf.TensorSpec(
        shape=[NUM_SLOTS], dtype=np.float32, name='score_vector'
    )
    # reward_spec = array_spec.ArraySpec(
    #     shape=[NUM_SLOTS], dtype=np.float32, name='score_vector'
    # )
else:
    reward_spec = f"Feedback model: {feedback_model}, not implemented"
    
reward_spec

TensorSpec(shape=(2,), dtype=tf.float32, name='score_vector')

### TimeStep spec

In [54]:
# TODO - investigate adding reward_spec
"""
TypeError: Expected observation and reward specs to 
both be either tensor or array specs, but saw spec values 
TensorSpec(shape=(64,), dtype=tf.float32, name=None) 
vs. ArraySpec(shape=(2,), dtype=dtype('float32'), name='score_vector')
"""


time_step_spec = ts.time_step_spec(
    observation_spec = observation_spec, 
    reward_spec = reward_spec             # TODO
)
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(2,), dtype=tf.float32, name='score_vector'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

Inspect chosen arm features spec

In [55]:
time_step_spec.observation

{'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}

## Policy and Scoring Network

> all ranking agents train a network that estimates scores of item/user pairs

**Ranking Policies**
* `DESCENDING_SCORES` - Stack rank deterministically by scores
* `NO_PENALTY` - Sampling sequentially based on scores; no penalty applied
* `COSINE_DISTANCE` - Sampling sequentally and taking diversity into account

`penalty_mixture` parameter governs the balance between ranking based on scores and accounting for diversity
* low positive value --> ranking has less diversity
* higher value --> enforces more diversity

`logits_temperature` - temperature parameter for non-deterministic policies
* This value must be positive

In [56]:
print(f"GLOBAL_DIM  : {GLOBAL_DIM}")
print(f"PER_ARM_DIM : {PER_ARM_DIM}")

GLOBAL_DIM  : 64
PER_ARM_DIM : 64


In [57]:
AGENT_TYPE = "Ranking"
NETWORK_TYPE = "dotproduct"
POLICY_TYPE = ranking_agent.RankingPolicyType.COSINE_DISTANCE # COSINE_DISTANCE | NO_PENALTY | DESCENDING_SCORES

PENALTY_MIXTURE = 1.0
LOGITS_TEMPERATURE = 1.0

LEARNING_RATE = 0.005

GLOBAL_LAYERS   = [64, 32, 16]
ARM_LAYERS      = [64, 32, 16]
COMMON_LAYERS   = [16, 8]

HPARAMS = {  # TODO - streamline and consolidate
    "batch_size": BATCH_SIZE,
    "eval_batch_size" : EVAL_BATCH_SIZE,
    "num_itmes": NUM_ITEMS,
    "num_slots": NUM_SLOTS,
    "model_type": AGENT_TYPE,
    "network_type": NETWORK_TYPE,
    "global_layers": GLOBAL_LAYERS,
    "per_arm_layers": ARM_LAYERS,
    "common_layers": COMMON_LAYERS,
    "learning_rate": LEARNING_RATE,
    "policy_type": POLICY_TYPE,
    "feedback_model" : feedback_model,
    "penalty_mixture": PENALTY_MIXTURE,
    "logits_temperature": LOGITS_TEMPERATURE,
}
pprint(HPARAMS)

{'batch_size': 5,
 'common_layers': [16, 8],
 'eval_batch_size': 1,
 'feedback_model': 2,
 'global_layers': [64, 32, 16],
 'learning_rate': 0.005,
 'logits_temperature': 1.0,
 'model_type': 'Ranking',
 'network_type': 'dotproduct',
 'num_itmes': 3,
 'num_slots': 2,
 'penalty_mixture': 1.0,
 'per_arm_layers': [64, 32, 16],
 'policy_type': <RankingPolicyType.COSINE_DISTANCE: 1>}


In [58]:
if NETWORK_TYPE == 'commontower':
    scoring_network = global_and_arm_feature_network.create_feed_forward_common_tower_network(
        observation_spec = observation_spec, 
        global_layers = GLOBAL_LAYERS, 
        arm_layers = ARM_LAYERS, 
        common_layers = COMMON_LAYERS,
        # output_dim = output_dim,
    )
    
elif NETWORK_TYPE == 'dotproduct':
    scoring_network = global_and_arm_feature_network.create_feed_forward_dot_product_network(
        observation_spec = observation_spec, 
        global_layers = GLOBAL_LAYERS, 
        arm_layers = ARM_LAYERS
    )
    
if NETWORK_TYPE:
    print(f"Network: {scoring_network.name}")

Network: GlobalAndArmDotProductNetwork


## Define Agent 

In [59]:
rank_agent = ranking_agent.RankingAgent(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    scoring_network=scoring_network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=HPARAMS['learning_rate']),
    feedback_model=ranking_agent.FeedbackModel.SCORE_VECTOR, # FeedbackModel.SCORE_VECTOR, # feedback_model,
    policy_type=HPARAMS['policy_type'],
    logits_temperature=HPARAMS['logits_temperature'],
    penalty_mixture_coefficient=HPARAMS['penalty_mixture'],
    summarize_grads_and_vars=True
)

rank_agent

<tf_agents.bandits.agents.ranking_agent.RankingAgent at 0x7f3e4a111f60>

In [60]:
rank_agent.action_spec

BoundedTensorSpec(shape=(2,), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(2, dtype=int32))

In [61]:
# time_step_spec

In [62]:
# rank_agent.time_step_spec

In [63]:
# rank_agent.training_data_spec

In [64]:
rank_agent.training_data_spec.observation

{'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}

In [65]:
# rank_agent.policy.action

In [66]:
rank_agent.policy.time_step_spec

_TupleWrapper(TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': DictWrapper({'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None), 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}),
 'reward': TensorSpec(shape=(2,), dtype=tf.float32, name='score_vector'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))

In [67]:
rank_agent.policy.trajectory_spec

_TupleWrapper(Trajectory(
{'action': BoundedTensorSpec(shape=(2,), dtype=tf.int32, name=None, minimum=array(0, dtype=int32), maximum=array(1, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None), 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}),
 'policy_info': PolicyInfo(log_probability=(), predicted_rewards_mean=TensorSpec(shape=(2,), dtype=tf.float32, name=None), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=()),
 'reward': TensorSpec(shape=(2,), dtype=tf.float32, name='score_vector'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))

### Reward function

* `_create_ranking_reward_features` - [src](https://source.corp.google.com/piper///depot/google3/learning/smartchoices/training/models/agent_factory_test.py?q=%22ranking_model_factory%22&sq=package:piper%20file:%2F%2Fdepot%2Fgoogle3%20-file:google3%2Fexperimental&start=1)

* `_get_rewards_from_arm_features` 

In [68]:
def _get_ranking_rewards_sv(x):
    """Calculates reward for the actions."""
    
    # rating_scores_list = []
    ratings_list = x["user_rating"] #[0]
    indices = tf.argsort(ratings_list, direction="DESCENDING")
    
    feedback = tf.gather(ratings_list, indices, batch_dims=-1) #.numpy()
    
    # feedback = tf.math.top_k(feedback, k=HPARAMS['num_slots']).values
    top_n_ratings = tf.slice(feedback, begin=[0, 0], size=[-1, HPARAMS['num_slots']])
    
    return top_n_ratings

In [69]:
test_ratings = _get_ranking_rewards_sv(data)
test_ratings

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[4., 3.],
       [5., 4.]], dtype=float32)>

In [70]:
def _rank_trajectory_fn(element): # hparams
    """Converts a dataset element into a trajectory."""
    # global_features = _get_global_context_features(element)
    # arm_features = _get_per_arm_features(element)
    
    global_features = _get_global_context_features(element)
    arm_features = _get_per_arm_features(element)
    
    # obs spec
    observation = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            train_utils._add_outer_dimension(global_features),
            # global_features,
        bandit_spec_utils.PER_ARM_FEATURE_KEY: 
            train_utils._add_outer_dimension(arm_features)
            # arm_features
    }
    
    # reward = element['user_rating']
    ranking_rewards = _get_ranking_rewards_sv(element)

    
    action = np.zeros((HPARAMS['num_slots']), dtype=np.int32)
    # top_n_ratings = tf.slice(feedback, begin=[0, 0], size=[-1, HPARAMS['num_slots']])
    # action=tf.zeros_like(ranking_rewards, dtype=tf.int32)
    
    discount = np.zeros((HPARAMS['num_slots']), dtype=np.float32)
    # discount=tf.zeros_like(ranking_rewards, dtype=tf.int32)
    return trajectory.single_step(
        observation=observation,
        action=action,
        policy_info=(), #policy_info,
        reward=ranking_rewards,
        discount=discount
    )


In [71]:
# test_arms # (2, 3, 64)

# top_arms = tf.slice(test_arms, begin=[0, 0, 0], size=[-1, HPARAMS['num_slots']])
# top_arms

In [72]:
for x in train_dataset.batch(HPARAMS['batch_size']).take(1):
    test_traj = _rank_trajectory_fn(x)
    
test_traj

Trajectory(
{'action': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([0, 0], dtype=int32)>,
 'discount': <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 2], dtype=int32)>,
 'observation': {'global': <tf.Tensor: shape=(5, 1, 64), dtype=float32, numpy=
array([[[ 0.04370818, -0.01135105,  0.0266356 ,  0.00445057,
          0.01794897, -0.01123155,  0.01820037, -0.01236316,
         -0.04714687,  0.01180333,  0.03817843,  0.00682211,
          0.04256009, -0.0283603 , -0.00242324,  0.00948932,
          0.01037207, -0.00822685,  0.04582613, -0.01500129,
         -0.03047838,  0.04197461,  0.03291135,  0.04426822,
          0.02072633, -0.00618589, -0.04319811,  0.0090512 ,
         -0.04558665,  0.02924452, -0.01990993,  0.04883667,
          0.00903035,  0.0130672 ,  0.0298072 , -0.00774354,
          0.03854993,  0.03709947,  0.00772595,  0.04914529,
          0.02790226, -0.03761049,

In [73]:
print(f"test_traj.observation.shape: {test_traj.observation['global'].shape}")
print(f"test_traj.observation.shape: {test_traj.observation['per_arm'].shape}")
print(f"test_traj.discount.shape   : {test_traj.discount.shape}") 

expected_num_actions = action_spec.maximum - action_spec.minimum + 1
print(f"expected_num_actions: {expected_num_actions}")

predicted_rewards_mean = tensor_spec.TensorSpec([expected_num_actions])
print(f"predicted_rewards_mean: {predicted_rewards_mean}")

test_traj.observation.shape: (5, 1, 64)
test_traj.observation.shape: (5, 1, 2, 64)
test_traj.discount.shape   : (2,)
expected_num_actions: 3
predicted_rewards_mean: TensorSpec(shape=(3,), dtype=tf.float32, name=None)


# Train Ranking Agent

In [74]:
EXPERIMENT_NAME   = f'local-ranker-{PREFIX}'

# new experiment
invoke_time       = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME          = f'run-{invoke_time}'

BASE_OUTPUT_DIR   = f'{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}'
LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
ROOT_DIR          = f"{BASE_OUTPUT_DIR}/root"       # Root directory for writing logs/summaries/checkpoints.
ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"  # Where the trained model will be saved and restored.

aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    experiment=EXPERIMENT_NAME
)

print(f"EXPERIMENT_NAME   : {EXPERIMENT_NAME}")
print(f"RUN_NAME          : {RUN_NAME}\n")
print(f"BASE_OUTPUT_DIR   : {BASE_OUTPUT_DIR}")
print(f"LOG_DIR           : {LOG_DIR}")
print(f"ROOT_DIR          : {ROOT_DIR}")
print(f"ARTIFACTS_DIR     : {ARTIFACTS_DIR}")

EXPERIMENT_NAME   : local-ranker-rec-bandits-v2
RUN_NAME          : run-20231010-121806

BASE_OUTPUT_DIR   : gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806
LOG_DIR           : gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/logs
ROOT_DIR          : gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/root
ARTIFACTS_DIR     : gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/artifacts


In [75]:
import collections

from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import tf_policy
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import policy_saver
from tf_agents.metrics import export_utils
from tf_agents.metrics import tf_metrics
from tf_agents.eval import metric_utils
from tf_agents.utils import common

import tf_agents

print(f"tf_agents version: {tf_agents.__version__}")
print(f"tensorflow version: {tf.__version__}")

tf_agents version: 0.17.0
tensorflow version: 2.13.0


### logs & checkpoints

In [76]:
global_step = tf.compat.v1.train.get_or_create_global_step()

rank_agent.initialize()
print(f'agent: {rank_agent.name}')
print(f'agent: {rank_agent.policy.name}')

agent: ranking_agent
agent: penalize_cosine_distance_ranking_policy


In [77]:
# ====================================================
# TB summary writer
# ====================================================
train_summary_writer = tf.compat.v2.summary.create_file_writer(
    f"{LOG_DIR}", flush_millis=10 * 1000
)
# train_summary_writer.set_as_default()

# eval_summary_writer = tf.compat.v2.summary.create_file_writer(
#     f"{LOG_DIR}/eval", flush_millis=10 * 1000
# )
# ====================================================
# metrics
# ====================================================
# `step_metric` records the number of individual rounds of bandit interaction;
# that is, (number of trajectories) * batch_size
step_metric = tf_metrics.EnvironmentSteps()

if feedback_model == ranking_environment.FeedbackModel.SCORE_VECTOR:
    reward_metric = tf_metrics.AverageReturnMetric(
        batch_size=HPARAMS['batch_size'],
        buffer_size=20
    )
else:
    reward_metric = tf_metrics.AverageReturnMultiMetric(
        reward_spec=environment.reward_spec(),
        batch_size=HPARAMS['batch_size'],
        buffer_size=20
    )
    
metrics = [reward_metric]

pprint(f"metrics: {metrics}")

# ====================================================
# get checkpoint manager
# ====================================================
CHKPOINT_DIR = f"{ROOT_DIR}/chkpoint"
print(f"setting checkpoint_manager: {CHKPOINT_DIR}")

checkpoint_manager = train_utils.restore_and_get_checkpoint_manager(
    root_dir=CHKPOINT_DIR, 
    agent=rank_agent, 
    metrics=metrics, 
    step_metric=step_metric
)

('metrics: [<tf_agents.metrics.tf_metrics.AverageReturnMetric object at '
 '0x7f3e4a12faf0>]')
setting checkpoint_manager: gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/root/chkpoint


In [78]:
# # ====================================================
# # policy saver
# # ====================================================
# saver = policy_saver.PolicySaver(
#     policy = rank_agent.policy, 
#     # train_step=global_step
# )

In [79]:
policy = rank_agent.policy
# isinstance(policy, tf_policy.TFPolicy)

In [80]:
len(list(train_dataset))

917

## Train config

In [81]:
TRAIN_DATA_SIZE = 900          # len(list(train_dataset))
NUM_TRAIN_STEPS = 180            # TRAIN_DATA_SIZE // HPARAMS['batch_size']

EVAL_DATA_SIZE  = 900          # len(list(val_dataset))
NUM_EVAL_STEPS  = 100           # EVAL_DATA_SIZE // HPARAMS['eval_batch_size']

CHKPT_INTERVAL  = NUM_TRAIN_STEPS # // 5
LOG_INTERVAL    = 10
# EVAL_INTERVAL = NUM_TRAIN_STEPS // 2

print(f"TRAIN_DATA_SIZE : {TRAIN_DATA_SIZE}")
print(f"NUM_TRAIN_STEPS : {NUM_TRAIN_STEPS}")
print(f"EVAL_DATA_SIZE : {EVAL_DATA_SIZE}")
print(f"NUM_EVAL_STEPS : {NUM_EVAL_STEPS}")
print(f"CHKPT_INTERVAL: {CHKPT_INTERVAL}")
print(f"LOG_INTERVAL : {LOG_INTERVAL}")
# print(f"EVAL_INTERVAL : {EVAL_INTERVAL}")

TRAIN_DATA_SIZE : 900
NUM_TRAIN_STEPS : 180
EVAL_DATA_SIZE : 900
NUM_EVAL_STEPS : 100
CHKPT_INTERVAL: 180
LOG_INTERVAL : 10


In [82]:
# train data
train_ds_iterator = iter(train_dataset.batch(HPARAMS['batch_size']))
# train_ds_iterator = iter(train_dataset)

# eval dataset
eval_ds = val_dataset.batch(HPARAMS["eval_batch_size"])

if NUM_EVAL_STEPS > 0:
    eval_ds = eval_ds.take(NUM_EVAL_STEPS)

# eval_ds = eval_ds.prefetch(tf.data.AUTOTUNE)
# eval_ds

[ranking_agent](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/agents/ranking_agent.py#L288C1-L310C8)

```
  def _loss(
      self,
      experience: types.NestedTensor,
      weights: Optional[types.Tensor] = None,
      training: bool = False,
  ) -> tf_agent.LossInfo:
    """Computes loss for training the reward and constraint networks.

    Args:
      experience: A batch of experience data in the form of a `Trajectory` or
        `Transition`.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.  The output batch loss will be scaled by these weights, and the
        final scalar loss is the mean of these values.
      training: Whether the loss is being used for training.

    Returns:
      A `LossInfo` containing the loss for the training step.

    Raises:
      ValueError:
        if the number of actions is greater than 1.
    """
```

In [83]:
list_o_loss = []

rank_agent.train_step_counter.assign(0)

print(f"starting train loop...")
start_time = time.time()

# for i in tqdm(range(NUM_TRAIN_STEPS)):
for i in range(NUM_TRAIN_STEPS):
    
    with train_summary_writer.as_default():

        data = next(train_ds_iterator)
        trajectories = _rank_trajectory_fn(data)

        # All tensors in experience must be shaped [batch, time, ...] 
        step = rank_agent.train_step_counter.numpy()
        loss = rank_agent.train(experience=trajectories)
        list_o_loss.append(loss.loss.numpy())
        
        train_utils._export_metrics_and_summaries(
            step=i, 
            metrics=metrics
        )
        
        # print step loss
        if step % LOG_INTERVAL == 0:
            print(
                'step = {0}: train loss = {1}'.format(
                    step, round(loss.loss.numpy(), 2)
                )
            )

        if i > 0 and i % CHKPT_INTERVAL == 0:
            saver.save(os.path.join(CHKPOINT_DIR, 'policy_%d' % step_metric.result()))
            print(f"saved policy to: {CHKPOINT_DIR}")
            
runtime_mins = int((time.time() - start_time) / 60)
print(f"train runtime_mins: {runtime_mins}")

starting train loop...
step = 0: train loss = 17.479999542236328
step = 10: train loss = 7.900000095367432
step = 20: train loss = 1.0099999904632568
step = 30: train loss = 1.100000023841858
step = 40: train loss = 0.8100000023841858
step = 50: train loss = 0.6499999761581421
step = 60: train loss = 0.9300000071525574
step = 70: train loss = 0.4000000059604645
step = 80: train loss = 0.9100000262260437
step = 90: train loss = 2.3299999237060547
step = 100: train loss = 0.5199999809265137
step = 110: train loss = 1.1299999952316284
step = 120: train loss = 0.800000011920929
step = 130: train loss = 0.6700000166893005
step = 140: train loss = 1.0
step = 150: train loss = 1.9500000476837158
step = 160: train loss = 0.3499999940395355
step = 170: train loss = 0.1899999976158142
train runtime_mins: 11


In [85]:
# LOG_DIR

! gsutil ls $LOG_DIR 
#/train

gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/logs/
gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/logs/events.out.tfevents.1696940294.jt-tfa-bandit-rankers-2023-v2.177543.0.v2


In [86]:
# %load_ext tensorboard
%reload_ext tensorboard

In [88]:
%tensorboard --logdir=$LOG_DIR

### Save Bandit Ranker

In [89]:
# save bandit
tf.saved_model.save(rank_agent, ARTIFACTS_DIR)

In [90]:
!gsutil ls $ARTIFACTS_DIR

gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/artifacts/
gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/artifacts/fingerprint.pb
gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/artifacts/saved_model.pb
gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/artifacts/assets/
gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/run-20231010-121806/artifacts/variables/


# Evaluate bandit ranking

> **TODO**

In [248]:
HPARAMS

{'batch_size': 5,
 'eval_batch_size': 1,
 'num_itmes': 3,
 'num_slots': 2,
 'model_type': 'Ranking',
 'network_type': 'dotproduct',
 'global_layers': [64, 32, 16],
 'per_arm_layers': [64, 32, 16],
 'common_layers': [16, 8],
 'learning_rate': 0.005,
 'policy_type': <RankingPolicyType.COSINE_DISTANCE: 1>,
 'feedback_model': 2,
 'penalty_mixture': 1.0,
 'logits_temperature': 1.0}

In [247]:
def _create_random_ranking(
    num_ranked_items: int, num_allowed_values: int, batch_size: int
):
    """Returns a batch of `batch_size` rankings each of length `num_ranked_items`.

    1. If `num_ranked_items` is no greater than `num_allowed_values`, each ranking
    is a subset of [0, 1, ..., num_allowed_values - 1], of size
    `num_ranked_items`.
    2. If `num_ranked_items` is greater than `num_allowed_values`, the first
    `num_allowed_values` of each ranking is a permutation of
    [0, 1, ... num_allowed_values - 1]. The remaining `num_ranked_items -
    num_allowed_values` of the ranking are unspecified.

    Args:
    num_ranked_items: the expected number of items in the output ranking, as
      specified in the study config.
    num_allowed_values: the number of items to select from.
    batch_size: the number of batches of random rankings to return.
    """
    num_valid_indices = min(num_ranked_items, num_allowed_values)
    ranking = np.full(
        (batch_size, num_ranked_items), np.iinfo(np.int32).max, dtype=np.int32
    )
    for idx in range(batch_size):
        ranking[idx, :num_valid_indices] = np.random.choice(
            num_allowed_values, size=num_valid_indices, replace=False
        )
    return ranking

In [249]:
rand_rank_batch = _create_random_ranking(
    num_ranked_items=HPARAMS['num_slots'],
    num_allowed_values=HPARAMS['num_itmes'],
    batch_size=HPARAMS['batch_size']
)
rand_rank_batch

array([[0, 1],
       [1, 2],
       [2, 1],
       [1, 0],
       [1, 0]], dtype=int32)

In [238]:
# from src.perarm_features import eval_perarm as eval_perarm
eval_policy_tf = py_tf_eager_policy.PyTFEagerPolicy(rank_agent.policy, use_tf_function=True)
eval_policy_tf

<tf_agents.policies.py_tf_eager_policy.PyTFEagerPolicy at 0x7fbae45bba90>

In [243]:
# eval_policy_tf.time_step_spec

In [244]:
# eval_policy_tf.trajectory_spec

In [246]:
prediction = eval_policy_tf.action(trajectories)
prediction

In [317]:
for x in eval_ds:

    filter_mask = None
    # get feature tensors

#     global_feat_infer = _get_global_context_features(x)
#     arm_feat_infer = _get_per_arm_features(x)
    
#     feature = {'global': global_feat_infer, 'per_arm': arm_feat_infer}

#     rewards = x['user_rating']
    
    # reshape arm features
    # arm_feat_infer = tf.reshape(arm_feat_infer, [HPARAMS['eval_batch_size'], PER_ARM_DIM])
    # concat_arm = tf.concat([arm_feat_infer, dummy_arm], axis=0)
    
    # trajectory_step = train_utils._get_eval_step(feature, rewards.numpy()[0])
    trajectory_step = _rank_trajectory_fn(x)
    
    break

concat_embeddings shape: (5, 64)
concat_embeddings shape: (5, 64)


In [321]:
# rewards.numpy()[0]

In [320]:
# feature

In [322]:
# arm_feat_infer

In [None]:
# ====================================================
# Evaluate the agent's policy once before training
# ====================================================
# Reset the train step
rank_agent.train_step_counter.assign(0)

pre_policy_tf = py_tf_eager_policy.PyTFEagerPolicy(rank_agent.policy, use_tf_function=True)

print(f"evaluating pre-trained Agent...")
start_time = time.time()

pre_val_loss, pre_preds, pre_tr_rewards = eval_perarm._run_bandit_eval(
    policy = pre_policy_tf,
    data = eval_ds,
    eval_batch_size = HPARAMS['eval_batch_size'],
    per_arm_dim = PER_ARM_DIM,
    global_dim = GLOBAL_DIM,
    vocab_dict = vocab_dict,
    num_oov_buckets = NUM_OOV_BUCKETS,
    global_emb_size = GLOBAL_EMBEDDING_SIZE,
    mv_emb_size = MV_EMBEDDING_SIZE,
)

runtime_mins = int((time.time() - start_time) / 60)
print(f"pre-train val_loss     : {pre_val_loss}")
print(f"pre-train eval runtime : {runtime_mins}")
# ====================================================
# train loop
# ====================================================
print(f"starting train loop...")
start_time = time.time()

## Ranking Bandit

In [32]:
BATCH_SIZE      = 128
EVAL_BATCH_SIZE = 1

NUM_ACTIONS = 2
NUM_ITEMS  = 50
NUM_SLOTS  = 3 

print(f"BATCH_SIZE      : {BATCH_SIZE}")
print(f"EVAL_BATCH_SIZE : {EVAL_BATCH_SIZE}")
print(f"NUM_ACTIONS     : {NUM_ACTIONS}")
print(f"GLOBAL_DIM      : {GLOBAL_DIM}")
print(f"PER_ARM_DIM     : {PER_ARM_DIM}")

BATCH_SIZE      : 128
EVAL_BATCH_SIZE : 1
NUM_ACTIONS     : 2
GLOBAL_DIM      : 64
PER_ARM_DIM     : 64


## specs

### Observation Spec

**example config:**
```
global_dim = 9  #@param{ type: "integer"}
item_dim   = 11  #@param{ type: "integer"}
num_items  = 50 #@param{ type: "integer"}
num_slots  = 3  #@param{ type: "integer"}
```
**example obs sepc:**

```
{'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)}
 ```

In [33]:
observation_spec = {
    'global': tf.TensorSpec([GLOBAL_DIM], tf.float32),
    'per_arm': tf.TensorSpec([NUM_ACTIONS, PER_ARM_DIM], tf.float32) #excluding action dim here
}
observation_spec

{'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)}

### Action Spec

```
BoundedTensorSpec(shape=(3,), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(49, dtype=int32))

```

In [34]:
action_spec = tensor_spec.BoundedTensorSpec(
    shape=[], 
    dtype=tf.int32,
    minimum=tf.constant(0),            
    maximum=NUM_SLOTS-1, # n degrees of freedom and will dictate the expected mean reward spec shape
    name="action_spec"
)

action_spec

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action_spec', minimum=array(0, dtype=int32), maximum=array(2, dtype=int32))

### TimeStep Spec

```
TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(9,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(50, 11), dtype=tf.float32, name=None)},
 'reward': {'chosen_index': BoundedTensorSpec(shape=(), dtype=tf.int32, name='chosen_index', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32)),
            'chosen_value': TensorSpec(shape=(), dtype=tf.float32, name='chosen_value')},
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})
 ```

In [35]:
time_step_spec = ts.time_step_spec(observation_spec)#, reward_spec=tf.TensorSpec([1, NUM_ACTIONS]))
time_step_spec

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(64,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(2, 64), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

In [None]:
feedback_model = ranking_environment.FeedbackModel

In [None]:
from src.perarm_features import ranking_bandit_policy
# policy = ranking_bandit_policy.GenLinearRankingBanditPolicy(

In [None]:
from src.perarm_features import agent_factory as agent_factory
ranking_bandit_agent = agent_factory.GenLinearRankingBanditAgent(XXX)