In [1]:
# !pip install tf-agents --user -q

In [1]:
!pip freeze | grep tf-agents

tf-agents==0.17.0


In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [3]:
import functools
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import random


from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
import tensorflow_datasets as tfds
from pprint import pprint

nest = tf.nest

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


### movies data

In [4]:
movies = tfds.load("movielens/100k-movies", split="train")

for x in movies.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[4]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'1681'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'You So Crazy (1994)'], dtype=object)>}


### user and ratings data

In [5]:
ratings = tfds.load("movielens/100k-ratings", split="train")

for x in ratings.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([46.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,

#### Let's make this simple and load up movielens that has features
We will only consider for this example
1) The movie genere as an Arm feature (we will concatenate multiple genres)
2) The user occupation and age bucket labels for the overall context features

We need to load the data, get the ratings - light EDA for us to get cardnality of the dataset as well as lookups for the 

In [6]:
# Get the unique movies and users
unique_movie_ids = ratings.map(lambda x: x["movie_id"])
unique_movie_ids = np.unique([x.numpy() for x in unique_movie_ids])
MOVIELENS_NUM_MOVIES = len(unique_movie_ids)


print(f"len(unique_movie_ids) : {len(unique_movie_ids)}")
print(f"unique_movie_ids      : {unique_movie_ids[:2]}")

len(unique_movie_ids) : 1682
unique_movie_ids      : [b'1' b'10']


In [7]:
unique_user_ids = ratings.map(lambda x: x["user_id"])
unique_user_ids = np.unique([x.numpy() for x in unique_user_ids])
MOVIELENS_NUM_USERS = len(unique_user_ids)


print(f"len(unique_user_ids) : {len(unique_user_ids)}")
print(f"unique_user_ids      : {unique_user_ids[:2]}")

len(unique_user_ids) : 943
unique_user_ids      : [b'1' b'10']


In [8]:
## Get the unnique set of user buckets and create a lookup table

In [9]:
from typing import Dict

def get_dictionary_lookup_by_tf_data_key(key: str) -> Dict:
    tensor = ratings.map(lambda x: x[key])
    unique_elems = set()
    for x in tensor:
        val = x.numpy()
        if type(val) is np.ndarray: # if multi dimesnional only grab first one
            val = val[0]
        unique_elems.add(val)
    
    #return a dictionary of keys by integer values for the feature space
    return {val: i for i, val in enumerate(unique_elems)}


In [10]:
user_age_lookup = get_dictionary_lookup_by_tf_data_key('bucketized_user_age')
user_age_dim = len(user_age_lookup)

In [11]:
user_age_lookup

{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [12]:
user_occ_lookup = get_dictionary_lookup_by_tf_data_key('user_occupation_text')
user_occ_dim = len(user_occ_lookup)

In [13]:
user_occ_lookup

{b'student': 0,
 b'librarian': 1,
 b'executive': 2,
 b'other': 3,
 b'marketing': 4,
 b'artist': 5,
 b'healthcare': 6,
 b'programmer': 7,
 b'engineer': 8,
 b'homemaker': 9,
 b'scientist': 10,
 b'doctor': 11,
 b'entertainment': 12,
 b'lawyer': 13,
 b'administrator': 14,
 b'none': 15,
 b'retired': 16,
 b'writer': 17,
 b'educator': 18,
 b'technician': 19,
 b'salesman': 20}

In [14]:
movie_gen_lookup = get_dictionary_lookup_by_tf_data_key('movie_genres')
movie_gen_dim = len(movie_gen_lookup)

In [15]:
movie_gen_lookup

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18}

In [16]:
# REFACTOR BELOW
 #from https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/environments/dataset_utilities.py#L153
    
# def load_movielens_data(data_file, delimiter=','):
#     """Loads the movielens data and returns the ratings matrix."""
#     ratings_matrix = np.zeros([MOVIELENS_NUM_USERS, MOVIELENS_NUM_MOVIES])
#     with tf.io.gfile.GFile(data_file, 'r') as infile:
#     # The file is a csv with rows containing:
#     # user id | item id | rating | timestamp
#     reader = csv.reader(infile, delimiter=delimiter)
#     for row in reader:
#         user_id, item_id, rating, _ = row
#         ratings_matrix[int(user_id) - 1, int(item_id) - 1] = float(rating)
#     return ratings_matrix



def load_movielens_data(ratings_dataset):
    # ratings = tfds.load("movielens/100k-ratings", split="train")
    ratings_matrix = np.zeros([MOVIELENS_NUM_USERS, MOVIELENS_NUM_MOVIES])
    local_data = ratings_dataset.map(lambda x: {'user_id': x['user_id']
                                                 ,'movie_id':  x['movie_id']
                                                 ,'user_rating':  x['user_rating']
                                                 ,'bucketized_user_age': x['bucketized_user_age']
                                                 ,'user_occupation_text': x['user_occupation_text']
                                                 ,'movie_genres': x['movie_genres'][0]
                                               }
                                                                         )
    user_age_int = []
    user_occ_int = []
    mov_gen_int = []
    for row in local_data:
        ratings_matrix[int(row['user_id'].numpy()) - 1, int(row['movie_id'].numpy()) - 1] = float(row['user_rating'].numpy())
        user_age_int.append(user_age_lookup[row['bucketized_user_age'].numpy()])
        user_occ_int.append(user_occ_lookup[row['user_occupation_text'].numpy()])
        mov_gen_int.append(movie_gen_lookup[row['movie_genres'].numpy()])
    return tf.convert_to_tensor(ratings_matrix, dtype=tf.float32), tf.convert_to_tensor(np.array(user_age_int), dtype=tf.float32), tf.convert_to_tensor(np.array(user_occ_int), dtype=tf.float32), tf.convert_to_tensor(np.array(mov_gen_int), dtype=tf.float32)
    

In [17]:
ratings_matrix, user_age_int, user_occ_int, mov_gen_int = load_movielens_data(ratings)

In [18]:
sampled_user_indices_np = np.random.randint(
            1000, size=8)
sampled_user_indices = tf.convert_to_tensor(sampled_user_indices_np, dtype=tf.int32)
sampled_user_indices = np.expand_dims(sampled_user_indices,axis=-1) #expand out to individual indicies to match sizes for slicing

In [19]:
sampled_user_indices

array([[189],
       [188],
       [336],
       [284],
       [ 37],
       [364],
       [903],
       [878]], dtype=int32)

In [20]:
sampled_user_ages = tf.gather_nd(indices=sampled_user_indices
                                         , params=user_age_int
                                         , batch_dims=0)

### Now do the same with the movies

In [21]:
sampled_movie_indices_np = np.array([
            random.sample(range(1000), 5)
            for _ in range(8)
        ])
sampled_movie_indices = tf.convert_to_tensor(sampled_movie_indices_np, dtype=tf.int32)

In [22]:
# movie_index_vector = tf.reshape(sampled_movie_indices, shape=[-1])
movie_index_vector = tf.convert_to_tensor(sampled_movie_indices, dtype=tf.int32)
movie_index_vector = tf.expand_dims(tf.reshape(movie_index_vector, shape=[-1]), axis=-1)
# flat_genre_list = self._mov_gen_int[movie_index_vector] #shape of 1
movie_index_vector

<tf.Tensor: shape=(40, 1), dtype=int32, numpy=
array([[ 71],
       [568],
       [375],
       [664],
       [ 51],
       [ 26],
       [ 19],
       [962],
       [ 78],
       [620],
       [447],
       [190],
       [248],
       [747],
       [996],
       [214],
       [812],
       [325],
       [171],
       [775],
       [701],
       [615],
       [106],
       [626],
       [433],
       [934],
       [120],
       [435],
       [745],
       [150],
       [519],
       [191],
       [712],
       [769],
       [195],
       [959],
       [229],
       [841],
       [149],
       [244]], dtype=int32)>

In [23]:
mov_gen_int

<tf.Tensor: shape=(100000,), dtype=float32, numpy=array([ 7.,  4.,  4., ..., 10.,  0.,  4.], dtype=float32)>

In [24]:
flat_genre_list = tf.gather_nd(indices=movie_index_vector
                               , params=mov_gen_int
                               , batch_dims=0) #shape of 1
genre_list = tf.reshape(flat_genre_list, shape=(8, 5)) #batch size, action 
genre_list

<tf.Tensor: shape=(8, 5), dtype=float32, numpy=
array([[0., 7., 0., 4., 4.],
       [5., 1., 4., 4., 0.],
       [0., 4., 0., 5., 6.],
       [5., 9., 0., 3., 7.],
       [0., 4., 0., 0., 0.],
       [0., 4., 0., 4., 0.],
       [1., 0., 4., 7., 7.],
       [7., 0., 7., 4., 7.]], dtype=float32)>

In [25]:
#### Tf SVD

In [26]:
s, u, vh = tf.linalg.svd(ratings_matrix, full_matrices=False)

rank_k = 4

# Keep only the largest singular values.
u_hat = u[:, :rank_k]
s_hat = s[:rank_k]
v_hat = vh[:, :rank_k]

In [27]:
latent_movie_features = tf.gather_nd(indices=movie_index_vector, params=v_hat)
latent_movie_features_reshaped = tf.reshape(latent_movie_features, shape=(8,5,rank_k))

In [28]:
latent_movie_features_reshaped

<tf.Tensor: shape=(8, 5, 4), dtype=float32, numpy=
array([[[-3.54306847e-02, -7.64061231e-03,  4.35530171e-02,
         -5.17456699e-03],
        [-1.56895164e-02,  5.02521824e-03,  1.84637476e-02,
          1.48388222e-02],
        [-5.39823947e-03,  1.52539869e-03,  1.46353347e-02,
         -6.38199737e-03],
        [-2.22820733e-02,  1.86000427e-03,  3.33855152e-02,
          1.75931305e-02],
        [-2.83747204e-02, -3.88095081e-02, -2.86513176e-02,
          6.45131571e-03]],

       [[-1.49953561e-02,  9.73032881e-03,  2.54759118e-02,
          1.57616511e-02],
        [-1.53361913e-02,  6.28445530e-03, -4.20554318e-02,
          3.02750878e-02],
        [-1.40080405e-02, -8.14620592e-03, -1.79557167e-02,
          1.65346693e-02],
        [-9.39802304e-02, -4.09501093e-03,  5.47075793e-02,
         -1.07907050e-03],
        [-3.25970771e-03, -4.54645092e-03,  2.55617104e-03,
         -8.74069054e-03]],

       [[-2.42376067e-02, -1.33329453e-02,  1.02638919e-02,
          5.487

In [29]:
# Concatenate the values so genre is added to the first axis

tf.concat([latent_movie_features_reshaped, tf.expand_dims(genre_list, axis=-1)], axis=2)

<tf.Tensor: shape=(8, 5, 5), dtype=float32, numpy=
array([[[-3.54306847e-02, -7.64061231e-03,  4.35530171e-02,
         -5.17456699e-03,  0.00000000e+00],
        [-1.56895164e-02,  5.02521824e-03,  1.84637476e-02,
          1.48388222e-02,  7.00000000e+00],
        [-5.39823947e-03,  1.52539869e-03,  1.46353347e-02,
         -6.38199737e-03,  0.00000000e+00],
        [-2.22820733e-02,  1.86000427e-03,  3.33855152e-02,
          1.75931305e-02,  4.00000000e+00],
        [-2.83747204e-02, -3.88095081e-02, -2.86513176e-02,
          6.45131571e-03,  4.00000000e+00]],

       [[-1.49953561e-02,  9.73032881e-03,  2.54759118e-02,
          1.57616511e-02,  5.00000000e+00],
        [-1.53361913e-02,  6.28445530e-03, -4.20554318e-02,
          3.02750878e-02,  1.00000000e+00],
        [-1.40080405e-02, -8.14620592e-03, -1.79557167e-02,
          1.65346693e-02,  4.00000000e+00],
        [-9.39802304e-02, -4.09501093e-03,  5.47075793e-02,
         -1.07907050e-03,  4.00000000e+00],
        [-3

## Replicate an agent using the above data

https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/environments/movielens_per_arm_py_environment.py

Create an arm spec from this utility function
https://www.tensorflow.org/agents/api_docs/python/tf_agents/specs/bandit_spec_utils/create_per_arm_observation_spec

#### NOT Used but helpful to create an obs spec:

```python
# Example observation spec from above
# There are 20 user occupations and 7 age buckets. This makes our global dimension 27
# There are 19 genres, and that will be the arm dimension for this example

from tf_agents.specs.bandit_spec_utils import create_per_arm_observation_spec as create_obs_spec
create_obs_spec(
    global_dim = 1,
    per_arm_dim = 2,
    max_num_actions = 10,
    add_num_actions_feature = False
) 
```

In [30]:
"""Class implementation of the per-arm MovieLens Bandit environment."""
from __future__ import absolute_import

import random
from typing import Optional, Text
import gin
import numpy as np

from tf_agents.bandits.environments import bandit_py_environment
from tf_agents.bandits.environments import dataset_utilities
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


GLOBAL_KEY = bandit_spec_utils.GLOBAL_FEATURE_KEY
PER_ARM_KEY = bandit_spec_utils.PER_ARM_FEATURE_KEY


# @gin.configurable
class MovieLensPerArmPyEnvironment(bandit_py_environment.BanditPyEnvironment):
    """Implements the per-arm version of the MovieLens Bandit environment.

    This environment implements the MovieLens 100K dataset, available at:
    https://www.kaggle.com/prajitdatta/movielens-100k-dataset

    This dataset contains 100K ratings from 943 users on 1682 items.
    This csv list of:
    user id | item id | rating | timestamp.
    This environment computes a low-rank matrix factorization (using SVD) of the
    data matrix `A`, such that: `A ~= U * Sigma * V^T`.

    The environment uses the rows of `U` as global (or user) features, and the
    rows of `V` as per-arm (or movie) features.

    The reward of recommending movie `v` to user `u` is `u * Sigma * v^T`.
    """

    def __init__(self,
               dataset = ratings,
               rank_k: int = 2,
               batch_size: int = 10,
               num_actions: int = 100,
               name: Optional[Text] = 'movielens_per_arm'):
        """Initializes the Per-arm MovieLens Bandit environment.

        Args:
          data_dir: (string) Directory where the data lies (in text form).
          rank_k : (int) Which rank to use in the matrix factorization. This will
            also be the feature dimension of both the user and the movie features.
          batch_size: (int) Number of observations generated per call.
          num_actions: (int) How many movies to choose from per round.
          csv_delimiter: (string) The delimiter to use in loading the data csv file.
          name: (string) The name of this environment instance.
        """
        self._batch_size = batch_size
        self._num_actions = num_actions
        self.rank_k = rank_k

        # Compute the matrix factorization.
        # self._data_matrix = dataset_utilities.load_movielens_data(
        #     data_dir, delimiter=csv_delimiter)

        self._data_matrix, self._user_age_int, self._user_occ_int, self._mov_gen_int = load_movielens_data(ratings)
        self._num_users, self._num_movies = self._data_matrix.shape

        # Compute the SVD.
        s, u, vh = tf.linalg.svd(self._data_matrix, full_matrices=False)

        # Keep only the largest singular values.
        self._u_hat = u[:, :rank_k]
        self._s_hat = s[:rank_k]
        self._v_hat = vh[:, :rank_k]

        self._approx_ratings_matrix = tf.matmul(self._u_hat * self._s_hat,
                                                tf.transpose(self._v_hat))

        self._action_spec = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.int32,
            minimum=0,
            maximum=num_actions - 1,
            name='action')
        observation_spec = {
            GLOBAL_KEY:
                array_spec.ArraySpec(shape=[rank_k+2], dtype=np.float32), #creating +space for user age and occupation
            PER_ARM_KEY:
                array_spec.ArraySpec(
                    shape=[num_actions, rank_k+1], dtype=np.float32), #creating +1 space for movie genre
        }
        self._time_step_spec = ts.time_step_spec(observation_spec)

        self._current_user_indices = tf.zeros(batch_size, dtype=np.int32)
        self._previous_user_indices = tf.zeros(batch_size, dtype=np.int32)

        self._current_movie_indices = tf.zeros([batch_size, num_actions],
                                               dtype=np.int32)
        self._previous_movie_indices = tf.zeros([batch_size, num_actions],
                                                dtype=np.int32)

        self._observation = {
            GLOBAL_KEY:
                tf.zeros([batch_size, rank_k+2], dtype=np.int32), #making space like above for dimensions
            PER_ARM_KEY:
                tf.zeros([batch_size, num_actions, rank_k+1], dtype=np.int32),
        }

        super(MovieLensPerArmPyEnvironment, self).__init__(
            observation_spec, self._action_spec, name=name)

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def batched(self):
        return True

    def _observe(self):
        
        #user section - random sample users
        sampled_user_indices_np = np.random.randint(
            self._num_users, size=self._batch_size)
        sampled_user_indices_1d = tf.convert_to_tensor(sampled_user_indices_np
                                                       , dtype=tf.int32)
        #expand dims for gather_nd - need to have indices like this [[1], [2], [5]] vs. [1, 2, 5]
        sampled_user_indices = tf.expand_dims(sampled_user_indices_1d
                                              , axis=-1)
        
        #sample feature values - gather_nd gathers the values from the randomly sampled incies
        sampled_user_ages = tf.gather_nd(indices=sampled_user_indices
                                         , params=self._user_age_int)
        sampled_user_occ = tf.gather_nd(indices=sampled_user_indices
                                        , params=self._user_occ_int)
        latent_user_features = tf.gather_nd(indices=sampled_user_indices
                                            , params=self._u_hat)
        
        #we concatenate these - these are our user/context features. note expand dims is needed to properly concatnate across the 1st dim
        combined_user_features = tf.concat([latent_user_features
                                                 , tf.expand_dims(sampled_user_ages, axis=-1)
                                                 , tf.expand_dims(sampled_user_occ, axis=-1)], axis=1)
    
        
        ###movie section

        sampled_movie_indices_np = np.array([
            random.sample(range(self._num_movies), self._num_actions)
            for _ in range(self._batch_size)
        ])
        sampled_movie_indices = tf.convert_to_tensor(sampled_movie_indices_np
                                                     , dtype=tf.int32)
        
        
        #expand dims for gather_nd - need to have indices like this [[1], [2], [5]] vs. [1, 2, 5]
        movie_index_vector = tf.expand_dims(tf.reshape(sampled_movie_indices
                                                       , shape=[-1])
                                            , axis=-1)
        
        #movie index vector is flattened across actions now, so this will gather the genre feature values for each sampled action(movie)
        flat_genre_list = tf.gather_nd(indices=movie_index_vector
                                       , params=self._mov_gen_int) #shape of 1
        #adding actions back as a dimesions
        reshaped_genre_features = tf.reshape(flat_genre_list
                                             , shape = [self._batch_size
                                                        , self._num_actions])
        #gathering the latent movie features, again flattented at action level
        latent_movie_features = tf.gather_nd(indices=movie_index_vector
                                             , params=self._v_hat) #shape of 2
        #then we reshape the action back in
        latent_movie_features_reshaped = tf.reshape(latent_movie_features
                                                    , shape=[self._batch_size, self._num_actions, self.rank_k])
        #now that the shape is right for the latent features + the movie genre and we have dimensions = batch x action x feature dim (we concatenate at feature dim)
        current_movies = tf.concat([latent_movie_features_reshaped
                                             , tf.expand_dims(reshaped_genre_features, axis=-1)], axis=2)

        #save the indices 
        self._previous_user_indices = self._current_user_indices
        self._current_user_indices = sampled_user_indices
        self._previous_movie_indices = self._current_movie_indices
        self._current_movie_indices = sampled_movie_indices
        

        batched_observations = {
            GLOBAL_KEY:
                combined_user_features,
            PER_ARM_KEY:
                current_movies,
        }
        return batched_observations
    

    def _apply_action(self, action):
        action = tf.expand_dims(action, axis=-1)
        chosen_arm_indices = tf.gather_nd(indices=action
                                          , params=self._current_movie_indices
                                          , batch_dims = 1)
        chosen_user_moves = tf.concat([self._current_user_indices
                                       , tf.expand_dims(chosen_arm_indices, axis=-1)]
                                      , axis=1)
        return tf.gather_nd(indices=chosen_user_moves, params=self._approx_ratings_matrix)

    def _rewards_for_all_actions(self):
        broadcasted_user = tf.broadcast_to(self._previous_user_indices
                                           , [BATCH_SIZE, NUM_ACTIONS]) #broadcast the user ID across all actions
        chosen_user_movies = tf.stack([broadcasted_user      
                                       , self._previous_movie_indices]
                                      , axis=2)
        rewards_matrix = tf.gather_nd(indices=chosen_user_movies
                                      , params=self._approx_ratings_matrix)
        return rewards_matrix

    def compute_optimal_action(self):
        optimal_actions = tf.argmax(self._rewards_for_all_actions()
                                    , axis=-1)
        return tf.cast(optimal_actions
                       , dtype=tf.int32) #needs casting

    def compute_optimal_reward(self):
        return np.max(self._rewards_for_all_actions()
                      , axis=-1)

In [31]:
env = MovieLensPerArmPyEnvironment()

In [32]:
print('observation spec: ', env.observation_spec())
print('\nAn observation: ', env.reset().observation)

observation spec:  {'global': ArraySpec(shape=(4,), dtype=dtype('float32'), name=None), 'per_arm': ArraySpec(shape=(100, 3), dtype=dtype('float32'), name=None)}

An observation:  {'global': <tf.Tensor: shape=(10, 4), dtype=float32, numpy=
array([[-4.4375014e-02,  3.0617092e-02,  2.0000000e+00,  1.9000000e+01],
       [-7.9905227e-02, -9.8162433e-03,  6.0000000e+00,  8.0000000e+00],
       [-2.2107666e-02, -3.0238582e-02,  3.0000000e+00,  0.0000000e+00],
       [-2.8882496e-02,  3.8735330e-02,  2.0000000e+00,  1.4000000e+01],
       [-6.4466796e-03,  1.6925792e-03,  3.0000000e+00,  6.0000000e+00],
       [-1.1804277e-02,  5.5616260e-02,  0.0000000e+00,  0.0000000e+00],
       [-1.6301041e-02,  3.9134230e-02,  6.0000000e+00,  0.0000000e+00],
       [-5.9202615e-02, -2.6512155e-02,  1.0000000e+00,  1.7000000e+01],
       [-1.6079381e-02, -1.7002936e-02,  3.0000000e+00,  1.9000000e+01],
       [-6.1323754e-02, -5.3644337e-02,  3.0000000e+00,  0.0000000e+00]],
      dtype=float32)>, 'per_ar

### Now that the environment is created, let's optimize

Taken from here
https://github.com/tensorflow/agents/blob/5e5915b0a3650a15e82e77af6e37f41a6c744689/tf_agents/bandits/agents/examples/v2/train_eval_movielens.py#L84

In [33]:
import functools
import os
from absl import app
from absl import flags

import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.bandits.agents import dropout_thompson_sampling_agent as dropout_ts_agent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent as eps_greedy_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import environment_utilities
from tf_agents.bandits.environments import movielens_per_arm_py_environment
from tf_agents.bandits.environments import movielens_py_environment
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network

BATCH_SIZE = 8
TRAINING_LOOPS = 20000
STEPS_PER_LOOP = 2

RANK_K = 20
NUM_ACTIONS = 20

# LinUCB agent constants.

AGENT_ALPHA = 10.0

# epsilon Greedy constants.

EPSILON = 0.05
LAYERS = (50, 50, 50)
LR = 0.005

# Dropout TS constants.
DROPOUT_RATE = 0.2

In [34]:
tf.compat.v1.enable_v2_behavior()

In [35]:
env = MovieLensPerArmPyEnvironment(
        rank_k=RANK_K,
        batch_size=BATCH_SIZE,
        num_actions=NUM_ACTIONS,
)
environment = tf_py_environment.TFPyEnvironment(env)

### Note we will be using the reward function with this utility function

```python
@gin.configurable
def compute_optimal_reward_with_movielens_environment(observation, environment):
  """Helper function for gin configurable Regret metric."""
  del observation
  return tf.py_function(environment.compute_optimal_reward, [], tf.float32)

@gin.configurable
def compute_optimal_action_with_movielens_environment(observation,
                                                      environment,
                                                      action_dtype=tf.int32):
  """Helper function for gin configurable SuboptimalArms metric."""
  del observation
  return tf.py_function(environment.compute_optimal_action, [], action_dtype)
```

In [36]:
optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)

optimal_action_fn = functools.partial(
  environment_utilities.compute_optimal_action_with_movielens_environment,
  environment=environment)

### Below we will try different agents by selecting one of the enumerated types:

```python
flags.DEFINE_enum(
    'agent', 'LinUCB', ['LinUCB', 'LinTS', 'epsGreedy', 'DropoutTS'],
    'Which agent to use. Possible values: `LinUCB`, `LinTS`, `epsGreedy`,'
    ' `DropoutTS`.')
```

In [37]:
AGENT_TYPE = 'LinUCB'

In [38]:
if AGENT_TYPE == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32,
        accepts_per_arm_features=True)

elif AGENT_TYPE == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32,
        accepts_per_arm_features=True)

elif AGENT_TYPE == 'epsGreedy':
    network = (
      global_and_arm_feature_network
      .create_feed_forward_dot_product_network(
          environment.time_step_spec().observation,
          global_layers=LAYERS,
          arm_layers=LAYERS))

    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        emit_policy_info='predicted_rewards_mean',
        info_fields_to_inherit_from_greedy=['predicted_rewards_mean'])

elif AGENT_TYPE == 'DropoutTS':
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()

    def dropout_fn():
        return tf.math.maximum(
          tf.math.reciprocal_no_nan(1.01 +
                                    tf.cast(train_step_counter, tf.float32)),
          0.0003)

    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=dropout_fn,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))

regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
  optimal_action_fn)

### Now train the MAB Agent

Create a local checkpoint folder if you already have not
!mkdir checkpoint

In [39]:
# !mkdir checkpoint

In [None]:
trainer.train(
      root_dir='checkpoint',
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])









INFO:tensorflow:Assets written to: checkpoint/policy_40400/assets


INFO:tensorflow:Assets written to: checkpoint/policy_40400/assets


INFO:tensorflow:Assets written to: checkpoint/policy_42000/assets


INFO:tensorflow:Assets written to: checkpoint/policy_42000/assets


#### Development work below on getting tensors to work for prior movies

In [None]:
p_mi = env._previous_movie_indices
p_ui = env._previous_user_indices

p_ui

In [None]:
p_mi

In [700]:
broadcasted_user = tf.broadcast_to(p_ui, [BATCH_SIZE, NUM_ACTIONS]) #broadcast the user ID across all actions
broadcasted_user

<tf.Tensor: shape=(8, 20), dtype=int32, numpy=
array([[ 46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,
         46,  46,  46,  46,  46,  46,  46],
       [891, 891, 891, 891, 891, 891, 891, 891, 891, 891, 891, 891, 891,
        891, 891, 891, 891, 891, 891, 891],
       [274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274, 274,
        274, 274, 274, 274, 274, 274, 274],
       [866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866,
        866, 866, 866, 866, 866, 866, 866],
       [938, 938, 938, 938, 938, 938, 938, 938, 938, 938, 938, 938, 938,
        938, 938, 938, 938, 938, 938, 938],
       [ 47,  47,  47,  47,  47,  47,  47,  47,  47,  47,  47,  47,  47,
         47,  47,  47,  47,  47,  47,  47],
       [117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117,
        117, 117, 117, 117, 117, 117, 117],
       [385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385, 385,
        385, 385, 385, 385, 385, 385, 385]], dtype=int32)>

In [719]:
chosen_user_movies = tf.stack([broadcasted_user      
                                       , p_mi]
                                      , axis=2)
# rewards_matrix = tf.gather_nd(indices=chosen_user_moves
#                               , params=self._approx_ratings_matrix)

In [720]:
chosen_user_movies

<tf.Tensor: shape=(8, 20, 2), dtype=int32, numpy=
array([[[  46, 1526],
        [  46, 1011],
        [  46, 1063],
        [  46, 1540],
        [  46,  115],
        [  46,  795],
        [  46, 1104],
        [  46,  758],
        [  46, 1555],
        [  46, 1071],
        [  46,  782],
        [  46, 1499],
        [  46,  161],
        [  46,  773],
        [  46,  864],
        [  46, 1589],
        [  46,  850],
        [  46,  953],
        [  46,  152],
        [  46, 1401]],

       [[ 891,  868],
        [ 891,  864],
        [ 891, 1510],
        [ 891, 1318],
        [ 891, 1065],
        [ 891, 1532],
        [ 891,  803],
        [ 891, 1015],
        [ 891, 1151],
        [ 891,  238],
        [ 891, 1073],
        [ 891, 1634],
        [ 891,  375],
        [ 891,  361],
        [ 891,  987],
        [ 891, 1636],
        [ 891,  301],
        [ 891,  855],
        [ 891,  543],
        [ 891,  372]],

       [[ 274, 1383],
        [ 274,  929],
        [ 274,   20],


In [721]:
rewards_matrix = tf.gather_nd(indices=chosen_user_moves
                               , params=env._approx_ratings_matrix)

In [722]:
rewards_matrix

<tf.Tensor: shape=(8, 20), dtype=float32, numpy=
array([[ 5.83164915e-02, -2.41056252e-02, -1.87867433e-02,
         8.62880517e-03,  7.30352163e-01,  2.27192324e-02,
         9.38408375e-02,  2.41663065e-02,  1.70041691e-03,
         2.13751364e-02, -1.24250874e-01,  9.84792318e-03,
         2.39062514e-02, -8.74508824e-03,  1.60256714e-01,
         1.60809308e-02,  3.14846598e-02, -4.05272655e-03,
         2.32983939e-03, -2.88842362e-04],
       [ 4.02831882e-02, -2.68716589e-02, -2.93028913e-02,
        -1.04498416e-01,  3.91238064e-01,  2.57909205e-02,
         1.31979123e-01,  8.55176270e-01, -8.76861140e-02,
         2.17240095e+00, -1.74399465e-01, -8.42698440e-02,
         2.27339536e-01,  7.18889013e-02,  5.99470213e-02,
        -8.42698440e-02,  5.59053957e-01, -1.64513901e-01,
         4.21740830e-01,  7.19132006e-01],
       [-3.26522440e-03,  5.83950162e-01,  5.90337574e-01,
        -1.97528660e-01, -1.32889107e-01,  4.70319875e-02,
         1.00843341e-03, -2.43588313e-0