# Build per-arm Bandit model with TF-Agents and execute locally with Vertex AI

## background

**Arm features**
* In some bandit use cases, each arm has its own features. For example, in movie recommendation problems, the user features play the role of the context and the movies play the role of the arms (aka actions)
* Each movie has its own features, such as `text description`, `metadata`, `trailer content` features and so on

These problems are often referred to as `arm features problems`

In [1]:
# Print the installed Vertex AI SDK version. The `!` escape runs this in a shell,
# so it reports the version seen by the shell's `python3`
# (NOTE(review): that may differ from the kernel's interpreter — confirm).
! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

aiplatform SDK version: 1.33.1


## Load env config

* use the prefix from `00-env-setup`

In [2]:
# Naming prefix shared with the resources created in `00-env-setup`.
# TODO: set VERSION (and therefore PREFIX) to the values used there.
# (an earlier run used PREFIX = 'mabv1')
VERSION = "v2"
PREFIX = "rec-bandits-{}".format(VERSION)

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


**run the next cell to populate env vars**

In [3]:
# Resolve the active gcloud project; the `!cmd` form captures the command's
# output as a list of lines, and the first line is taken as the project id.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths (derived from PREFIX and the project id)
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Download the environment config written by the setup notebook, echo it, and
# execute it so its assignments become notebook globals. As the printed config
# shows, this re-assigns PROJECT_ID, PREFIX, BUCKET_NAME, BUCKET_URI, etc.
# NOTE(review): exec() runs whatever is stored at this GCS path inside the
# kernel — only use it with a bucket you control.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"
DATA_PATH_KFP_DEMO       = "gs://rec-bandits-v2-hybrid-vertex-bucket/data/kfp_demo_data/u.data"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NA

In [4]:
# ! gsutil ls $BUCKET_URI

## imports

In [5]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [6]:
import functools
import json
from collections import defaultdict
from typing import Callable, Dict, List, Optional, TypeVar
from datetime import datetime
import time
from pprint import pprint

import logging
logging.disable(logging.WARNING)

import matplotlib.pyplot as plt
import numpy as np

# google cloud
from google.cloud import aiplatform, storage

# tensorflow
import tensorflow as tf
from tf_agents.agents import TFAgent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import (environment_utilities,
                                            movielens_py_environment,
                                            movielens_per_arm_py_environment)
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import TFEnvironment, tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.metrics.tf_metric import TFStepMetric
from tf_agents.policies import policy_saver

# GPU
from numba import cuda 
import gc

# Fail fast unless a TensorFlow 2 release is installed, then declare the
# generic type variable.
if not tf.__version__.startswith("2"):
    raise Exception("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")

In [7]:
import sys
sys.path.append("..")

# my project
from src.per_arm_rl import train_utils
from src.per_arm_rl import data_utils
from src.per_arm_rl import data_config
from src.per_arm_rl import trainer_baseline

### detect GPUs & reset devices

In [8]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [9]:
# Reset the current CUDA device through numba, then run a garbage-collection pass.
# The reset is guarded so the notebook also runs on CPU-only machines, where
# asking numba for the current device raises instead of returning one.
if cuda.is_available():
    device = cuda.get_current_device()
    device.reset()
gc.collect()

0

### Initialize Google Cloud SDK Clients

In [10]:
# Vertex AI SDK: set the default project and location for later calls
aiplatform.init(
    location=LOCATION,
    project=PROJECT_ID,
)

# Cloud Storage client (used below to list the TFRecord files)
storage_client = storage.Client(project=PROJECT_ID)

# Data prep

### copy sample data to $DATA_PATH

### Load data config

In [11]:
# Sanity check: echo the lookup tables and sizes exposed by `data_config`
for config_name in (
    "USER_AGE_LOOKUP",
    "USER_AGE_DIM",
    "USER_OCC_LOOKUP",
    "USER_OCC_DIM",
    "MOVIE_GEN_LOOKUP",
    "MOVIE_GEN_DIM",
    "MOVIELENS_NUM_MOVIES",
    "MOVIELENS_NUM_USERS",
):
    print(f"{config_name}: {getattr(data_config, config_name)}")

USER_AGE_LOOKUP: {1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}
USER_AGE_DIM: 7
USER_OCC_LOOKUP: {b'salesman': 0, b'programmer': 1, b'writer': 2, b'librarian': 3, b'marketing': 4, b'homemaker': 5, b'scientist': 6, b'entertainment': 7, b'engineer': 8, b'executive': 9, b'student': 10, b'technician': 11, b'none': 12, b'artist': 13, b'doctor': 14, b'lawyer': 15, b'retired': 16, b'administrator': 17, b'other': 18, b'educator': 19, b'healthcare': 20}
USER_OCC_DIM: 21
MOVIE_GEN_LOOKUP: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18}
MOVIE_GEN_DIM: 19
MOVIELENS_NUM_MOVIES: 1682
MOVIELENS_NUM_USERS: 943


In [12]:
# type(data_config.USER_OCC_LOOKUP)

### Read TF Records

In [13]:
# tf.data options with the auto-shard policy set to AUTO.
# NOTE(review): no dataset in the visible cells calls `.with_options(options)`,
# so this object appears to have no effect here — confirm, then apply or remove.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [14]:
SPLIT = "train" # "train" | "val"  (not yet used by the listing below)

# Collect the gs:// URI of every TFRecord object directly under the data prefix
# (`delimiter="/"` keeps the listing from descending into sub-folders).
# The URI is assembled from the bucket and object names instead of rewriting
# `blob.public_url`: that property is an HTTPS URL whose object name is
# percent-encoded, so a string replace on it yields a wrong gs:// path for any
# object name containing characters that get escaped.
train_files = [
    f"gs://{blob.bucket.name}/{blob.name}"
    for blob in storage_client.list_blobs(
        BUCKET_NAME, prefix=f"{DATA_GCS_PREFIX}/", delimiter="/"
    )
    if ".tfrecord" in blob.name
]

train_files

['gs://rec-bandits-v2-hybrid-vertex-bucket/data/ml-ratings-100k-full.tfrecord']

In [15]:
# Read the serialized records from GCS, then parse each one with the project's
# TFRecord parser (the resulting element spec is shown in the cell output).
raw_records = tf.data.TFRecordDataset(train_files)
train_dataset = raw_records.map(data_utils.parse_tfrecord)

train_dataset

<_MapDataset element_spec={'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'movie_genres': TensorSpec(shape=(1,), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>

In [16]:
# Peek at the first two parsed records, each wrapped in a batch of size one
preview = train_dataset.batch(1).take(2)
for x in preview:
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}
{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([25.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[4]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'709'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([875654590])>,
 'user_id': <tf.Tensor: shape=(1

# Define RL modules

In [17]:
# !pwd

# Train RL modules

## set hyperparameters

In [46]:
# --- Training loop ---
BATCH_SIZE = 128          # training and prediction batch size
TRAINING_LOOPS = 110      # number of training iterations
STEPS_PER_LOOP = 2        # number of driver steps per training iteration

# --- MovieLens simulation environment ---
RANK_K = 20               # rank for matrix factorization in the MovieLens environment
NUM_ACTIONS = 10          # number of actions (movie items) to choose from
PER_ARM = True            # True -> use the per-arm version of the MovieLens environment

# --- LinUCB agent ---
TIKHONOV_WEIGHT = 0.001   # Tikhonov regularization weight
AGENT_ALPHA = 10.0        # exploration parameter that multiplies the confidence intervals

# Echo every setting so the values used for a run are recorded in the cell output
for hparam_name, hparam_value in (
    ("BATCH_SIZE", BATCH_SIZE),
    ("TRAINING_LOOPS", TRAINING_LOOPS),
    ("STEPS_PER_LOOP", STEPS_PER_LOOP),
    ("RANK_K", RANK_K),
    ("NUM_ACTIONS", NUM_ACTIONS),
    ("PER_ARM", PER_ARM),
    ("TIKHONOV_WEIGHT", TIKHONOV_WEIGHT),
    ("AGENT_ALPHA", AGENT_ALPHA),
):
    print(f"{hparam_name:<17}: {hparam_value}")

BATCH_SIZE       : 128
TRAINING_LOOPS   : 110
STEPS_PER_LOOP   : 2
RANK_K           : 20
NUM_ACTIONS      : 10
PER_ARM          : True
TIKHONOV_WEIGHT  : 0.001
AGENT_ALPHA      : 10.0


## Define RL environment

One can define a bandit environment by subclassing `BanditTFEnvironment`, or, similarly to RL environments, one can define a `BanditPyEnvironment` and wrap it with `TFPyEnvironment`

> See `src.per_arm_rl.my_per_arm_py_env.py` for an example custom environment which implements a per-arm version of the MovieLens environment

In [47]:
from src.per_arm_rl import my_per_arm_py_env as my_per_arm_py_env

In [48]:
data_config.MOVIELENS_NUM_USERS

943

In [49]:
# Build the custom per-arm MovieLens Python environment from the data location,
# the lookup tables in `data_config`, and the hyperparameters set above.
env = my_per_arm_py_env.MyMovieLensPerArmPyEnvironment(
    project_number=PROJECT_NUM,
    data_path=DATA_PATH,
    bucket_name=BUCKET_NAME,
    data_gcs_prefix=DATA_GCS_PREFIX,
    user_age_lookup_dict=data_config.USER_AGE_LOOKUP,
    user_occ_lookup_dict=data_config.USER_OCC_LOOKUP,
    movie_gen_lookup_dict=data_config.MOVIE_GEN_LOOKUP,
    num_users=data_config.MOVIELENS_NUM_USERS,
    num_movies=data_config.MOVIELENS_NUM_MOVIES,
    rank_k=RANK_K,
    batch_size=BATCH_SIZE,
    num_actions=NUM_ACTIONS,
)

# Wrap the Python environment so it can be used as a TF environment
environment = tf_py_environment.TFPyEnvironment(env)

In [50]:
env.movie_gen_lookup_dict

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18}

In [51]:
env.dataset

<_MapDataset element_spec={'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'movie_genres': TensorSpec(shape=(1,), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>

In [52]:
env._user_occ_int

array([14.0001,  7.0001, 10.0001, ..., 10.0001,  2.0001, 10.0001])

In [53]:
environment.observation_spec()

{'global': TensorSpec(shape=(22,), dtype=tf.float32, name=None),
 'per_arm': TensorSpec(shape=(10, 21), dtype=tf.float32, name=None)}

In [54]:
environment.time_step_spec()

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': {'global': TensorSpec(shape=(22,), dtype=tf.float32, name=None),
                 'per_arm': TensorSpec(shape=(10, 21), dtype=tf.float32, name=None)},
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

In [55]:
environment.action_spec()

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(9, dtype=int32))

## Define RL agent/algorithm

In [56]:
# LinUCB agent configured from the environment's specs and the agent
# hyperparameters; summaries are enabled on the agent.
agent = lin_ucb_agent.LinearUCBAgent(
    time_step_spec=environment.time_step_spec(),
    action_spec=environment.action_spec(),
    tikhonov_weight=TIKHONOV_WEIGHT,
    alpha=AGENT_ALPHA,
    dtype=tf.float32,
    accepts_per_arm_features=PER_ARM,
    summarize_grads_and_vars=True,
    enable_summaries=True,
)

# Show the specs the agent and the environment expose
for spec_heading, spec_value in (
    ("TimeStep Spec (for each batch):\n", agent.time_step_spec),
    ("Action Spec (for each batch)  :\n", agent.action_spec),
    ("Reward Spec (for each batch)  :\n", environment.reward_spec()),
):
    print(spec_heading, spec_value, "\n")

TimeStep Spec (for each batch):
 TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': DictWrapper({'global': TensorSpec(shape=(22,), dtype=tf.float32, name=None), 'per_arm': TensorSpec(shape=(10, 21), dtype=tf.float32, name=None)}),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}) 

Action Spec (for each batch)  :
 BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(9, dtype=int32)) 

Reward Spec (for each batch)  :
 TensorSpec(shape=(), dtype=tf.float32, name='reward') 



#### The flow of training data

First, let us have a look at the data specification in the agent. The `training_data_spec` attribute of the agent specifies what elements and structure the training data should have.

In [57]:
print('training data spec: ', agent.training_data_spec)

training data spec:  Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(9, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'global': TensorSpec(shape=(22,), dtype=tf.float32, name=None)}),
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=(), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=(), chosen_arm_features=TensorSpec(shape=(21,), dtype=tf.float32, name=None)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})


If we take a closer look at the `observation` part of the spec, we see that it does not contain per-arm features!

In [58]:
print('observation spec in training: ', agent.training_data_spec.observation)

observation spec in training:  {'global': TensorSpec(shape=(22,), dtype=tf.float32, name=None)}


What happened to the per-arm features? To answer this question, first we note that when the LinUCB agent trains, it does not need the per-arm features of all arms, it only needs those of the **chosen arm**. Hence, it makes sense to drop the tensor of shape `[BATCH_SIZE, NUM_ACTIONS, PER_ARM_DIM]`, as it is very wasteful, especially if the number of actions is large.

But still, the per-arm features of the chosen arm must be somewhere! To this end, we make sure that the LinUCB policy stores the features of the chosen arm within the `policy_info` field of the training data:

In [59]:
print('chosen arm features: ', agent.training_data_spec.policy_info.chosen_arm_features)

chosen arm features:  TensorSpec(shape=(21,), dtype=tf.float32, name=None)


## Define RL metrics

Bandits' most important metric is **regret**, calculated as the difference between the reward collected by the agent and the expected reward of an oracle policy that has access to the reward functions of the environment. The [RegretMetric](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/metrics/tf_metrics.py) thus needs a `baseline_reward_fn` function that calculates the best achievable expected reward given an observation. For our example, we need to take the maximum of the no-noise equivalents of the reward functions that we already defined for the environment.

In [60]:
# Baseline for the regret metric: the project's optimal-reward helper with this
# environment pre-bound.
optimal_reward_fn = functools.partial(
    train_utils.compute_optimal_reward_with_my_environment,
    environment=environment,
)
regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)

regret_metric

<tf_agents.bandits.metrics.tf_metrics.RegretMetric at 0x7f3bdf5feaa0>

In [61]:
# Baseline for the suboptimal-arms metric: the project's optimal-action helper
# with this environment pre-bound.
optimal_action_fn = functools.partial(
    train_utils.compute_optimal_action_with_my_environment, environment=environment
)
suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(optimal_action_fn)

suboptimal_arms_metric

<tf_agents.bandits.metrics.tf_metrics.SuboptimalArmsMetric at 0x7f3a4482f4f0>

In [62]:
metrics = [regret_metric, suboptimal_arms_metric]
metrics

[<tf_agents.bandits.metrics.tf_metrics.RegretMetric at 0x7f3bdf5feaa0>,
 <tf_agents.bandits.metrics.tf_metrics.SuboptimalArmsMetric at 0x7f3a4482f4f0>]

## Train (locally)

A **policy** in a bandit problem works the same way as in an RL problem: it provides an action (or a distribution of actions), given an observation as input.
* For more details, see the [TF-Agents Policy tutorial](https://github.com/tensorflow/agents/blob/master/docs/tutorials/3_policies_tutorial.ipynb).
* As with environments, there are two ways to construct a policy: One can create a `PyPolicy` and wrap it with `TFPyPolicy`, or directly create a `TFPolicy`

**Replay buffers**
* Reinforcement learning algorithms use `replay buffers` to store trajectories of experience when executing a policy in an environment.
* During training, replay buffers are queried for a subset of the trajectories (either a sequential subset or a sample) to "replay" the agent's experience.

### Define the training logic (on-policy training)

> The following function is the same as [trainer.train](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/agents/examples/v2/trainer.py#L130), but it keeps track of intermediate metric values and saves different artifacts to different locations. You can also directly invoke [trainer.train](https://github.com/tensorflow/agents/blob/master/tf_agents/bandits/agents/examples/v2/trainer.py#L130) which also trains the policy.

### set Vertex Experiment

In [63]:
EXPERIMENT_NAME = "baseline-perarm-local-v1"

# Each run is named after the moment this cell was executed
invoke_time = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME = f"run-{invoke_time}"

# Every output location of a run shares the same <bucket>/<experiment>/<run> root
run_root = f"{BUCKET_URI}/{EXPERIMENT_NAME}/{RUN_NAME}"
LOG_DIR = f"{run_root}/tb-logs"
ROOT_DIR = f"{run_root}/root"            # Root directory for writing logs/summaries/checkpoints.
ARTIFACTS_DIR = f"{run_root}/artifacts"  # Where the trained model will be saved and restored.

# Re-initialize the Vertex AI SDK, this time bound to the experiment
aiplatform.init(project=PROJECT_ID, location=REGION, experiment=EXPERIMENT_NAME)

for setting_name, setting_value in (
    ("EXPERIMENT_NAME", EXPERIMENT_NAME),
    ("RUN_NAME", RUN_NAME),
    ("LOG_DIR", LOG_DIR),
    ("ROOT_DIR", ROOT_DIR),
    ("ARTIFACTS_DIR", ARTIFACTS_DIR),
):
    print(f"{setting_name:<18}: {setting_value}")

EXPERIMENT_NAME   : baseline-perarm-local-v1
RUN_NAME          : run-20231107-150316
LOG_DIR           : gs://rec-bandits-v2-hybrid-vertex-bucket/baseline-perarm-local-v1/run-20231107-150316/tb-logs
ROOT_DIR          : gs://rec-bandits-v2-hybrid-vertex-bucket/baseline-perarm-local-v1/run-20231107-150316/root
ARTIFACTS_DIR     : gs://rec-bandits-v2-hybrid-vertex-bucket/baseline-perarm-local-v1/run-20231107-150316/artifacts


### train RL agent

> TODO: out-of-bounds index error

In [64]:
# Train the agent against the environment and time the whole call (wall clock).
# `root_dir` is intentionally not passed; logs and artifacts go to the two
# directories below.
start_time = time.time()

metric_results = trainer_baseline.train(
    agent=agent,
    environment=environment,
    training_loops=TRAINING_LOOPS,
    steps_per_loop=STEPS_PER_LOOP,
    additional_metrics=metrics,
    log_dir=LOG_DIR,
    artifact_dir=ARTIFACTS_DIR,
    save_policy=True,
)

end_time = time.time()

# Whole minutes only; the fractional part is truncated
elapsed_seconds = end_time - start_time
runtime_mins = int(elapsed_seconds / 60)
print(f"train runtime_mins: {runtime_mins}")

step = 0: train loss = 0.5099999904632568
step = 10: train loss = 1.0199999809265137
step = 20: train loss = 0.9800000190734863
step = 30: train loss = 1.0099999904632568
step = 40: train loss = 0.949999988079071
step = 50: train loss = 0.8399999737739563
step = 60: train loss = 0.7699999809265137
step = 70: train loss = 0.8100000023841858
step = 80: train loss = 0.8799999952316284
step = 90: train loss = 1.1399999856948853
step = 100: train loss = 1.0199999809265137
train runtime_mins: 6


### TensorBoard

In [65]:
# %load_ext tensorboard
%reload_ext tensorboard

In [66]:
%tensorboard --logdir=$LOG_DIR

### load trained policy

In [79]:
# !gsutil ls $ARTIFACTS_DIR

In [70]:
# trained_policy = tf.saved_model.load(ARTIFACTS_DIR)
# trained_policy

# Debugging notes

In [73]:
# Rebuild one batch of user features by hand: rows of the environment's `_u_hat`
# matrix for randomly drawn user indices (presumably the per-user factors —
# confirm in my_per_arm_py_env), followed by a placeholder age column and a
# placeholder occupation column of ones.
user_factors = environment._u_hat

SAMPLED_USER_INDICES = np.random.randint(data_config.MOVIELENS_NUM_USERS, size=BATCH_SIZE)

sampled_user_ages = np.ones(BATCH_SIZE)
sampled_user_occ = np.ones(BATCH_SIZE)

# column_stack turns the two 1-D placeholder vectors into single columns and
# appends them to the right of the sampled rows
combined_user_features = np.column_stack(
    (user_factors[SAMPLED_USER_INDICES], sampled_user_ages, sampled_user_occ)
)

In [74]:
SAMPLED_USER_INDICES

array([161, 765, 664, 855, 878,  14,  56, 321, 887, 754, 884,  69, 473,
        56, 136, 241, 758, 513, 319,  12, 132, 382, 684, 857, 675, 527,
       569,  46, 129, 424, 179, 392, 210, 313, 759, 244, 458, 467, 820,
       185, 169, 287, 718, 889, 426, 241, 549,  99, 704, 110, 287, 438,
       837, 336, 586, 290,  84, 475, 887,  21, 586,  97, 258,  22, 343,
       916, 562,  34, 741, 266, 557, 514, 362,  53, 730, 532,  99, 852,
       571, 940, 392, 762, 443, 278, 666, 202,  59, 481, 198, 262, 892,
       690, 790, 746, 761, 749, 308,  37,  80, 370, 781, 136, 217, 646,
       137, 827, 183, 366, 433, 874, 560, 839, 293, 689, 461, 113, 297,
       476, 660, 438, 562, 344, 126, 760, 481, 857, 414, 728])

In [75]:
sampled_u_hats_pred = environment._u_hat[SAMPLED_USER_INDICES]
sampled_u_hats_pred

array([[-0.01291269, -0.02463202, -0.00912144, ..., -0.02481789,
         0.00289364,  0.01819377],
       [-0.04131135,  0.06962406, -0.00452757, ..., -0.02675546,
        -0.02063719, -0.02562911],
       [-0.03796224, -0.03537715, -0.00614115, ...,  0.0305048 ,
        -0.03006611, -0.01280255],
       ...,
       [-0.00444794, -0.01411317,  0.01979582, ...,  0.02230501,
         0.01115231, -0.00133995],
       [-0.00854008,  0.00115751,  0.00909269, ...,  0.02653135,
         0.00677221, -0.00530296],
       [-0.00273581, -0.0172236 ,  0.01085442, ...,  0.00161616,
         0.01074529,  0.0073932 ]], dtype=float32)

In [76]:
environment._u_hat

array([[-0.0658043 ,  0.00597506, -0.00613256, ...,  0.01743993,
        -0.03573092,  0.03894605],
       [-0.01402104, -0.04662602,  0.05257856, ...,  0.01171281,
         0.01670101, -0.01658576],
       [-0.00565798, -0.02561845,  0.02336183, ..., -0.01843041,
        -0.02539826, -0.0020144 ],
       ...,
       [-0.00744452, -0.02502129,  0.00616532, ..., -0.03812664,
         0.03884653,  0.01103627],
       [-0.02403119,  0.00809611,  0.02288736, ..., -0.0200555 ,
         0.00571077, -0.00570774],
       [-0.04224209, -0.01092715, -0.05854604, ...,  0.02310407,
        -0.03948996, -0.0340745 ]], dtype=float32)

In [78]:
# # user_ids_pred = np.array([1,2,3,4,5,6,7])
# user_age_pred = np.array([1,1,1,1,1,1,1,1])
# user_occ_pred = np.array([1,1,1,1,1,1,1,1])

# combined_user_features = np.concatenate(
#     (
#         # user_ids_pred
#         sampled_u_hats_pred
#         , user_age_pred.reshape(-1,1)
#         , user_occ_pred.reshape(-1,1)
#     )
#     , axis=1
# )

# combined_user_features.shape