# Preparing off-policy training data for RL

> "Off-policy" refers to the situation where, for a given data record and its observation, the policy currently being trained might not choose the same action as the one logged in that record

## Load env config

* use the prefix from `00-env-setup`

In [1]:
# Version tag and resource prefix; use the same prefix as in `00-env-setup`.
VERSION        = "v2"                       # TODO: set to your version tag
PREFIX         = f'rec-bandits-{VERSION}'   # TODO: set to your resource prefix

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


**run the next cell to populate env vars**

In [2]:
# staging GCS
# The shell capture returns the command's output lines; the first line is used as the project ID.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared environment config from the bucket, echo it, then execute it so that
# the variables it assigns become available in this kernel.
# NOTE(review): `exec` runs whatever is stored at that GCS path -- make sure only trusted
# principals can write to the bucket's `config/` folder.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid_vertex.movielens_ds_rec_bandits_v2"
BIGQUERY_TABLE_ID        = "hybrid_vertex.movielens_ds_rec_bandits_v2.training_dataset"

REPO

In [3]:
! gsutil ls $BUCKET_URI

gs://rec-bandits-v2-hybrid-vertex-bucket/acc-paf-v3/
gs://rec-bandits-v2-hybrid-vertex-bucket/baseline-perarm-local-v1/
gs://rec-bandits-v2-hybrid-vertex-bucket/config/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/
gs://rec-bandits-v2-hybrid-vertex-bucket/env-ranker-rec-bandits-v2/
gs://rec-bandits-v2-hybrid-vertex-bucket/local-ranker-rec-bandits-v2/
gs://rec-bandits-v2-hybrid-vertex-bucket/mab-local-classy-v3/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-acc-cuda-alloc/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-acc-input-data-v1/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-acc-local-one-device/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-acc-local-oned-thdc-4/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-acc-local-v1/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-acc-no-summaries/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-local-2a-v1/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-scale-mod-v1/
gs://rec-bandits-v2-hybrid-vertex-bucket/v2-scale-t4-docker/
gs://rec-bandits-v2-hybrid-verte

## imports

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [5]:
import numpy as np
import sys
import time
from pprint import pprint
from IPython import display
import matplotlib.pyplot as plt

import logging
logging.disable(logging.WARNING)

import tensorflow as tf
import tensorflow_datasets as tfds
# import tensorflow_recommenders as tfrs

# GPU
from numba import cuda
import gc

# google cloud
from google.cloud import aiplatform, storage

from src.per_arm_rl import data_utils as data_utils
from src.per_arm_rl import data_config # as data_config

In [6]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU,
# then display the detected devices.
gpus = tf.config.experimental.list_physical_devices('GPU')
for physical_gpu in gpus:
    tf.config.experimental.set_memory_growth(physical_gpu, True)

gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [8]:
# Reset the current CUDA device via numba, then run a garbage-collection pass.
# NOTE(review): TensorFlow has already enumerated the GPU in the cell above -- confirm that
# resetting the device here does not interfere with TensorFlow's use of it.
device = cuda.get_current_device()
device.reset()
gc.collect()

14

### Initialize GCP clients

In [9]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

### Create `data_utils.py`

> this will be used to support data processing throughout the development workflow

## Prepare Movielens dataset

### load data from Tensorflow Datasets

* see [TFDS documentation](https://www.tensorflow.org/datasets/catalog/movielens#movielens100k-ratings) for more details on this dataset, feature descriptions, and other versions

In [10]:
ratings = tfds.load("movielens/100k-ratings", split="train")

# Peek at one batched record to inspect the available features
for sample in ratings.batch(1).take(1):
    pprint(sample)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([46.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,

In [29]:
ratings

<_PrefetchDataset element_spec={'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'movie_genres': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'raw_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_gender': TensorSpec(shape=(), dtype=tf.bool, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_occupation_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None), 'user_zip_code': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [11]:
# Seed the shuffle and disable per-iteration reshuffling so that `take` and `skip`
# below operate on the same ordering.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80k shuffled records for training, the following 20k for validation
train = shuffled.take(80_000)
val = shuffled.skip(80_000).take(20_000)

### write dataset to TF Records

In [12]:
# TFRecord file names, one per split
TF_RECORD_FILE_full = "ml-ratings-100k-full.tfrecord"
TF_RECORD_FILE_train = "ml-ratings-100k-train.tfrecord"
TF_RECORD_FILE_val = "ml-ratings-100k-val.tfrecord"

# local paths (current working directory)
LOCAL_TF_RECORD_full = f"./{TF_RECORD_FILE_full}"
LOCAL_TF_RECORD_train = f"./{TF_RECORD_FILE_train}"
LOCAL_TF_RECORD_val = f"./{TF_RECORD_FILE_val}"

# GCS destinations
FULL_DATA_PATH = f"{DATA_PATH}"
TRAIN_DATA_PATH = f"{DATA_PATH}/train"
VAL_DATA_PATH = f"{DATA_PATH}/val"

print(f"FULL_DATA_PATH   : {FULL_DATA_PATH}")
print(f"TRAIN_DATA_PATH  : {TRAIN_DATA_PATH}")
print(f"VAL_DATA_PATH    : {VAL_DATA_PATH}")

FULL_DATA_PATH   : gs://rec-bandits-v2-hybrid-vertex-bucket/data
TRAIN_DATA_PATH  : gs://rec-bandits-v2-hybrid-vertex-bucket/data/train
VAL_DATA_PATH    : gs://rec-bandits-v2-hybrid-vertex-bucket/data/val


In [13]:
# Write to the same local path variable that the upload step reads from
data_utils.write_tfrecords(LOCAL_TF_RECORD_full, ratings, list_wise=False)

In [14]:
# Write to the same local path variable that the upload step reads from
data_utils.write_tfrecords(LOCAL_TF_RECORD_train, train, list_wise=False)

In [17]:
# Write to the same local path variable that the upload step reads from
data_utils.write_tfrecords(LOCAL_TF_RECORD_val, val, list_wise=False)

### save TF Records to GCS

In [18]:
# Copy each locally written TFRecord file to its GCS destination
# (full dataset, train split, validation split).

! gsutil -q cp $LOCAL_TF_RECORD_full $FULL_DATA_PATH/
! gsutil -q cp $LOCAL_TF_RECORD_train $TRAIN_DATA_PATH/
! gsutil -q cp $LOCAL_TF_RECORD_val $VAL_DATA_PATH/

In [19]:
! gsutil ls $DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/ml-ratings-100k-full.tfrecord
gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-val/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/val/


## validate TF Records

In [17]:
# Collect gs:// URIs of the TFRecord files stored directly under the data prefix.
# `delimiter='/'` keeps the listing at that level, so files inside sub-folders
# (e.g. train/, val/) are not returned.
train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/', delimiter='/'):
    if '.tfrecord' in blob.name:
        # Build the URI from bucket + object name. `public_url` percent-encodes the
        # object name, so string-rewriting it gives a wrong path for names containing
        # spaces or other special characters.
        train_files.append(f"gs://{blob.bucket.name}/{blob.name}")

train_files

['gs://rec-bandits-v2-hybrid-vertex-bucket/data/ml-ratings-100k-full.tfrecord']

In [18]:
train_dataset = tf.data.TFRecordDataset(train_files)
train_dataset

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [19]:
# Rebuild from the raw TFRecord files instead of re-mapping `train_dataset` in place,
# so re-running this cell does not try to parse records that were already parsed.
train_dataset = tf.data.TFRecordDataset(train_files).map(data_utils.parse_tfrecord)

# Peek at one parsed record
for x in train_dataset.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}


In [20]:
# Distinct rating values present in the dataset
rating_ds = train_dataset.map(lambda record: record["user_rating"])
unique_user_ratings = np.unique([rating.numpy() for rating in rating_ds])

unique_user_ratings

array([1., 2., 3., 4., 5.], dtype=float32)

# Generate look-up dicts

**TODO** - use a more TensorFlow-native method for generating vocabs and stats

### unique IDs

In [23]:
# Distinct movie IDs present in the dataset
movie_id_ds = train_dataset.map(lambda record: record["movie_id"])
unique_movie_ids = np.unique([movie_id.numpy() for movie_id in movie_id_ds])

MOVIELENS_NUM_MOVIES = len(unique_movie_ids)

print(f"len(unique_movie_ids) : {len(unique_movie_ids)}")
print(f"unique_movie_ids      : {unique_movie_ids[:2]}")

len(unique_movie_ids) : 1682
unique_movie_ids      : [b'1' b'10']


In [24]:
# Distinct user occupations present in the dataset
occupation_ds = train_dataset.map(lambda record: record["user_occupation_text"])
unique_occ_ids = np.unique([occupation.numpy() for occupation in occupation_ds])

NUM_OCCS = len(unique_occ_ids)

print(f"len(unique_occ_ids) : {len(unique_occ_ids)}")
print(f"unique_occ_ids      : {unique_occ_ids[:2]}")

len(unique_occ_ids) : 21
unique_occ_ids      : [b'administrator' b'artist']


In [25]:
# Distinct user IDs present in the dataset
user_id_ds = train_dataset.map(lambda record: record["user_id"])
unique_user_ids = np.unique([user_id.numpy() for user_id in user_id_ds])

MOVIELENS_NUM_USERS = len(unique_user_ids)

print(f"len(unique_user_ids) : {len(unique_user_ids)}")
print(f"unique_user_ids      : {unique_user_ids[:2]}")

len(unique_user_ids) : 943
unique_user_ids      : [b'1' b'10']


### lookup dictionaries

In [26]:
# Lookup from bucketized user age to an integer index, plus its cardinality
USER_AGE_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key='bucketized_user_age',
    dataset=train_dataset,
)
USER_AGE_DIM = len(USER_AGE_LOOKUP)

print(f"USER_AGE_DIM: {USER_AGE_DIM}")

USER_AGE_LOOKUP

USER_AGE_DIM: 7


{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [27]:
# Lookup from user occupation text to an integer index, plus its cardinality
USER_OCC_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key='user_occupation_text',
    dataset=train_dataset,
)
USER_OCC_DIM = len(USER_OCC_LOOKUP)

print(f"USER_OCC_DIM: {USER_OCC_DIM}")

# USER_OCC_LOOKUP

USER_OCC_DIM: 21


In [28]:
# Lookup from movie genre ID to an integer index, plus its cardinality
MOVIE_GEN_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key='movie_genres',
    dataset=train_dataset,
)
MOVIE_GEN_DIM = len(MOVIE_GEN_LOOKUP)

print(f"MOVIE_GEN_DIM: {MOVIE_GEN_DIM}")

# MOVIE_GEN_LOOKUP

MOVIE_GEN_DIM: 19


## create `data_config.py`

> write data config for subsequent notebooks

In [29]:
# Render the lookup tables and cardinalities as Python source, then write them
# to `data_config.py` so they can be imported as a module.
config = f'''
USER_AGE_LOOKUP       = {USER_AGE_LOOKUP}
USER_AGE_DIM          = {USER_AGE_DIM}

USER_OCC_LOOKUP       = {USER_OCC_LOOKUP}
USER_OCC_DIM          = {USER_OCC_DIM}

MOVIE_GEN_LOOKUP      = {MOVIE_GEN_LOOKUP}
MOVIE_GEN_DIM         = {MOVIE_GEN_DIM}

MOVIELENS_NUM_MOVIES  = {MOVIELENS_NUM_MOVIES}
MOVIELENS_NUM_USERS   = {MOVIELENS_NUM_USERS}
'''
# TODO - cleanup
data_config_path = f'{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/data_config.py'
with open(data_config_path, 'w') as config_file:
    config_file.write(config)

### Validate creating the ratings matrix

In [12]:
import importlib

from src.per_arm_rl import data_config as data_config

# `data_config` is already imported in the imports cell at the top of the notebook, so a
# plain import returns the cached module. Reload it to pick up the values in the
# `data_config.py` file as it currently exists on disk.
data_config = importlib.reload(data_config)

In [13]:
data_config.USER_AGE_LOOKUP

{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [14]:
data_config.USER_OCC_LOOKUP

{b'salesman': 0,
 b'programmer': 1,
 b'writer': 2,
 b'librarian': 3,
 b'marketing': 4,
 b'homemaker': 5,
 b'scientist': 6,
 b'entertainment': 7,
 b'engineer': 8,
 b'executive': 9,
 b'student': 10,
 b'technician': 11,
 b'none': 12,
 b'artist': 13,
 b'doctor': 14,
 b'lawyer': 15,
 b'retired': 16,
 b'administrator': 17,
 b'other': 18,
 b'educator': 19,
 b'healthcare': 20}

In [15]:
data_config.MOVIE_GEN_LOOKUP

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18}

**Test movielens rating matrix**

In [30]:
# Keep only the fields used below; `movie_genres` is reduced to its first entry.
local_data = train_dataset.map(
    lambda record: {
        'user_id': record['user_id'],
        'movie_id': record['movie_id'],
        'user_rating': record['user_rating'],
        'bucketized_user_age': record['bucketized_user_age'],
        'user_occupation_text': record['user_occupation_text'],
        'movie_genres': record['movie_genres'][0],
    }
)
local_data

<_MapDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None), 'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_genres': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [31]:
ratings_matrix = np.zeros([data_config.MOVIELENS_NUM_USERS, data_config.MOVIELENS_NUM_MOVIES])
ratings_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
user_age_int, user_occ_int, mov_gen_int = [], [], []

for row in local_data:
    # IDs are shifted by one to index the matrix
    # (presumably IDs are contiguous and 1-based -- verify against the dataset)
    user_idx = int(row['user_id'].numpy()) - 1
    movie_idx = int(row['movie_id'].numpy()) - 1
    ratings_matrix[user_idx, movie_idx] = float(row['user_rating'].numpy())

    # Translate each raw feature value to its lookup index (as a float, offset by .0001)
    age_key = row['bucketized_user_age'].numpy()
    user_age_int.append(float(data_config.USER_AGE_LOOKUP[age_key]) + .0001)

    occ_key = row['user_occupation_text'].numpy()
    user_occ_int.append(float(data_config.USER_OCC_LOOKUP[occ_key]) + .0001)

    genre_key = row['movie_genres'].numpy()
    mov_gen_int.append(float(data_config.MOVIE_GEN_LOOKUP[genre_key]) + .0001)

In [34]:
ratings_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [35]:
np.array(user_age_int)

array([2.0001, 6.0001, 3.0001, ..., 3.0001, 1.0001, 3.0001])

In [36]:
np.array(user_occ_int)

array([14.0001,  7.0001, 10.0001, ..., 10.0001,  2.0001, 10.0001])

In [37]:
np.array(mov_gen_int)

array([7.00010e+00, 4.00010e+00, 4.00010e+00, ..., 1.00001e+01,
       1.00000e-04, 4.00010e+00])

The commands above are executed when calling the `load_movielens_ratings()` function below:

In [21]:
# Same computation as the manual loop above, via the packaged helper
test_dataset_load = data_utils.load_movielens_ratings(
    ratings_dataset=train_dataset,
    num_users=data_config.MOVIELENS_NUM_USERS,
    num_movies=data_config.MOVIELENS_NUM_MOVIES,
    user_age_lookup_dict=data_config.USER_AGE_LOOKUP,
    user_occ_lookup_dict=data_config.USER_OCC_LOOKUP,
    movie_gen_lookup_dict=data_config.MOVIE_GEN_LOOKUP,
)

print(f"num outputs: {len(test_dataset_load)}")

test_dataset_load

num outputs: 4


(array([[5., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 0., ..., 0., 0., 0.]]),
 array([2.0001, 6.0001, 3.0001, ..., 3.0001, 1.0001, 3.0001]),
 array([14.0001,  7.0001, 10.0001, ..., 10.0001,  2.0001, 10.0001]),
 array([7.00010e+00, 4.00010e+00, 4.00010e+00, ..., 1.00001e+01,
        1.00000e-04, 4.00010e+00]))

In [22]:
ratings_matrix = test_dataset_load[0]
print(ratings_matrix.shape)
ratings_matrix

(943, 1682)


array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [23]:
user_age_int = test_dataset_load[1]
print(user_age_int[0])
user_age_int

2.0001


array([2.0001, 6.0001, 3.0001, ..., 3.0001, 1.0001, 3.0001])

In [36]:
user_occ_int = test_dataset_load[2]
print(user_occ_int)
user_occ_int

[ 6.0001 20.0001 10.0001 ... 10.0001 13.0001 10.0001]


array([ 6.0001, 20.0001, 10.0001, ..., 10.0001, 13.0001, 10.0001])

In [38]:
mov_gen_int = test_dataset_load[3]
print(mov_gen_int)
mov_gen_int

[7.00010e+00 4.00010e+00 4.00010e+00 ... 1.00001e+01 1.00000e-04
 4.00010e+00]


array([7.00010e+00, 4.00010e+00, 4.00010e+00, ..., 1.00001e+01,
       1.00000e-04, 4.00010e+00])

# Ranking Data (listwise)

**define how many items should be in each example's item list:**

In [27]:
NUM_EXAMPLES_PER_LIST = 5 # 3 | 5

**write this value to a config file for the `data_utils` functions**

In [None]:
# Persist the list size as an importable `utils_config.py` module
config = f'''
NUM_EXAMPLES_PER_LIST = {NUM_EXAMPLES_PER_LIST}
'''
# TODO - cleanup
utils_config_path = f'{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/utils_config.py'
with open(utils_config_path, 'w') as config_file:
    config_file.write(config)

In [28]:
import importlib

from src.per_arm_rl import utils_config as utils_config

# If `utils_config` was imported earlier in this kernel (e.g. by a previous run of this
# cell), a plain import returns the cached module. Reload it so the value shown below
# reflects the `utils_config.py` file currently on disk.
utils_config = importlib.reload(utils_config)

utils_config.NUM_EXAMPLES_PER_LIST

5

In [29]:
import array
import collections
import numpy as np
from typing import Dict, List, Optional, Text, Tuple

from pprint import pprint

from src.per_arm_rl import data_utils as data_utils

In [45]:
# train = shuffled.take(80_000)
# val = shuffled.skip(80_000).take(20_000)

In [13]:
# Peek at one record of the training split
for sample in train.batch(1).take(1):
    pprint(sample)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([35.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'898'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Postman, The (1997)'], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([44.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([885409515])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([False])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([14])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'marketing'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,
 'user_zip_cod

## Create train listwise ds

In [17]:
NUM_EXAMPLES_PER_LIST = 5    # 3 | 5

In [26]:
# Persist the list size as an importable `utils_config.py` module
config = f'''
NUM_EXAMPLES_PER_LIST = {NUM_EXAMPLES_PER_LIST}
'''
# TODO - cleanup
utils_config_path = f'{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/utils_config.py'
with open(utils_config_path, 'w') as config_file:
    config_file.write(config)

In [None]:
import importlib

from src.per_arm_rl import utils_config as utils_config

# `utils_config` may already be imported in this kernel, in which case a plain import
# returns the cached module. Reload it so the value shown below reflects the
# `utils_config.py` file currently on disk.
utils_config = importlib.reload(utils_config)

utils_config.NUM_EXAMPLES_PER_LIST

In [18]:
# Listwise training data: 50 lists are sampled per user, and each list holds
# NUM_EXAMPLES_PER_LIST (3 | 5) movies sampled from the movies that user rated.
train_lw = data_utils.create_listwise_ds(
    train, num_list_per_user=50, num_examples_per_list=NUM_EXAMPLES_PER_LIST, seed=42
)

In [19]:
for example in train_lw.skip(7).take(1):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([4, 7, 7, 2, 7])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'294', b'690', b'1176', b'538', b'310'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'681'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([5., 4., 4., 3., 3.], dtype=float32)>}


In [20]:
example['user_rating'].numpy()

array([5., 4., 4., 3., 3.], dtype=float32)

In [21]:
len(list(train_lw))

47150

### write TF records file

In [23]:
DATA_PATH

'gs://rec-bandits-v2-hybrid-vertex-bucket/data'

In [24]:
# train split
TF_RECORD_FILE_lw_train = f"ml-100k-listwise-{NUM_EXAMPLES_PER_LIST}n-train.tfrecord"
LOCAL_TF_RECORD_lw_train = f"./{TF_RECORD_FILE_lw_train}"

In [25]:
data_utils.write_tfrecords(LOCAL_TF_RECORD_lw_train, train_lw, list_wise=True)

#### validate TRAIN TF record file(s)

In [30]:
# test TF record local
tmp_lw_dataset = tf.data.TFRecordDataset(LOCAL_TF_RECORD_lw_train)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 7,  4,  7,  7, 10]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'898', b'294', b'258', b'1176', b'682']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[4., 5., 1., 4., 1.]], dtype=float32)>}


In [31]:
lw_train_prefix = f"{DATA_GCS_PREFIX}/listwise-{NUM_EXAMPLES_PER_LIST}n-train"
LW_TRAIN_DATA_PATH = f"{BUCKET_URI}/{lw_train_prefix}"

! gsutil -q cp $LOCAL_TF_RECORD_lw_train $LW_TRAIN_DATA_PATH/

In [32]:
# ! gsutil ls $LW_TRAIN_DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-5n-train/ml-100k-listwise-5n-train.tfrecord


In [33]:
# Re-read the listwise training file(s) from GCS and parse one record
test_lw_files = []

for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{lw_train_prefix}/', delimiter='/'):
    if '.tfrecord' in blob.name:
        # Build the URI from bucket + object name. `public_url` percent-encodes the
        # object name, so string-rewriting it gives a wrong path for names containing
        # spaces or other special characters.
        test_lw_files.append(f"gs://{blob.bucket.name}/{blob.name}")

tmp_lw_dataset = tf.data.TFRecordDataset(test_lw_files)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 7,  4,  7,  7, 10]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'898', b'294', b'258', b'1176', b'682']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[4., 5., 1., 4., 1.]], dtype=float32)>}


## Create val listwise ds

In [34]:
# Listwise validation data: a single list per user, same list size as training
val_lw = data_utils.create_listwise_ds(
    val, num_list_per_user=1, num_examples_per_list=NUM_EXAMPLES_PER_LIST, seed=42
)

In [35]:
for example in val_lw.take(3):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 3,  0,  0,  0, 19])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'94', b'245', b'403', b'50', b'470'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'346'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([3., 4., 3., 5., 3.], dtype=float32)>}
{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([7, 0, 0, 2, 7])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'678', b'127', b'343', b'1', b'125'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'602'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([4., 5., 2., 4., 4.], dtype=float32)>}
{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([4, 7, 4, 0, 4])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'168', b'1053', b'26', b'110', b'1048'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=st

In [36]:
len(list(val_lw))

836

### write TF records file

In [37]:
# val split
TF_RECORD_FILE_lw_val = f"ml-100k-listwise-{NUM_EXAMPLES_PER_LIST}n-val.tfrecord"
LOCAL_TF_RECORD_lw_val = f"./{TF_RECORD_FILE_lw_val}"

In [38]:
data_utils.write_tfrecords(LOCAL_TF_RECORD_lw_val, val_lw, list_wise=True)

#### validate VAL TF record file(s)

In [39]:
# test TF record local
tmp_lw_dataset = tf.data.TFRecordDataset(LOCAL_TF_RECORD_lw_val)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 3,  0,  0,  0, 19]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'94', b'245', b'403', b'50', b'470']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[3., 4., 3., 5., 3.]], dtype=float32)>}


In [40]:
lw_val_prefix = f"{DATA_GCS_PREFIX}/listwise-{NUM_EXAMPLES_PER_LIST}n-val"
LW_VAL_DATA_PATH = f"{BUCKET_URI}/{lw_val_prefix}"

! gsutil -q cp $LOCAL_TF_RECORD_lw_val $LW_VAL_DATA_PATH/

In [41]:
# ! gsutil ls $LW_VAL_DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-5n-val/ml-100k-listwise-5n-val.tfrecord


In [42]:
# Re-read the listwise validation file(s) from GCS and parse one record
test_lw_files = []

for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{lw_val_prefix}/', delimiter='/'):
    if '.tfrecord' in blob.name:
        # Build the URI from bucket + object name. `public_url` percent-encodes the
        # object name, so string-rewriting it gives a wrong path for names containing
        # spaces or other special characters.
        test_lw_files.append(f"gs://{blob.bucket.name}/{blob.name}")

tmp_lw_dataset = tf.data.TFRecordDataset(test_lw_files)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 3,  0,  0,  0, 19]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'94', b'245', b'403', b'50', b'470']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[3., 4., 3., 5., 3.]], dtype=float32)>}


### get uniques

In [71]:
# Distinct movie IDs across the full ratings dataset, gathered in batches of 1000
movies = ratings.map(lambda record: record["movie_id"])
movie_id_batches = list(movies.batch(1000))
unique_movie_ids = np.unique(np.concatenate(movie_id_batches))

len(unique_movie_ids)

1682

In [72]:
# Distinct user IDs across the full ratings dataset, gathered in batches of 1000
users = ratings.map(lambda record: record["user_id"])
user_id_batches = list(users.batch(1000))
unique_user_ids = np.unique(np.concatenate(user_id_batches))

len(unique_user_ids)

943

# Copy vocab file to GCS

In [74]:
!gsutil cp 02-perarm-features-bandit/vocab_dict.pkl gs://$BUCKET_NAME/$VOCAB_SUBDIR/$VOCAB_FILENAME

Copying file://02-perarm-features-bandit/vocab_dict.pkl [Content-Type=application/octet-stream]...
/ [1 files][142.9 KiB/142.9 KiB]                                                
Operation completed over 1 objects/142.9 KiB.                                    


In [75]:
!gsutil ls gs://$BUCKET_NAME/$VOCAB_SUBDIR

gs://rec-bandits-v2-hybrid-vertex-bucket/vocabs/vocab_dict.pkl


**Finished**