# Preparing off-policy training data for RL

> "Off-policy" refers to the situation where, for a given data record, the policy currently being trained might not choose the same action for that record's observation as the action logged in the record.

## Load env config

* use the prefix from `00-env-setup`

In [1]:
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


**run the next cell to populate env vars**

In [2]:
# staging GCS
# `!gcloud ...` captures the command's output lines; the first line is used as the project ID.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the variable definitions stored in the bucket, print them, then execute
# them in this kernel so the names they assign become notebook globals.
# NOTE(review): exec() runs whatever the bucket object contains, so anyone with
# write access to {BUCKET_URI}/config/notebook_env.py can run code in this kernel.
# NOTE(review): `config` is rebound to a different string later in this notebook
# (the data_config.py template).
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "hybrid_vertex.movielens_ds_rec_bandits_v2"
BIGQUERY_TABLE_ID        = "hybrid_vertex.movielens_ds_rec_bandits_v2.training_dataset"

REPO

In [3]:
! gsutil ls $BUCKET_URI

gs://rec-bandits-v2-hybrid-vertex-bucket/config/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/


## imports

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [6]:
import numpy as np
import sys
import time
from pprint import pprint
from IPython import display
import matplotlib.pyplot as plt

import logging
logging.disable(logging.WARNING)

import tensorflow as tf
import tensorflow_datasets as tfds
# import tensorflow_recommenders as tfrs

# GPU
from numba import cuda
import gc

# google cloud
from google.cloud import aiplatform, storage

from src.per_arm_rl import data_utils as data_utils
from src.per_arm_rl import data_config # as data_config

In [7]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving all of it
# up front. `tf.config.list_physical_devices` is the stable name for the
# `experimental` alias used previously.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [9]:
device = cuda.get_current_device()
device.reset()
gc.collect()

14

### Initialize GCP clients

In [10]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

### Create `data_utils.py`

> this will be used to support data processing throughout the development workflow

## Prepare Movielens dataset

### load data from Tensorflow Datasets

* see [TFDS documentation](https://www.tensorflow.org/datasets/catalog/movielens#movielens100k-ratings) for more details on this dataset, feature descriptions, and other versions

In [10]:
ratings = tfds.load("movielens/100k-ratings", split="train")

for x in ratings.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([46.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,

In [11]:
# Seed both TF's global RNG and the shuffle op so the split below is repeatable.
tf.random.set_seed(42)
# reshuffle_each_iteration=False keeps one fixed shuffled order; take()/skip()
# below rely on that order being the same every time the dataset is iterated,
# otherwise train and val would not be stable, disjoint slices.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# first 80,000 shuffled records -> train; the following 20,000 -> val
train = shuffled.take(80_000)
val = shuffled.skip(80_000).take(20_000)

### write dataset to TF Records

In [12]:
# full dataset
TF_RECORD_FILE_full = "ml-ratings-100k-full.tfrecord"
LOCAL_TF_RECORD_full = f"./{TF_RECORD_FILE_full}"

# train split
TF_RECORD_FILE_train = "ml-ratings-100k-train.tfrecord"
LOCAL_TF_RECORD_train = f"./{TF_RECORD_FILE_train}"

# val split
TF_RECORD_FILE_val = "ml-ratings-100k-val.tfrecord"
LOCAL_TF_RECORD_val = f"./{TF_RECORD_FILE_val}"

# paths
FULL_DATA_PATH = f"{DATA_PATH}"
TRAIN_DATA_PATH = f"{DATA_PATH}/train"
VAL_DATA_PATH = f"{DATA_PATH}/val"

print(f"FULL_DATA_PATH   : {FULL_DATA_PATH}")
print(f"TRAIN_DATA_PATH  : {TRAIN_DATA_PATH}")
print(f"VAL_DATA_PATH    : {VAL_DATA_PATH}")

FULL_DATA_PATH   : gs://rec-bandits-v2-hybrid-vertex-bucket/data
TRAIN_DATA_PATH  : gs://rec-bandits-v2-hybrid-vertex-bucket/data/train
VAL_DATA_PATH    : gs://rec-bandits-v2-hybrid-vertex-bucket/data/val


In [13]:
data_utils.write_tfrecords(TF_RECORD_FILE_full, ratings, list_wise=False)

In [14]:
data_utils.write_tfrecords(TF_RECORD_FILE_train, train, list_wise=False)

In [15]:
data_utils.write_tfrecords(TF_RECORD_FILE_val, val, list_wise=False)

### save TF Records to GCS

In [16]:
# Copy the three local TFRecord files (full / train / val) to their GCS folders.
# `-q` suppresses gsutil's progress output.

! gsutil -q cp $LOCAL_TF_RECORD_full $FULL_DATA_PATH/
! gsutil -q cp $LOCAL_TF_RECORD_train $TRAIN_DATA_PATH/
! gsutil -q cp $LOCAL_TF_RECORD_val $VAL_DATA_PATH/

In [17]:
! gsutil ls $DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/ml-ratings-100k-full.tfrecord
gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-val/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/val/


## validate TF Records

In [18]:
# Collect gs:// URIs of the TFRecord files sitting directly under the data
# prefix; delimiter='/' keeps the listing from descending into sub-folders.
# NOTE: despite the name, this picks up the full-dataset file, not the train split.
train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/', delimiter='/'):
    if '.tfrecord' in blob.name:
        # Build the URI from bucket + object name. `blob.public_url` percent-encodes
        # the object name, so rewriting its host to gs:// can produce a wrong path.
        train_files.append(f"gs://{blob.bucket.name}/{blob.name}")

train_files

['gs://rec-bandits-v2-hybrid-vertex-bucket/data/ml-ratings-100k-full.tfrecord']

In [19]:
train_dataset = tf.data.TFRecordDataset(train_files)

train_dataset

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [20]:
train_dataset = train_dataset.map(data_utils.parse_tfrecord)

for x in train_dataset.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}


## Generate look-up dicts

**TODO** - use a more TensorFlow-native method for generating vocabs and stats

### unique IDs

In [21]:
# Get the unique movie IDs.
# NOTE: `train_dataset` here was read from the full-dataset TFRecord file.
unique_movie_ids = train_dataset.map(lambda x: x["movie_id"])

# Pull every ID into Python (one .numpy() call per record), then de-duplicate.
unique_movie_ids = np.unique([x.numpy() for x in unique_movie_ids])

# Count of distinct movies; written into data_config.py further down.
MOVIELENS_NUM_MOVIES = len(unique_movie_ids)

print(f"len(unique_movie_ids) : {len(unique_movie_ids)}")
print(f"unique_movie_ids      : {unique_movie_ids[:2]}")

len(unique_movie_ids) : 1682
unique_movie_ids      : [b'1' b'10']


In [22]:
# Get the unique user occupation strings.
# NOTE: `train_dataset` here was read from the full-dataset TFRecord file.
unique_occ_ids = train_dataset.map(lambda x: x["user_occupation_text"])

# Pull every value into Python (one .numpy() call per record), then de-duplicate.
unique_occ_ids = np.unique([x.numpy() for x in unique_occ_ids])

NUM_OCCS = len(unique_occ_ids)

print(f"len(unique_occ_ids) : {len(unique_occ_ids)}")
print(f"unique_occ_ids      : {unique_occ_ids[:2]}")

len(unique_occ_ids) : 21
unique_occ_ids      : [b'administrator' b'artist']


In [23]:
# Get the unique user IDs.
# NOTE: `train_dataset` here was read from the full-dataset TFRecord file.
unique_user_ids = train_dataset.map(lambda x: x["user_id"])

# Pull every ID into Python (one .numpy() call per record), then de-duplicate.
unique_user_ids = np.unique([x.numpy() for x in unique_user_ids])

# Count of distinct users; written into data_config.py further down.
MOVIELENS_NUM_USERS = len(unique_user_ids)

print(f"len(unique_user_ids) : {len(unique_user_ids)}")
print(f"unique_user_ids      : {unique_user_ids[:2]}")

len(unique_user_ids) : 943
unique_user_ids      : [b'1' b'10']


### lookup dictionaries

In [24]:
USER_AGE_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'bucketized_user_age'
    , dataset= train_dataset
)

USER_AGE_DIM = len(USER_AGE_LOOKUP)
print(f"USER_AGE_DIM: {USER_AGE_DIM}")

USER_AGE_LOOKUP

USER_AGE_DIM: 7


{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [25]:
USER_OCC_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'user_occupation_text'
    , dataset= train_dataset
)
USER_OCC_DIM = len(USER_OCC_LOOKUP)
print(f"USER_OCC_DIM: {USER_OCC_DIM}")

# USER_OCC_LOOKUP

USER_OCC_DIM: 21


In [26]:
MOVIE_GEN_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'movie_genres'
    , dataset= train_dataset
)
MOVIE_GEN_DIM = len(MOVIE_GEN_LOOKUP)
print(f"MOVIE_GEN_DIM: {MOVIE_GEN_DIM}")

# MOVIE_GEN_LOOKUP

MOVIE_GEN_DIM: 19


## create `data_config.py`

> write data config for subsequent notebooks

In [27]:
# Render the lookup dicts and dimension constants as Python source text.
# Each value is interpolated with its default string form, so the dicts are
# written out as dict literals (bytes keys appear as b'...').
# NOTE(review): this rebinds `config`, which earlier held the env-config output.
config = f'''
USER_AGE_LOOKUP       = {USER_AGE_LOOKUP}
USER_AGE_DIM          = {USER_AGE_DIM}

USER_OCC_LOOKUP       = {USER_OCC_LOOKUP}
USER_OCC_DIM          = {USER_OCC_DIM}

MOVIE_GEN_LOOKUP      = {MOVIE_GEN_LOOKUP}
MOVIE_GEN_DIM         = {MOVIE_GEN_DIM}

MOVIELENS_NUM_MOVIES  = {MOVIELENS_NUM_MOVIES}
MOVIELENS_NUM_USERS   = {MOVIELENS_NUM_USERS}
'''
# TODO - cleanup
# Overwrites data_config.py in the repo's RL sub-directory.
# NOTE(review): REPO_DOCKER_PATH_PREFIX and RL_SUB_DIR are not assigned in this
# notebook's visible cells - presumably they come from the exec'd env config; confirm.
with open(f'{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/data_config.py', 'w') as f:
    f.write(config)

### Validate creating the ratings matrix

In [28]:
import importlib

from src.per_arm_rl import data_config as data_config

# `data_config` was already imported in the imports cell, before data_config.py
# was rewritten above. A plain `import` would hand back that cached module, so
# reload it to pick up the values that were just written to disk.
data_config = importlib.reload(data_config)

In [29]:
data_config.USER_AGE_LOOKUP

{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [30]:
data_config.USER_OCC_LOOKUP

{b'healthcare': 0,
 b'engineer': 1,
 b'retired': 2,
 b'administrator': 3,
 b'homemaker': 4,
 b'scientist': 5,
 b'marketing': 6,
 b'educator': 7,
 b'other': 8,
 b'writer': 9,
 b'technician': 10,
 b'salesman': 11,
 b'entertainment': 12,
 b'librarian': 13,
 b'programmer': 14,
 b'doctor': 15,
 b'artist': 16,
 b'executive': 17,
 b'lawyer': 18,
 b'none': 19,
 b'student': 20}

In [31]:
data_config.MOVIE_GEN_LOOKUP

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18}

In [32]:
test_dataset_load = data_utils.load_movielens_ratings(
    ratings_dataset = train_dataset
    , num_users = data_config.MOVIELENS_NUM_USERS
    , num_movies = data_config.MOVIELENS_NUM_MOVIES
    , user_age_lookup_dict = data_config.USER_AGE_LOOKUP
    , user_occ_lookup_dict = data_config.USER_OCC_LOOKUP
    , movie_gen_lookup_dict = data_config.MOVIE_GEN_LOOKUP
)

test_dataset_load

(array([[5., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 0., ..., 0., 0., 0.]]),
 array([2.0001, 6.0001, 3.0001, ..., 3.0001, 1.0001, 3.0001]),
 array([15.0001, 12.0001, 20.0001, ..., 20.0001,  9.0001, 20.0001]),
 array([7.00010e+00, 4.00010e+00, 4.00010e+00, ..., 1.00001e+01,
        1.00000e-04, 4.00010e+00]))

In [68]:
len(test_dataset_load)

4

In [33]:
ratings_matrix = test_dataset_load[0]
print(ratings_matrix.shape)
ratings_matrix

(943, 1682)


array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

### validate TF Records

In [34]:
! gsutil ls $DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/ml-ratings-100k-full.tfrecord
gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-val/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/val/


In [35]:
## validate

# Collect gs:// URIs of the TFRecord files under the train folder. The trailing
# slash on the prefix restricts the match to that folder, so sibling prefixes
# that merely start with "train" are not picked up.
train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/train/'):
    if '.tfrecord' in blob.name:
        # Build the URI from bucket + object name. `blob.public_url` percent-encodes
        # the object name, so rewriting its host to gs:// can produce a wrong path.
        train_files.append(f"gs://{blob.bucket.name}/{blob.name}")

train_files

['gs://rec-bandits-v2-hybrid-vertex-bucket/data/train/ml-ratings-100k-train.tfrecord']

In [36]:
train_dataset = tf.data.TFRecordDataset(train_files)

train_dataset = train_dataset.map(data_utils.parse_tfrecord)

for x in train_dataset.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([35.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'898'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([885409515])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'marketing'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}


In [37]:
# Get the distinct rating values present in the train split.
unique_user_ratings = train_dataset.map(lambda x: x["user_rating"])

# Pull every rating into Python (one .numpy() call per record), then de-duplicate.
unique_user_ratings = np.unique([x.numpy() for x in unique_user_ratings])

unique_user_ratings

array([1., 2., 3., 4., 5.], dtype=float32)

# Ranking Data (listwise)

In [39]:
import array
import collections
import numpy as np
from typing import Dict, List, Optional, Text, Tuple

from pprint import pprint

from src.per_arm_rl import data_utils_v12 as data_utils

In [40]:
# Re-derive the raw (un-serialized) splits from `shuffled`:
# first 80,000 records -> train, following 20,000 -> val.
train = shuffled.take(80_000)
val = shuffled.skip(80_000).take(20_000)
# tmp_test = shuffled.skip(80_000).take(1000)

In [41]:
for x in train.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([35.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'898'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Postman, The (1997)'], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([44.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([885409515])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([False])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([14])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'marketing'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,
 'user_zip_cod

### helper functions

In [42]:
def _create_feature_dict() -> Dict[Text, List[tf.Tensor]]:
    """Build a fresh accumulator with one empty list per tracked feature.

    Intended as a ``defaultdict`` factory: every call returns a new dict with
    its own new lists, so accumulators are never shared between keys.
    """
    feature_names = ("movie_id", "movie_genres", "user_rating")
    return {name: [] for name in feature_names}

In [43]:
def _sample_list(
    feature_lists: Dict[Text, List[tf.Tensor]],
    num_examples_per_list: int,
    random_state: Optional[np.random.RandomState] = None,
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """Sample one list example from the given per-feature lists.

    The same randomly chosen positions are used for every feature, so the
    i-th movie id, genre and rating in the result belong to the same record.

    Args:
        feature_lists: dict with keys "movie_id", "movie_genres" and
            "user_rating"; positions are drawn from the length of "movie_id".
        num_examples_per_list: how many positions to draw, without replacement.
        random_state: RNG to draw from; a new unseeded ``RandomState`` is
            created when omitted.

    Returns:
        ``(movie_ids, genres, ratings)``, each the sampled entries stacked
        along axis 0.

    Raises:
        ValueError: raised by ``RandomState.choice`` when
            ``num_examples_per_list`` exceeds the number of available entries.
    """
    if random_state is None:
        random_state = np.random.RandomState()

    sampled_indices = random_state.choice(
        range(len(feature_lists["movie_id"])),
        size=num_examples_per_list,
        replace=False,
    )

    def _gather(key: Text) -> tf.Tensor:
        # Pick the sampled positions of one feature and stack them into a tensor.
        return tf.stack([feature_lists[key][idx] for idx in sampled_indices], 0)

    return (
        _gather("movie_id"),
        _gather("movie_genres"),
        _gather("user_rating"),
    )

In [44]:
def create_listwise_ds(
    rating_dataset: tf.data.Dataset,
    num_list_per_user: int = 10,
    num_examples_per_list: int = 10,
    seed: Optional[int] = None,
) -> tf.data.Dataset:
    """Convert a per-rating MovieLens dataset into a listwise dataset.

    All ratings are first grouped by user (the input dataset is iterated
    eagerly in Python). For each user with enough ratings,
    ``num_list_per_user`` lists are then sampled, each containing
    ``num_examples_per_list`` of that user's rated movies.

    Args:
        rating_dataset: dataset whose elements provide "user_id", "movie_id",
            "movie_genres" and "user_rating".
        num_list_per_user: number of lists to sample per user.
        num_examples_per_list: number of movies in each sampled list.
        seed: seed for the ``np.random.RandomState`` used for sampling.

    Returns:
        A dataset built with ``from_tensor_slices`` over the keys "user_id",
        "movie_id", "movie_genres" and "user_rating", one element per
        sampled list.
    """
    random_state = np.random.RandomState(seed)

    # Group every record's features under its user id.
    example_lists_by_user = collections.defaultdict(_create_feature_dict)
    for example in rating_dataset:
        user_features = example_lists_by_user[example["user_id"].numpy()]
        user_features["movie_id"].append(example["movie_id"])
        # Only the first genre entry of each record is kept.
        user_features["movie_genres"].append(example["movie_genres"][0])
        user_features["user_rating"].append(example["user_rating"])

    tensor_slices = {"user_id": [], "movie_id": [], "movie_genres": [], "user_rating": []}

    for user_id, feature_lists in example_lists_by_user.items():
        # Drop the user if they don't have enough ratings. The check does not
        # depend on the inner loop, so it is done once per user.
        if len(feature_lists["movie_id"]) < num_examples_per_list:
            continue

        for _ in range(num_list_per_user):
            sampled_movie_ids, sampled_genres, sampled_ratings = _sample_list(
                feature_lists,
                num_examples_per_list,
                random_state=random_state,
            )

            tensor_slices["user_id"].append(user_id)
            tensor_slices["movie_id"].append(sampled_movie_ids)
            tensor_slices["movie_genres"].append(sampled_genres)
            tensor_slices["user_rating"].append(sampled_ratings)

    return tf.data.Dataset.from_tensor_slices(tensor_slices)

## Create train listwise ds

In [45]:
# from src.per_arm_rl import data_utils_v12 as data_utils

NUM_LIST_PER_USER = 50
NUM_EXAMPLES_PER_LIST = 3 # 3 | 5

In [46]:
# Sample NUM_LIST_PER_USER lists per user for the training data; each list
# holds NUM_EXAMPLES_PER_LIST movies drawn from the movies that user rated.
# The fixed seed makes the sampling repeatable.
train_lw = create_listwise_ds(
    train,
    num_list_per_user=NUM_LIST_PER_USER,
    num_examples_per_list=NUM_EXAMPLES_PER_LIST,
    seed=42
)

In [47]:
for example in train_lw.skip(7).take(1):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([4, 7, 7])>,
 'movie_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'294', b'690', b'1176'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'681'>,
 'user_rating': <tf.Tensor: shape=(3,), dtype=float32, numpy=array([5., 4., 4.], dtype=float32)>}


In [48]:
example['user_rating'].numpy()

array([5., 4., 4.], dtype=float32)

In [49]:
len(list(train_lw))

47150

### write TF records file

In [50]:
DATA_PATH

'gs://rec-bandits-v2-hybrid-vertex-bucket/data'

In [51]:
# train split
TF_RECORD_FILE_lw_train = f"ml-100k-listwise-{NUM_EXAMPLES_PER_LIST}n-train.tfrecord"
LOCAL_TF_RECORD_lw_train = f"./{TF_RECORD_FILE_lw_train}"

In [52]:
data_utils.write_tfrecords(LOCAL_TF_RECORD_lw_train, train_lw, list_wise=True)

#### validate TRAIN TF record file(s)

In [53]:
# test TF record local
tmp_lw_dataset = tf.data.TFRecordDataset(LOCAL_TF_RECORD_lw_train)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[7, 4, 7]])>,
 'movie_id': <tf.Tensor: shape=(1, 3), dtype=string, numpy=array([[b'898', b'294', b'258']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[4., 5., 1.]], dtype=float32)>}


In [54]:
lw_train_prefix = f"{DATA_GCS_PREFIX}/listwise-{NUM_EXAMPLES_PER_LIST}n-train"
LW_TRAIN_DATA_PATH = f"{BUCKET_URI}/{lw_train_prefix}"

! gsutil -q cp $LOCAL_TF_RECORD_lw_train $LW_TRAIN_DATA_PATH/

In [55]:
! gsutil ls $LW_TRAIN_DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-train/ml-100k-listwise-3n-train.tfrecord


In [56]:
# Read the listwise TRAIN TFRecord file(s) back from GCS and show one parsed record.
test_lw_files = []

for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{lw_train_prefix}/', delimiter='/'):
    if '.tfrecord' in blob.name:
        # Build the URI from bucket + object name. `blob.public_url` percent-encodes
        # the object name, so rewriting its host to gs:// can produce a wrong path.
        test_lw_files.append(f"gs://{blob.bucket.name}/{blob.name}")

tmp_lw_dataset = tf.data.TFRecordDataset(test_lw_files)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[7, 4, 7]])>,
 'movie_id': <tf.Tensor: shape=(1, 3), dtype=string, numpy=array([[b'898', b'294', b'258']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[4., 5., 1.]], dtype=float32)>}


## Create val listwise ds

In [57]:
val_lw = create_listwise_ds(
    val,
    num_list_per_user=1,
    num_examples_per_list=NUM_EXAMPLES_PER_LIST,
    seed=42
)

In [58]:
for example in val_lw.take(3):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([3, 0, 0])>,
 'movie_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'94', b'245', b'403'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'346'>,
 'user_rating': <tf.Tensor: shape=(3,), dtype=float32, numpy=array([3., 4., 3.], dtype=float32)>}
{'movie_genres': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([7, 0, 0])>,
 'movie_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'678', b'127', b'343'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'602'>,
 'user_rating': <tf.Tensor: shape=(3,), dtype=float32, numpy=array([4., 5., 2.], dtype=float32)>}
{'movie_genres': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([4, 7, 4])>,
 'movie_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'168', b'1053', b'26'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'393'>,
 'user_rating': <tf.Tensor: shape=(3,), dtype=float32, numpy=arr

In [59]:
len(list(val_lw))

917

### write TF records file

In [60]:
# val split
TF_RECORD_FILE_lw_val = f"ml-100k-listwise-{NUM_EXAMPLES_PER_LIST}n-val.tfrecord"
LOCAL_TF_RECORD_lw_val = f"./{TF_RECORD_FILE_lw_val}"

In [61]:
data_utils.write_tfrecords(LOCAL_TF_RECORD_lw_val, val_lw, list_wise=True)

#### validate VAL TF record file(s)

In [62]:
# test TF record local
tmp_lw_dataset = tf.data.TFRecordDataset(LOCAL_TF_RECORD_lw_val)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[3, 0, 0]])>,
 'movie_id': <tf.Tensor: shape=(1, 3), dtype=string, numpy=array([[b'94', b'245', b'403']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[3., 4., 3.]], dtype=float32)>}


In [63]:
lw_val_prefix = f"{DATA_GCS_PREFIX}/listwise-{NUM_EXAMPLES_PER_LIST}n-val"
LW_VAL_DATA_PATH = f"{BUCKET_URI}/{lw_val_prefix}"

! gsutil -q cp $LOCAL_TF_RECORD_lw_val $LW_VAL_DATA_PATH/

In [64]:
! gsutil ls $LW_VAL_DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-3n-val/ml-100k-listwise-3n-val.tfrecord


In [65]:
# Read the listwise VAL TFRecord file(s) back from GCS and show one parsed record.
test_lw_files = []

for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{lw_val_prefix}/', delimiter='/'):
    if '.tfrecord' in blob.name:
        # Build the URI from bucket + object name. `blob.public_url` percent-encodes
        # the object name, so rewriting its host to gs:// can produce a wrong path.
        test_lw_files.append(f"gs://{blob.bucket.name}/{blob.name}")

tmp_lw_dataset = tf.data.TFRecordDataset(test_lw_files)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[3, 0, 0]])>,
 'movie_id': <tf.Tensor: shape=(1, 3), dtype=string, numpy=array([[b'94', b'245', b'403']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[3., 4., 3.]], dtype=float32)>}


### get uniques

In [66]:
movies = ratings.map(lambda x: x["movie_id"])
unique_movie_ids = np.unique(np.concatenate(list(movies.batch(1000))))

len(unique_movie_ids)

1682

In [67]:
users = ratings.map(lambda x: x["user_id"])

unique_user_ids = np.unique(np.concatenate(list(users.batch(1000))))

len(unique_user_ids)

943

**Finished**