# Preparing offline training data for RL

**Objectives** of this notebook:
1. Prepare training datasets from the [MovieLens 100k](https://www.tensorflow.org/datasets/catalog/movielens#movielens100k-movies) (or optionally, [MovieLens 1M](https://www.tensorflow.org/datasets/catalog/movielens#movielens1m-movies)) public dataset
2. Write datasets to TFRecords
3. Generate dataset vocabulary and look-up dictionaries

## Load env config

* use the prefix from `00-env-setup`

In [2]:
# Resource-naming prefix for this project; it must match the prefix chosen in
# the `00-env-setup` notebook because the bucket name below is derived from it.
VERSION = "v2"  # TODO: set to the version used in 00-env-setup
PREFIX = f"rec-bandits-{VERSION}"  # TODO: set to the prefix used in 00-env-setup

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


**run the next cell to populate env vars**

In [3]:
# Resolve the active gcloud project. The IPython `!` capture yields a list of
# output lines, so the project id is taken from the first line.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths (derived from PREFIX and the project id)
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the environment file from the bucket, echo it, then execute it so its
# assignments become notebook globals.
# NOTE(review): exec() runs whatever is stored at this GCS path — only point
# this at a bucket you control.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "cpg-cdp"
PROJECT_NUM              = "939655404703"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "939655404703-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-cpg-cdp-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-cpg-cdp-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-cpg-cdp-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"
DATA_PATH_KFP_DEMO       = "gs://rec-bandits-v2-cpg-cdp-bucket/data/kfp_demo_data/u.data"

VPC_NETWORK_FULL         = "projects/939655404703/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NAME      = "training_dataset"



In [4]:
# ! gsutil ls $BUCKET_URI

## imports

In [5]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [11]:
! pip install numba

Collecting numba
  Downloading numba-0.59.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba)
  Downloading llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Downloading numba-0.59.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: llvmlite, numba
Successfully installed llvmlite-0.42.0 numba-0.59.0


In [12]:
import sys
import time
import numpy as np
import pickle as pkl
from pprint import pprint
from IPython import display
import matplotlib.pyplot as plt

# from typing import Dict, List, Optional, Text, Tuple

import logging

logging.disable(logging.WARNING)

import tensorflow as tf
import tensorflow_datasets as tfds

# import tensorflow_recommenders as tfrsa

# GPU
from numba import cuda
import gc

# google cloud
from google.cloud import aiplatform, storage

# this repo
from src.per_arm_rl import data_utils as data_utils

In [13]:
# Turn on memory growth for every GPU TensorFlow can see, then display the list
# (an empty list means no GPU is visible to TensorFlow).
gpus = tf.config.experimental.list_physical_devices("GPU")
for physical_device in gpus:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

gpus

[]

In [14]:
# Reset the CUDA context to release GPU memory left over from earlier runs.
# `cuda.get_current_device()` raises on hosts without a usable CUDA device, so
# the reset is skipped there (`device` is None in that case).
device = cuda.get_current_device() if cuda.is_available() else None
if device is not None:
    device.reset()

# Run a garbage-collection pass; the cell displays the number of objects found.
gc.collect()

23

### Initialize GCP clients

In [15]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

### Create `data_utils.py`

> this will be used to support data processing throughout the development workflow

# [1] Prepare Movielens dataset

* [1.a] - prepare 100k dataset; used in folders `01...` and `02...`
* [1.b] - prepare 1M dataset; used in folders `05` (WIP)

## [1.a] 100k dataset

> download and prepare [MovieLens 100k](https://www.tensorflow.org/datasets/catalog/movielens#movielens100k-movies) public dataset

In [16]:
# Dataset variant and sharding plan for the TFRecord export.
DATA_TAG = "100k"

RECORD_COUNT, SHARD_SIZE = 100_000, 20_000
NUM_SHARDS = RECORD_COUNT // SHARD_SIZE  # whole number of fixed-size shards
print(f"NUM_SHARDS: {NUM_SHARDS}")

# Object prefix inside the bucket under which this variant's data is stored
DATA_GCS_PREFIX = f"data/movielens-{DATA_TAG}"

print(f"DATA_GCS_PREFIX : {DATA_GCS_PREFIX}")

NUM_SHARDS: 5
DATA_GCS_PREFIX : data/movielens-100k


### load data from Tensorflow Datasets

* see [TFDS documentation](https://www.tensorflow.org/datasets/catalog/movielens#movielens100k-ratings) for more details on this dataset, feature descriptions, and other versions

In [17]:
# Load the `train` split of the selected MovieLens ratings variant from TFDS.
ratings = tfds.load(f"movielens/{DATA_TAG}-ratings", split="train")

# Print one batched example to inspect the available features.
for sample in ratings.batch(1).take(1):
    pprint(sample)

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /home/jwortz_google_com/tensorflow_datasets/movielens/100k-ratings/0.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.66 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.60 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.55 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.51 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.40 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.36 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.33 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.29 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.26 url/s]
Dl Completed...: 100%|██████████|

[1mDataset movielens downloaded and prepared to /home/jwortz_google_com/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.[0m
{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([46.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>,
 'user_occupation_text': <t

In [18]:
# Export the ratings as NUM_SHARDS consecutive TFRecord files: each shard is
# written to the working directory and then uploaded under `<prefix>/all/`.
bucket = storage_client.bucket(BUCKET_NAME)

for shard_idx in range(NUM_SHARDS):
    # 1-based, zero-padded shard number, e.g. "...-01-of-05.tfrecord"
    TF_RECORD_FILE = f"ml-{DATA_TAG}-ratings-train-{shard_idx + 1:02d}-of-{NUM_SHARDS:02d}.tfrecord"
    LOCAL_TF_RECORD_FILE = f"./{TF_RECORD_FILE}"

    # Window of records belonging to this shard
    take = SHARD_SIZE
    skip = shard_idx * take

    print(f"writing skip: {skip} take: {take}")
    ds_slice = ratings.skip(skip).take(take)

    print(f"writing: {LOCAL_TF_RECORD_FILE}...")
    data_utils.write_tfrecords(LOCAL_TF_RECORD_FILE, ds_slice, list_wise=False)

    # Stage the shard in Cloud Storage
    DEST_BLOB = f"{DATA_GCS_PREFIX}/all/{TF_RECORD_FILE}"
    blob = bucket.blob(DEST_BLOB)
    blob.upload_from_filename(TF_RECORD_FILE)

    print(f"copied {TF_RECORD_FILE} to Cloud Storage\n")

writing skip: 0 take: 20000
writing: ./ml-100k-ratings-train-01-of-05.tfrecord...


copied ml-100k-ratings-train-01-of-05.tfrecord to Cloud Storage

writing skip: 20000 take: 20000
writing: ./ml-100k-ratings-train-02-of-05.tfrecord...
copied ml-100k-ratings-train-02-of-05.tfrecord to Cloud Storage

writing skip: 40000 take: 20000
writing: ./ml-100k-ratings-train-03-of-05.tfrecord...
copied ml-100k-ratings-train-03-of-05.tfrecord to Cloud Storage

writing skip: 60000 take: 20000
writing: ./ml-100k-ratings-train-04-of-05.tfrecord...
copied ml-100k-ratings-train-04-of-05.tfrecord to Cloud Storage

writing skip: 80000 take: 20000
writing: ./ml-100k-ratings-train-05-of-05.tfrecord...
copied ml-100k-ratings-train-05-of-05.tfrecord to Cloud Storage



In [20]:
# List the objects under the `all/` prefix to confirm the shard uploads.
! gsutil ls gs://$BUCKET_NAME/$DATA_GCS_PREFIX/all

gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-01-of-05.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-02-of-05.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-03-of-05.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-04-of-05.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-05-of-05.tfrecord


### copy subset to train subfolder

In [21]:
for i in range(1, (NUM_SHARDS)):
    TF_RECORD_FILE = f"ml-{DATA_TAG}-ratings-train-{str(i).zfill(2)}-of-{str(NUM_SHARDS).zfill(2)}.tfrecord"
    print(TF_RECORD_FILE)
    
    ! gsutil -q cp gs://$BUCKET_NAME/$DATA_GCS_PREFIX/all/$TF_RECORD_FILE gs://$BUCKET_NAME/$DATA_GCS_PREFIX/train/$TF_RECORD_FILE
    ! rm ./$TF_RECORD_FILE

ml-100k-ratings-train-01-of-05.tfrecord
ml-100k-ratings-train-02-of-05.tfrecord
ml-100k-ratings-train-03-of-05.tfrecord
ml-100k-ratings-train-04-of-05.tfrecord


### copy subset to val subfolder

In [22]:
for i in range(NUM_SHARDS, (NUM_SHARDS+1)):
    TF_RECORD_FILE = f"ml-{DATA_TAG}-ratings-train-{str(i).zfill(2)}-of-{str(NUM_SHARDS).zfill(2)}.tfrecord"
    print(TF_RECORD_FILE)
    
    ! gsutil -q cp gs://$BUCKET_NAME/$DATA_GCS_PREFIX/all/$TF_RECORD_FILE gs://$BUCKET_NAME/$DATA_GCS_PREFIX/val/$TF_RECORD_FILE
    ! rm ./$TF_RECORD_FILE

ml-100k-ratings-train-05-of-05.tfrecord


### validate TF Records

In [23]:
# Collect gs:// URIs for every TFRecord object under the `all/` prefix.
# The URI is assembled from the bucket and object names directly instead of
# rewriting `public_url`: that property percent-encodes the object name and
# embeds the API base URL, so the string replacement can yield a wrong path.
train_files = [
    f"gs://{blob.bucket.name}/{blob.name}"
    for blob in storage_client.list_blobs(
        BUCKET_NAME,
        prefix=f"{DATA_GCS_PREFIX}/all/",
    )
    if ".tfrecord" in blob.name
]

train_files

['gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-01-of-05.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-02-of-05.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-03-of-05.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-04-of-05.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-100k/all/ml-100k-ratings-train-05-of-05.tfrecord']

In [24]:
# Read the uploaded shards back and decode each serialized record.
raw_records = tf.data.TFRecordDataset(train_files)
train_dataset = raw_records.map(data_utils.parse_tfrecord)

# Print one parsed example as a round-trip sanity check.
for parsed in train_dataset.batch(1).take(1):
    pprint(parsed)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>}


In [25]:
# Collect the distinct rating values present in the parsed records.
rating_values = [
    rating.numpy() for rating in train_dataset.map(lambda x: x["user_rating"])
]
unique_user_ratings = np.unique(rating_values)

unique_user_ratings

array([1., 2., 3., 4., 5.], dtype=float32)

### Generate look-up dicts

**TODO** - use a more TensorFlow-native method for generating vocabs and stats

#### unique movie IDs

In [26]:
# Fit a StringLookup layer on the movie ids to obtain their vocabulary.
# The vocabulary starts with the '[UNK]' token, so its length is one more than
# the number of distinct ids.
movie_id_lookup = tf.keras.layers.StringLookup()
movie_ids = train_dataset.map(lambda record: record["movie_id"])
movie_id_lookup.adapt(movie_ids)
movie_id_vocab = movie_id_lookup.get_vocabulary()

MOVIELENS_NUM_MOVIES = len(movie_id_vocab)

print(f"len(movie_id_vocab) : {MOVIELENS_NUM_MOVIES}")
print(f"movie_id_vocab      : {movie_id_vocab[:2]}")

len(movie_id_vocab) : 1683
movie_id_vocab      : ['[UNK]', '50']


#### unique user IDs

In [27]:
# Fit a StringLookup layer on the user ids to obtain their vocabulary
# (the leading '[UNK]' token is included in the count).
unique_user_ids_lookup = tf.keras.layers.StringLookup()
user_ids = train_dataset.map(lambda record: record["user_id"])
unique_user_ids_lookup.adapt(user_ids)
unique_user_ids_vocab = unique_user_ids_lookup.get_vocabulary()

MOVIELENS_NUM_USERS = len(unique_user_ids_vocab)

print(f"len(unique_user_ids_vocab) : {MOVIELENS_NUM_USERS}")
print(f"unique_user_ids_vocab      : {unique_user_ids_vocab[:2]}")

len(unique_user_ids_vocab) : 944
unique_user_ids_vocab      : ['[UNK]', '405']


#### unique `user_occupation_text` values

In [28]:
# Fit a StringLookup layer on the occupation strings to obtain their vocabulary
# (the leading '[UNK]' token is included in the count).
unique_occ_lookup = tf.keras.layers.StringLookup()
occupations = train_dataset.map(lambda record: record["user_occupation_text"])
unique_occ_lookup.adapt(occupations)
unique_occ_vocab = unique_occ_lookup.get_vocabulary()

NUM_OCCS = len(unique_occ_vocab)

print(f"len(unique_occ_vocab) : {NUM_OCCS}")
print(f"unique_occ_vocab      : {unique_occ_vocab[:2]}")

len(unique_occ_vocab) : 22
unique_occ_vocab      : ['[UNK]', 'student']


#### unique user_age values

In [29]:
# Collect the distinct (sorted) age-bucket values present in the parsed records.
age_values = [
    age.numpy()
    for age in train_dataset.map(lambda record: record["bucketized_user_age"])
]
unique_user_age = np.unique(age_values)

print(f"len(unique_user_age) : {len(unique_user_age)}")
print(f"unique_user_age      : {unique_user_age[:2]}")

len(unique_user_age) : 7
unique_user_age      : [ 1. 18.]


#### timestamp buckets

In [30]:
# Scan the raw TFDS ratings for the earliest and latest rating timestamps.
timestamps = ratings.map(lambda x: x["timestamp"])

# Seed each reduction with the opposite int64 extreme so the result is always a
# value observed in the data. (A fixed seed such as 1e9 silently caps the
# minimum for any dataset whose timestamps all fall after 2001-09-09.)
max_timestamp = timestamps.reduce(
    tf.constant(tf.int64.min, dtype=tf.int64), tf.maximum
).numpy()
min_timestamp = timestamps.reduce(
    tf.constant(tf.int64.max, dtype=tf.int64), tf.minimum
).numpy()

# 1000 evenly spaced boundaries spanning [min_timestamp, max_timestamp].
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

print(f"timestamp_buckets: {timestamp_buckets[:3]}")

timestamp_buckets: [8.74724710e+08 8.74743291e+08 8.74761871e+08]


#### unique movie genres

In [31]:
# Collect the distinct (sorted) genre ids present in the parsed records.
genre_values = [
    genres.numpy()
    for genres in train_dataset.map(lambda record: record["movie_genres"])
]
unique_movie_genres = np.unique(genre_values)

MOVIELENS_NUM_GENRES = len(unique_movie_genres)

print(f"len(unique_movie_genres) : {MOVIELENS_NUM_GENRES}")
print(f"unique_movie_genres      : {unique_movie_genres[:2]}")

len(unique_movie_genres) : 19
unique_movie_genres      : [0 1]


### Write vocab

In [32]:
# Bundle every vocabulary / statistic computed above into one dictionary,
# keyed by the feature name it describes.
vocab_dict = dict(
    movie_id=movie_id_vocab,
    user_id=unique_user_ids_vocab,
    user_occupation_text=unique_occ_vocab,
    movie_genres=unique_movie_genres,
    bucketized_user_age=unique_user_age,
    max_timestamp=max_timestamp,
    min_timestamp=min_timestamp,
    timestamp_buckets=timestamp_buckets,
)

print(f"DATA_GCS_PREFIX : {DATA_GCS_PREFIX}")

DATA_GCS_PREFIX : data/movielens-100k


In [33]:
# Serialize the vocabulary locally, then upload it next to the dataset in GCS.
# The local file name is derived from DATA_TAG so the 100k and 1m vocabularies
# do not overwrite each other in the working directory.
VOCAB_FILE_NAME = f"vocab_dict_{DATA_TAG}.pkl"

# The context manager closes the handle even if pickling fails.
with open(VOCAB_FILE_NAME, "wb") as filehandler:
    pkl.dump(vocab_dict, filehandler)

VOCAB_DEST_BLOB = f"{DATA_GCS_PREFIX}/vocab_dict.pkl"

blob = bucket.blob(VOCAB_DEST_BLOB)
blob.upload_from_filename(VOCAB_FILE_NAME)

### Create lookup dictionaries

In [34]:
# Build a value -> index dictionary for the age buckets found in the data
# (see the displayed result for the mapping produced by the helper).
USER_AGE_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    dataset=train_dataset, key="bucketized_user_age"
)
USER_AGE_DIM = len(USER_AGE_LOOKUP)

print(f"USER_AGE_DIM: {USER_AGE_DIM}")

USER_AGE_LOOKUP

USER_AGE_DIM: 7


{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [35]:
# Build a value -> index dictionary for the occupation strings found in the data.
USER_OCC_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    dataset=train_dataset, key="user_occupation_text"
)
USER_OCC_DIM = len(USER_OCC_LOOKUP)

print(f"USER_OCC_DIM: {USER_OCC_DIM}")

# USER_OCC_LOOKUP

USER_OCC_DIM: 21


In [36]:
# Build a value -> index dictionary for the genre ids found in the data.
MOVIE_GEN_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    dataset=train_dataset, key="movie_genres"
)
MOVIE_GEN_DIM = len(MOVIE_GEN_LOOKUP)

print(f"MOVIE_GEN_DIM: {MOVIE_GEN_DIM}")

# MOVIE_GEN_LOOKUP

MOVIE_GEN_DIM: 19


#### Create `data_config.py`

In [37]:
# Render the lookup tables and dimensions as Python source so that other
# notebooks/modules can import them as `data_config`.
config = f"""
USER_AGE_LOOKUP       = {USER_AGE_LOOKUP}
USER_AGE_DIM          = {USER_AGE_DIM}

USER_OCC_LOOKUP       = {USER_OCC_LOOKUP}
USER_OCC_DIM          = {USER_OCC_DIM}

MOVIE_GEN_LOOKUP      = {MOVIE_GEN_LOOKUP}
MOVIE_GEN_DIM         = {MOVIE_GEN_DIM}

MOVIELENS_NUM_MOVIES  = {MOVIELENS_NUM_MOVIES}
MOVIELENS_NUM_USERS   = {MOVIELENS_NUM_USERS}
"""

# NOTE(review): REPO_DOCKER_PATH_PREFIX and RL_SUB_DIR are not assigned in the
# visible cells; presumably they come from the exec'd env config — confirm.
config_path = f"{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/data_config.py"
with open(config_path, "w") as config_file:
    config_file.write(config)

In [38]:
from src.per_arm_rl import data_config  # as data_config

In [39]:
# data_config.USER_AGE_LOOKUP

In [40]:
# data_config.USER_OCC_LOOKUP

In [41]:
# data_config.MOVIE_GEN_LOOKUP

## [1.b] 1M dataset

> download and prepare [MovieLens 1M](https://www.tensorflow.org/datasets/catalog/movielens#movielens1m-movies) public dataset

In [42]:
# Dataset variant and sharding plan for the TFRecord export.
DATA_TAG = "1m"

# NOTE(review): the TFDS catalog lists slightly more than 1,000,000 examples for
# movielens/1m-ratings; RECORD_COUNT is only used to derive NUM_SHARDS — confirm
# that the shard writer accounts for the tail records.
RECORD_COUNT, SHARD_SIZE = 1_000_000, 100_000
NUM_SHARDS = RECORD_COUNT // SHARD_SIZE  # whole number of fixed-size shards
print(f"NUM_SHARDS: {NUM_SHARDS}")

# Object prefix inside the bucket under which this variant's data is stored
DATA_GCS_PREFIX = f"data/movielens-{DATA_TAG}"

print(f"DATA_GCS_PREFIX : {DATA_GCS_PREFIX}")

NUM_SHARDS: 10
DATA_GCS_PREFIX : data/movielens-1m


### load data from Tensorflow Datasets

* see [TFDS documentation](https://www.tensorflow.org/datasets/catalog/movielens#movielens1m-ratings) for more details on this dataset, feature descriptions, and other versions

In [43]:
# Load the `train` split of the selected MovieLens ratings variant from TFDS.
ratings = tfds.load(f"movielens/{DATA_TAG}-ratings", split="train")

# Print one batched example to inspect the available features.
for sample in ratings.batch(1).take(1):
    pprint(sample)

[1mDownloading and preparing dataset 5.64 MiB (download: 5.64 MiB, generated: 308.42 MiB, total: 314.06 MiB) to /home/jwortz_google_com/tensorflow_datasets/movielens/1m-ratings/0.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Extraction completed...: 100%|██████████| 4/4 [00:00<00:00,  6.94 file/s]
Dl Size...: 100%|██████████| 5/5 [00:00<00:00,  8.61 MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.71 url/s]
                                                                         

Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.65 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  2.58 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.93 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.82 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.80 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.78 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.76 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.75 url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.73 url/s]
[A

[1mDataset movielens downloaded and prepared to /home/jwortz_google_com/tensorflow_datasets/movielens/1m-ratings/0.1.1. Subsequent calls will reuse this data.[0m
{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([35.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[0, 7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'3107'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Backdraft (1991)'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([977432193])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'130'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([18])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'technician/engineer'], dtype=object)>,
 'user_rating': <tf.T

In [44]:
# Export the ratings as NUM_SHARDS consecutive TFRecord files: each shard is
# written to the working directory and then uploaded under `<prefix>/all/`.
bucket = storage_client.bucket(BUCKET_NAME)

for i in range(NUM_SHARDS):
    TF_RECORD_FILE = (
        f"ml-{DATA_TAG}-ratings-train-{str(i+1).zfill(2)}-of-{NUM_SHARDS}.tfrecord"
    )
    LOCAL_TF_RECORD_FILE = f"./{TF_RECORD_FILE}"
    skip = SHARD_SIZE * i

    # movielens/1m-ratings holds slightly more than RECORD_COUNT examples
    # (1,000,209 per the TFDS catalog), so fixed-size slices would silently drop
    # the tail. take(-1) makes the final shard absorb every remaining record.
    take = -1 if i == NUM_SHARDS - 1 else SHARD_SIZE

    print(f"writing skip: {skip} take: {take}")
    ds_slice = ratings.skip(skip).take(take)

    print(f"writing: {LOCAL_TF_RECORD_FILE}...")
    data_utils.write_tfrecords(LOCAL_TF_RECORD_FILE, ds_slice, list_wise=False)

    # Upload from the same path the shard was written to.
    DEST_BLOB = f"{DATA_GCS_PREFIX}/all/{TF_RECORD_FILE}"
    blob = bucket.blob(DEST_BLOB)
    blob.upload_from_filename(LOCAL_TF_RECORD_FILE)

    print(f"copied {TF_RECORD_FILE} to Cloud Storage\n")

writing skip: 0 take: 100000
writing: ./ml-1m-ratings-train-01-of-10.tfrecord...
copied ml-1m-ratings-train-01-of-10.tfrecord to Cloud Storage

writing skip: 100000 take: 100000
writing: ./ml-1m-ratings-train-02-of-10.tfrecord...
copied ml-1m-ratings-train-02-of-10.tfrecord to Cloud Storage

writing skip: 200000 take: 100000
writing: ./ml-1m-ratings-train-03-of-10.tfrecord...
copied ml-1m-ratings-train-03-of-10.tfrecord to Cloud Storage

writing skip: 300000 take: 100000
writing: ./ml-1m-ratings-train-04-of-10.tfrecord...
copied ml-1m-ratings-train-04-of-10.tfrecord to Cloud Storage

writing skip: 400000 take: 100000
writing: ./ml-1m-ratings-train-05-of-10.tfrecord...
copied ml-1m-ratings-train-05-of-10.tfrecord to Cloud Storage

writing skip: 500000 take: 100000
writing: ./ml-1m-ratings-train-06-of-10.tfrecord...
copied ml-1m-ratings-train-06-of-10.tfrecord to Cloud Storage

writing skip: 600000 take: 100000
writing: ./ml-1m-ratings-train-07-of-10.tfrecord...
copied ml-1m-ratings-trai

In [45]:
# List the objects under the `all/` prefix to confirm the shard uploads.
! gsutil ls gs://$BUCKET_NAME/$DATA_GCS_PREFIX/all

gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-01-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-02-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-03-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-04-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-05-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-06-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-07-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-08-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-09-of-10.tfrecord
gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-10-of-10.tfrecord


### copy subset to train subfolder

In [46]:
for i in range(1, (NUM_SHARDS-1)):
    TF_RECORD_FILE = f"ml-{DATA_TAG}-ratings-train-{str(i).zfill(2)}-of-{NUM_SHARDS}.tfrecord"
    print(TF_RECORD_FILE)
    
    ! gsutil -q cp gs://$BUCKET_NAME/$DATA_GCS_PREFIX/all/$TF_RECORD_FILE gs://$BUCKET_NAME/$DATA_GCS_PREFIX/train/$TF_RECORD_FILE
    ! rm ./$TF_RECORD_FILE

ml-1m-ratings-train-01-of-10.tfrecord
ml-1m-ratings-train-02-of-10.tfrecord
ml-1m-ratings-train-03-of-10.tfrecord
ml-1m-ratings-train-04-of-10.tfrecord
ml-1m-ratings-train-05-of-10.tfrecord
ml-1m-ratings-train-06-of-10.tfrecord
ml-1m-ratings-train-07-of-10.tfrecord
ml-1m-ratings-train-08-of-10.tfrecord


### copy subset to val subfolder

In [47]:
for i in range((NUM_SHARDS-1), (NUM_SHARDS+1)):
    TF_RECORD_FILE = f"ml-{DATA_TAG}-ratings-train-{str(i).zfill(2)}-of-{NUM_SHARDS}.tfrecord"
    print(TF_RECORD_FILE)
    
    ! gsutil -q cp gs://$BUCKET_NAME/$DATA_GCS_PREFIX/all/$TF_RECORD_FILE gs://$BUCKET_NAME/$DATA_GCS_PREFIX/val/$TF_RECORD_FILE
    ! rm ./$TF_RECORD_FILE

ml-1m-ratings-train-09-of-10.tfrecord
ml-1m-ratings-train-10-of-10.tfrecord


### validate TF Records

In [48]:
# Collect gs:// URIs for every TFRecord object under the `all/` prefix.
# The URI is assembled from the bucket and object names directly instead of
# rewriting `public_url`: that property percent-encodes the object name and
# embeds the API base URL, so the string replacement can yield a wrong path.
train_files = [
    f"gs://{blob.bucket.name}/{blob.name}"
    for blob in storage_client.list_blobs(
        BUCKET_NAME,
        prefix=f"{DATA_GCS_PREFIX}/all/",
    )
    if ".tfrecord" in blob.name
]

train_files

['gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-01-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-02-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-03-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-04-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-05-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-06-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-07-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-08-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-09-of-10.tfrecord',
 'gs://rec-bandits-v2-cpg-cdp-bucket/data/movielens-1m/all/ml-1m-ratings-train-10-of-10.tfrecord']

In [49]:
# Open the listed shards as a single dataset of serialized (still unparsed)
# records and display its element spec.
train_dataset = tf.data.TFRecordDataset(train_files)
train_dataset

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [50]:
# Decode each serialized record into a feature dictionary.
train_dataset = train_dataset.map(data_utils.parse_tfrecord)

# Print one parsed example as a round-trip sanity check.
for parsed in train_dataset.batch(1).take(1):
    pprint(parsed)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([35.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[0]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'3107'], dtype=object)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([977432193])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'130'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'technician/engineer'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([5.], dtype=float32)>}


In [51]:
# Collect the distinct rating values present in the parsed records.
rating_values = [
    rating.numpy() for rating in train_dataset.map(lambda x: x["user_rating"])
]
unique_user_ratings = np.unique(rating_values)

unique_user_ratings

array([1., 2., 3., 4., 5.], dtype=float32)

### Generate look-up dicts

**TODO** - use a more TensorFlow-native method for generating vocabs and stats

#### unique movie IDs

In [52]:
# Fit a StringLookup layer on the movie ids to obtain their vocabulary.
# The vocabulary starts with the '[UNK]' token, so its length is one more than
# the number of distinct ids.
movie_id_lookup = tf.keras.layers.StringLookup()
movie_ids = train_dataset.map(lambda record: record["movie_id"])
movie_id_lookup.adapt(movie_ids)
movie_id_vocab = movie_id_lookup.get_vocabulary()

MOVIELENS_NUM_MOVIES = len(movie_id_vocab)

print(f"len(movie_id_vocab) : {MOVIELENS_NUM_MOVIES}")
print(f"movie_id_vocab      : {movie_id_vocab[:2]}")

len(movie_id_vocab) : 3707
movie_id_vocab      : ['[UNK]', '2858']


#### unique user IDs

In [53]:
# Fit a StringLookup layer on the user ids to obtain their vocabulary
# (the vocabulary length is what gets stored as MOVIELENS_NUM_USERS).
unique_user_ids_lookup = tf.keras.layers.StringLookup()
user_ids = train_dataset.map(lambda record: record["user_id"])
unique_user_ids_lookup.adapt(user_ids)
unique_user_ids_vocab = unique_user_ids_lookup.get_vocabulary()

MOVIELENS_NUM_USERS = len(unique_user_ids_vocab)

print(f"len(unique_user_ids_vocab) : {MOVIELENS_NUM_USERS}")
print(f"unique_user_ids_vocab      : {unique_user_ids_vocab[:2]}")

#### unique `user_occupation_text` values

In [None]:
# Fit a StringLookup layer on the occupation strings to obtain their vocabulary
# (the leading '[UNK]' token is included in the count).
unique_occ_lookup = tf.keras.layers.StringLookup()
occupations = train_dataset.map(lambda record: record["user_occupation_text"])
unique_occ_lookup.adapt(occupations)
unique_occ_vocab = unique_occ_lookup.get_vocabulary()

NUM_OCCS = len(unique_occ_vocab)

print(f"len(unique_occ_vocab) : {NUM_OCCS}")
print(f"unique_occ_vocab      : {unique_occ_vocab[:2]}")

len(unique_occ_vocab) : 22
unique_occ_vocab      : ['[UNK]', 'college/grad student']


#### unique user_age values

In [None]:
# Collect the distinct (sorted) age-bucket values present in the parsed records.
age_values = [
    age.numpy()
    for age in train_dataset.map(lambda record: record["bucketized_user_age"])
]
unique_user_age = np.unique(age_values)

print(f"len(unique_user_age) : {len(unique_user_age)}")
print(f"unique_user_age      : {unique_user_age[:2]}")

len(unique_user_age) : 7
unique_user_age      : [ 1. 18.]


#### timestamp buckets

In [None]:
# Scan the raw TFDS ratings for the earliest and latest rating timestamps.
timestamps = ratings.map(lambda x: x["timestamp"])

# Seed each reduction with the opposite int64 extreme so the result is always a
# value observed in the data. (A fixed seed such as 1e9 silently caps the
# minimum for any dataset whose timestamps all fall after 2001-09-09.)
max_timestamp = timestamps.reduce(
    tf.constant(tf.int64.min, dtype=tf.int64), tf.maximum
).numpy()
min_timestamp = timestamps.reduce(
    tf.constant(tf.int64.max, dtype=tf.int64), tf.minimum
).numpy()

# 1000 evenly spaced boundaries spanning [min_timestamp, max_timestamp].
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

print(f"timestamp_buckets: {timestamp_buckets[:3]}")

timestamp_buckets: [9.56703932e+08 9.56793772e+08 9.56883613e+08]


#### unique movie genres

In [None]:
# Collect the distinct (sorted) genre ids present in the parsed records.
genre_values = [
    genres.numpy()
    for genres in train_dataset.map(lambda record: record["movie_genres"])
]
unique_movie_genres = np.unique(genre_values)

MOVIELENS_NUM_GENRES = len(unique_movie_genres)

print(f"len(unique_movie_genres) : {MOVIELENS_NUM_GENRES}")
print(f"unique_movie_genres      : {unique_movie_genres[:2]}")

len(unique_movie_genres) : 18
unique_movie_genres      : [0 1]


### Write vocab dict

In [None]:
# Bundle every vocabulary / statistic computed above into one dictionary,
# keyed by the feature name it describes.
vocab_dict = dict(
    movie_id=movie_id_vocab,
    user_id=unique_user_ids_vocab,
    user_occupation_text=unique_occ_vocab,
    movie_genres=unique_movie_genres,
    bucketized_user_age=unique_user_age,
    max_timestamp=max_timestamp,
    min_timestamp=min_timestamp,
    timestamp_buckets=timestamp_buckets,
)

print(f"DATA_GCS_PREFIX : {DATA_GCS_PREFIX}")

VOCAB_1M_GCS_PREFIX : data/movielens-1m


In [None]:
# Serialize the vocabulary locally, then upload it next to the dataset in GCS.
# The local file name is derived from DATA_TAG ("vocab_dict_1m.pkl" here) so it
# always matches the dataset variant being processed.
VOCAB_FILE_NAME = f"vocab_dict_{DATA_TAG}.pkl"

# The context manager closes the handle even if pickling fails.
with open(VOCAB_FILE_NAME, "wb") as filehandler:
    pkl.dump(vocab_dict, filehandler)

VOCAB_DEST_BLOB = f"{DATA_GCS_PREFIX}/vocab_dict.pkl"

blob = bucket.blob(VOCAB_DEST_BLOB)
blob.upload_from_filename(VOCAB_FILE_NAME)

### Create lookup dictionaries

In [None]:
# Build a value -> index dictionary for the age buckets found in the data
# (see the displayed result for the mapping produced by the helper).
USER_AGE_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    dataset=train_dataset, key="bucketized_user_age"
)
USER_AGE_DIM = len(USER_AGE_LOOKUP)

print(f"USER_AGE_DIM: {USER_AGE_DIM}")

USER_AGE_LOOKUP

USER_AGE_DIM: 7


{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [None]:
# Build a value -> index dictionary for the occupation strings found in the data.
USER_OCC_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    dataset=train_dataset, key="user_occupation_text"
)
USER_OCC_DIM = len(USER_OCC_LOOKUP)

print(f"USER_OCC_DIM: {USER_OCC_DIM}")

# USER_OCC_LOOKUP

USER_OCC_DIM: 21


In [None]:
# Build a value -> index dictionary for the genre ids found in the data.
MOVIE_GEN_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    dataset=train_dataset, key="movie_genres"
)
MOVIE_GEN_DIM = len(MOVIE_GEN_LOOKUP)

print(f"MOVIE_GEN_DIM: {MOVIE_GEN_DIM}")

# MOVIE_GEN_LOOKUP

MOVIE_GEN_DIM: 18


#### Create `data_config_1m.py`

> write data config for subsequent notebooks

In [None]:
# Render the lookup tables and dimensions as Python source so that other
# notebooks/modules can import them as `data_config_1m`.
config = f"""
USER_AGE_LOOKUP       = {USER_AGE_LOOKUP}
USER_AGE_DIM          = {USER_AGE_DIM}

USER_OCC_LOOKUP       = {USER_OCC_LOOKUP}
USER_OCC_DIM          = {USER_OCC_DIM}

MOVIE_GEN_LOOKUP      = {MOVIE_GEN_LOOKUP}
MOVIE_GEN_DIM         = {MOVIE_GEN_DIM}

MOVIELENS_NUM_MOVIES  = {MOVIELENS_NUM_MOVIES}
MOVIELENS_NUM_USERS   = {MOVIELENS_NUM_USERS}
"""

# NOTE(review): REPO_DOCKER_PATH_PREFIX and RL_SUB_DIR are not assigned in the
# visible cells; presumably they come from the exec'd env config — confirm.
config_path = f"{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/data_config_1m.py"
with open(config_path, "w") as config_file:
    config_file.write(config)

# [2] Validate Ratings Matrix

In [None]:
from src.per_arm_rl import data_config_1m as data_config

In [None]:
# data_config.USER_AGE_LOOKUP

{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [None]:
# data_config.USER_OCC_LOOKUP

{b'clerical/admin': 0,
 b'other/not specified': 1,
 b'doctor/health care': 2,
 b'college/grad student': 3,
 b'writer': 4,
 b'lawyer': 5,
 b'tradesman/craftsman': 6,
 b'scientist': 7,
 b'farmer': 8,
 b'technician/engineer': 9,
 b'programmer': 10,
 b'unemployed': 11,
 b'K-12 student': 12,
 b'executive/managerial': 13,
 b'sales/marketing': 14,
 b'self-employed': 15,
 b'academic/educator': 16,
 b'retired': 17,
 b'customer service': 18,
 b'artist': 19,
 b'homemaker': 20}

In [None]:
# data_config.MOVIE_GEN_LOOKUP

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 18: 16,
 19: 17}

## Load movielens rating matrix

In [None]:
def _select_rating_features(x):
    """Keep the rating-related fields of one record and only its first genre."""
    selected = {
        "user_id": x["user_id"],
        "movie_id": x["movie_id"],
        "user_rating": x["user_rating"],
        "bucketized_user_age": x["bucketized_user_age"],
        "user_occupation_text": x["user_occupation_text"],
    }
    # Only the first entry of `movie_genres` is kept for each record.
    selected["movie_genres"] = x["movie_genres"][0]
    return selected


local_data = train_dataset.map(_select_rating_features)
local_data

<_MapDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None), 'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_genres': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [None]:
# Start from an all-zero (users x movies) matrix; it is filled in by the loop
# in the next cell.
matrix_shape = (data_config.MOVIELENS_NUM_USERS, data_config.MOVIELENS_NUM_MOVIES)
ratings_matrix = np.zeros(matrix_shape)
ratings_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
user_age_int, user_occ_int, mov_gen_int = [], [], []

for record in local_data:
    # Subtract 1 from the parsed IDs to get matrix indices
    # (assumes IDs start at 1 - TODO confirm).
    user_idx = int(record["user_id"].numpy()) - 1
    movie_idx = int(record["movie_id"].numpy()) - 1
    ratings_matrix[user_idx, movie_idx] = float(record["user_rating"].numpy())

    # Encode each categorical feature through its lookup dictionary; a small
    # 0.0001 offset is added to every encoded value.
    age_key = record["bucketized_user_age"].numpy()
    user_age_int.append(float(data_config.USER_AGE_LOOKUP[age_key]) + 0.0001)

    occ_key = record["user_occupation_text"].numpy()
    user_occ_int.append(float(data_config.USER_OCC_LOOKUP[occ_key]) + 0.0001)

    genre_key = record["movie_genres"].numpy()
    mov_gen_int.append(float(data_config.MOVIE_GEN_LOOKUP[genre_key]) + 0.0001)

In [None]:
ratings_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [None]:
np.array(user_age_int)

array([2.0001, 6.0001, 3.0001, ..., 3.0001, 1.0001, 3.0001])

In [None]:
np.array(user_occ_int)

array([14.0001,  7.0001, 10.0001, ..., 10.0001,  2.0001, 10.0001])

In [None]:
np.array(mov_gen_int)

array([7.00010e+00, 4.00010e+00, 4.00010e+00, ..., 1.00001e+01,
       1.00000e-04, 4.00010e+00])

The commands above are executed when calling the `load_movielens_ratings()` function below:

In [None]:
# Same computation as the manual cells above, done through the helper in
# `data_utils`. The individual outputs are unpacked by index in the next cells.
test_dataset_load = data_utils.load_movielens_ratings(
    ratings_dataset=train_dataset,
    user_age_lookup_dict=data_config.USER_AGE_LOOKUP,
    user_occ_lookup_dict=data_config.USER_OCC_LOOKUP,
    movie_gen_lookup_dict=data_config.MOVIE_GEN_LOOKUP,
    num_users=data_config.MOVIELENS_NUM_USERS,
    num_movies=data_config.MOVIELENS_NUM_MOVIES,
)

print(f"num outputs: {len(test_dataset_load)}")

test_dataset_load

num outputs: 4


(array([[5., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 0., ..., 0., 0., 0.]]),
 array([2.0001, 6.0001, 3.0001, ..., 3.0001, 1.0001, 3.0001]),
 array([14.0001,  7.0001, 10.0001, ..., 10.0001,  2.0001, 10.0001]),
 array([7.00010e+00, 4.00010e+00, 4.00010e+00, ..., 1.00001e+01,
        1.00000e-04, 4.00010e+00]))

In [None]:
# Output 0 of `load_movielens_ratings`: the ratings matrix. Print its shape,
# then display it.
ratings_matrix = test_dataset_load[0]
print(ratings_matrix.shape)
ratings_matrix

(943, 1682)


array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [None]:
# Output 1 of `load_movielens_ratings`: the encoded user-age values. Print the
# first entry, then display the whole array.
user_age_int = test_dataset_load[1]
print(user_age_int[0])
user_age_int

2.0001


array([2.0001, 6.0001, 3.0001, ..., 3.0001, 1.0001, 3.0001])

In [None]:
# Output 2 of `load_movielens_ratings`: the encoded user-occupation values.
user_occ_int = test_dataset_load[2]
print(user_occ_int)
user_occ_int

[ 6.0001 20.0001 10.0001 ... 10.0001 13.0001 10.0001]


array([ 6.0001, 20.0001, 10.0001, ..., 10.0001, 13.0001, 10.0001])

In [None]:
# Output 3 of `load_movielens_ratings`: the encoded movie-genre values.
mov_gen_int = test_dataset_load[3]
print(mov_gen_int)
mov_gen_int

[7.00010e+00 4.00010e+00 4.00010e+00 ... 1.00001e+01 1.00000e-04
 4.00010e+00]


array([7.00010e+00, 4.00010e+00, 4.00010e+00, ..., 1.00001e+01,
       1.00000e-04, 4.00010e+00])

# [3] Ranking Data (listwise)

> TODO: clean-up / optimize

**define how many items should be in each example's item list:**

In [None]:
# Number of items in each example's item list; later passed as
# `num_examples_per_list` to `data_utils.create_listwise_ds`.
NUM_EXAMPLES_PER_LIST = 5  # 3 | 5

**Write this value to the config file for the `data_utils` functions**

In [None]:
# Render the list size as Python source for the generated `utils_config` module.
config = f"""
NUM_EXAMPLES_PER_LIST = {NUM_EXAMPLES_PER_LIST}
"""
# TODO - cleanup
utils_config_path = f"{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/utils_config.py"
with open(utils_config_path, "w") as config_file:
    config_file.write(config)

In [None]:
import importlib

from src.per_arm_rl import utils_config as utils_config

# `utils_config.py` was just rewritten by the previous cell. A plain `import`
# returns the cached module if it was already loaded in this kernel, so reload
# it to make sure the value shown below is the one on disk.
utils_config = importlib.reload(utils_config)

utils_config.NUM_EXAMPLES_PER_LIST

5

In [None]:
# train = shuffled.take(80_000)
# val = shuffled.skip(80_000).take(20_000)

In [None]:
# Pretty-print a single one-element batch to inspect the schema of `train`.
first_batches = train.batch(1).take(1)
for first_batch in first_batches:
    pprint(first_batch)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([35.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'898'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Postman, The (1997)'], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([44.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([885409515])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([False])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([14])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'marketing'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,
 'user_zip_cod

## Create train listwise ds

In [None]:
# Number of items in each example's item list.
# NOTE(review): this repeats the assignment already made under
# "[3] Ranking Data (listwise)"; keep the two values in sync or drop one.
NUM_EXAMPLES_PER_LIST = 5  # 3 | 5

In [None]:
# Render the list size as Python source for the generated `utils_config` module.
config = f"""
NUM_EXAMPLES_PER_LIST = {NUM_EXAMPLES_PER_LIST}
"""
# TODO - cleanup
utils_config_path = f"{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/utils_config.py"
with open(utils_config_path, "w") as config_file:
    config_file.write(config)

In [None]:
import importlib

from src.per_arm_rl import utils_config as utils_config

# `utils_config.py` was just rewritten by the previous cell. A plain `import`
# returns the cached module if it was already loaded in this kernel, so reload
# it to make sure the value shown below is the one on disk.
utils_config = importlib.reload(utils_config)

utils_config.NUM_EXAMPLES_PER_LIST

In [None]:
# Training data: 50 lists are sampled per user, and each list holds
# NUM_EXAMPLES_PER_LIST (3 | 5) movies drawn from the movies that user rated.
train_lw = data_utils.create_listwise_ds(
    train,
    seed=42,
    num_list_per_user=50,
    num_examples_per_list=NUM_EXAMPLES_PER_LIST,
)

In [None]:
# Pretty-print one listwise example (the one after skipping the first 7).
# The loop variable `example` is read again by the next cell.
for example in train_lw.skip(7).take(1):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([4, 7, 7, 2, 7])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'294', b'690', b'1176', b'538', b'310'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'681'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([5., 4., 4., 3., 3.], dtype=float32)>}


In [None]:
example["user_rating"].numpy()

array([5., 4., 4., 3., 3.], dtype=float32)

In [None]:
# Count the listwise examples by streaming through the dataset instead of
# materialising every element in a list first.
sum(1 for _ in train_lw)

47150

### write TF records file

In [None]:
DATA_PATH

'gs://rec-bandits-v2-hybrid-vertex-bucket/data'

In [None]:
# train split
TF_RECORD_FILE_lw_train = f"ml-100k-listwise-{NUM_EXAMPLES_PER_LIST}n-train.tfrecord"
LOCAL_TF_RECORD_lw_train = f"./{TF_RECORD_FILE_lw_train}"

In [None]:
# Write the listwise train dataset to the local TFRecord file.
# `list_wise=True` presumably selects the listwise serialisation - verify in `data_utils`.
data_utils.write_tfrecords(LOCAL_TF_RECORD_lw_train, train_lw, list_wise=True)

#### validate TRAIN TF record file(s)

In [None]:
# test TF record local
tmp_lw_dataset = tf.data.TFRecordDataset(LOCAL_TF_RECORD_lw_train)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 7,  4,  7,  7, 10]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'898', b'294', b'258', b'1176', b'682']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[4., 5., 1., 4., 1.]], dtype=float32)>}


In [None]:
# Object prefix (inside the bucket) and full gs:// URI for the listwise train split.
lw_train_prefix = f"{DATA_GCS_PREFIX}/listwise-{NUM_EXAMPLES_PER_LIST}n-train"
LW_TRAIN_DATA_PATH = f"{BUCKET_URI}/{lw_train_prefix}"

# Copy the local train TFRecord file under that prefix (quiet mode).
! gsutil -q cp $LOCAL_TF_RECORD_lw_train $LW_TRAIN_DATA_PATH/

In [None]:
# ! gsutil ls $LW_TRAIN_DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-5n-train/ml-100k-listwise-5n-train.tfrecord


In [None]:
test_lw_files = []

# Collect the gs:// URI of every TFRecord object directly under the train prefix.
for blob in storage_client.list_blobs(
    BUCKET_NAME, prefix=f"{lw_train_prefix}/", delimiter="/"
):
    if ".tfrecord" in blob.name:
        # Build the URI from the bucket and object name. Rewriting
        # `blob.public_url` is fragile: that URL percent-encodes the object
        # name and its host depends on the client's API endpoint.
        test_lw_files.append(f"gs://{blob.bucket.name}/{blob.name}")

# Read the uploaded file(s) back from GCS and parse them with the listwise parser.
tmp_lw_dataset = tf.data.TFRecordDataset(test_lw_files)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

# Pretty-print the first one-element batch as a sanity check.
for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 7,  4,  7,  7, 10]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'898', b'294', b'258', b'1176', b'682']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'681'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[4., 5., 1., 4., 1.]], dtype=float32)>}


## Create val listwise ds

In [None]:
# Validation data: a single list is sampled per user, each holding
# NUM_EXAMPLES_PER_LIST movies.
val_lw = data_utils.create_listwise_ds(
    val,
    seed=42,
    num_list_per_user=1,
    num_examples_per_list=NUM_EXAMPLES_PER_LIST,
)

In [None]:
# Pretty-print the first three listwise validation examples.
for example in val_lw.take(3):
    pprint(example)

{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 3,  0,  0,  0, 19])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'94', b'245', b'403', b'50', b'470'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'346'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([3., 4., 3., 5., 3.], dtype=float32)>}
{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([7, 0, 0, 2, 7])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'678', b'127', b'343', b'1', b'125'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'602'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([4., 5., 2., 4., 4.], dtype=float32)>}
{'movie_genres': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([4, 7, 4, 0, 4])>,
 'movie_id': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'168', b'1053', b'26', b'110', b'1048'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=st

In [None]:
# Count the listwise validation examples by streaming through the dataset
# instead of materialising every element in a list first.
sum(1 for _ in val_lw)

836

### write TF records file

In [None]:
# val split
TF_RECORD_FILE_lw_val = f"ml-100k-listwise-{NUM_EXAMPLES_PER_LIST}n-val.tfrecord"
LOCAL_TF_RECORD_lw_val = f"./{TF_RECORD_FILE_lw_val}"

In [None]:
# Write the listwise validation dataset to the local TFRecord file.
# `list_wise=True` presumably selects the listwise serialisation - verify in `data_utils`.
data_utils.write_tfrecords(LOCAL_TF_RECORD_lw_val, val_lw, list_wise=True)

#### validate VAL TF record file(s)

In [None]:
# test TF record local
tmp_lw_dataset = tf.data.TFRecordDataset(LOCAL_TF_RECORD_lw_val)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 3,  0,  0,  0, 19]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'94', b'245', b'403', b'50', b'470']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[3., 4., 3., 5., 3.]], dtype=float32)>}


In [None]:
# Object prefix (inside the bucket) and full gs:// URI for the listwise val split.
lw_val_prefix = f"{DATA_GCS_PREFIX}/listwise-{NUM_EXAMPLES_PER_LIST}n-val"
LW_VAL_DATA_PATH = f"{BUCKET_URI}/{lw_val_prefix}"

# Copy the local val TFRecord file under that prefix (quiet mode).
! gsutil -q cp $LOCAL_TF_RECORD_lw_val $LW_VAL_DATA_PATH/

In [None]:
# ! gsutil ls $LW_VAL_DATA_PATH

gs://rec-bandits-v2-hybrid-vertex-bucket/data/listwise-5n-val/ml-100k-listwise-5n-val.tfrecord


In [None]:
test_lw_files = []

# Collect the gs:// URI of every TFRecord object directly under the val prefix.
for blob in storage_client.list_blobs(
    BUCKET_NAME, prefix=f"{lw_val_prefix}/", delimiter="/"
):
    if ".tfrecord" in blob.name:
        # Build the URI from the bucket and object name. Rewriting
        # `blob.public_url` is fragile: that URL percent-encodes the object
        # name and its host depends on the client's API endpoint.
        test_lw_files.append(f"gs://{blob.bucket.name}/{blob.name}")

# Read the uploaded file(s) back from GCS and parse them with the listwise parser.
tmp_lw_dataset = tf.data.TFRecordDataset(test_lw_files)
tmp_lw_dataset = tmp_lw_dataset.map(data_utils.parse_lw_tfrecord)

# Pretty-print the first one-element batch as a sanity check.
for x in tmp_lw_dataset.batch(1).take(1):
    pprint(x)

{'movie_genres': <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 3,  0,  0,  0, 19]])>,
 'movie_id': <tf.Tensor: shape=(1, 5), dtype=string, numpy=array([[b'94', b'245', b'403', b'50', b'470']], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'346'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[3., 4., 3., 5., 3.]], dtype=float32)>}


### get uniques

In [None]:
# Project the ratings down to movie IDs, pull them in batches of 1000 and
# de-duplicate the concatenated result.
movies = ratings.map(lambda rating: rating["movie_id"])
movie_id_batches = [id_batch for id_batch in movies.batch(1000)]
unique_movie_ids = np.unique(np.concatenate(movie_id_batches))

len(unique_movie_ids)

1682

In [None]:
# Project the ratings down to user IDs, pull them in batches of 1000 and
# de-duplicate the concatenated result.
users = ratings.map(lambda rating: rating["user_id"])
user_id_batches = [id_batch for id_batch in users.batch(1000)]
unique_user_ids = np.unique(np.concatenate(user_id_batches))

len(unique_user_ids)

943

In [None]:
# ! gsutil -m cp -r gs://$BUCKET_NAME/$DATA_GCS_PREFIX/train gs://$BUCKET_NAME/$DATA_GCS_PREFIX/train_v1
# ! gsutil -m cp -r gs://$BUCKET_NAME/$DATA_GCS_PREFIX/val gs://$BUCKET_NAME/$DATA_GCS_PREFIX/val_v1

# Done

**Finished**