# Preparing off-policy training data for RL

> "Off-policy" refers to the situation where for a data record, given its observation, the current policy in training might not choose the same action as the one in said data record

## Load env config

* use the prefix from `00-env-setup`

In [1]:
PREFIX = 'mabv1'

**run the next cell to populate env vars**

In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "wortz-project-352116"
PROJECT_NUM              = "679926387543"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "679926387543-compute@developer.gserviceaccount.com"

PREFIX                   = "mabv1"
VERSION                  = "v1"

BUCKET_NAME              = "mabv1-wortz-project-352116-bucket"
BUCKET_URI               = "gs://mabv1-wortz-project-352116-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://mabv1-wortz-project-352116-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/679926387543/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_ID      = "wortz-project-352116.movielens_dataset_mabv1"
BIGQUERY_TABLE_ID        = "wortz-project-352116.movielens_dataset_mabv1.training_dataset"

REPO_D

In [3]:
! gsutil ls $BUCKET_URI

gs://mabv1-wortz-project-352116-bucket/config/
gs://mabv1-wortz-project-352116-bucket/data/
gs://mabv1-wortz-project-352116-bucket/neural-linear-bandits-v1/
gs://mabv1-wortz-project-352116-bucket/vocabs/


## imports

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [5]:
import numpy as np
import sys
import time
from pprint import pprint
from IPython import display
import matplotlib.pyplot as plt

import logging
logging.disable(logging.WARNING)

import tensorflow as tf
import tensorflow_datasets as tfds

# GPU
from numba import cuda
import gc

# google cloud
from google.cloud import aiplatform, storage

caused by: ['/opt/conda/envs/tensorflow/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/envs/tensorflow/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
    
gpus

[]

In [7]:
# device = cuda.get_current_device()
# device.reset()
# gc.collect()

### Initialize GCP clients

In [8]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

### Create `data_utils.py`

> this will be used to support data processing throughout the development workflow

In [9]:
# REPO_DOCKER_PATH_PREFIX = 'src'
# RL_SUB_DIR              = 'per_arm_rl'

In [10]:
# ! rm -rf {REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}
# ! mkdir -p {REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}
# ! touch {REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/__init__.py

In [11]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/data_utils.py
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

import os
import numpy as np
from typing import Dict

import tensorflow as tf

# ============================================
# features
# ============================================

def get_all_features():
    
    feats = {
        # user - global context features
        'user_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
        'user_rating': tf.io.FixedLenFeature(shape=(), dtype=tf.float32),
        'bucketized_user_age': tf.io.FixedLenFeature(shape=(), dtype=tf.float32),
        'user_occupation_text': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
        'user_occupation_label': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
        'user_zip_code': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
        'user_gender': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
        'timestamp': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),

        # movie - per arm features
        'movie_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
        'movie_title': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
        'movie_genres': tf.io.FixedLenFeature(shape=(1,), dtype=tf.int64),
    }
    
    return feats

# ================================================
# converting features to `tf.train.Example` proto
# ================================================

def _bytes_feature(value):
    """
    Get byte features
    """
    # value = tf.io.serialize_tensor(value)
    # value = value.numpy()
    if type(value) == list:
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
    else:
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[i.numpy() for i in [value]]))

def _int64_feature(value):
    """
    Get int64 feature
    """
    if type(value) == list:
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(v) for v in value]))
    else:
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    
def _int64_list_feature(value):
    """
    Get int64 list feature
    """
    value = value.numpy().tolist()[0]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _simple_string(value):
    """
    Gender Feature - True = Male in the training dataset
    """
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))

def _string_array(value, shape=1):
    """
    Returns a bytes_list from a string / byte.
    """
    value = value.numpy() # .tolist()[0]
    # try:
    #     value = value.numpy()
    # except:
    #     pass
    if type(value) == list:
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(v) for v in value]))
    else:
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))

def _float_feature(value, shape=1):
    """
    Returns a float_list from a float / double.
    """
    if type(value) == list:
        return tf.train.Feature(float_list=tf.train.FloatList(value=value))
    else:
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    

def build_example(data) -> tf.train.Example:
    """
    Returns: A `tf.train.Example` object holding the same data as `data_row`.
    """
    feature = {
        # user - global context features 
        "user_id": _bytes_feature(data['user_id'])
        , "user_rating": _float_feature(data['user_rating'])
        , "bucketized_user_age": _float_feature(data['bucketized_user_age'])
        , "user_occupation_text": _bytes_feature(data['user_occupation_text'])
        , "user_occupation_label": _int64_feature(data['user_occupation_label'])
        , "user_zip_code": _bytes_feature(data['user_zip_code'])
        , "user_gender": _string_array(data['user_gender'])
        , "timestamp": _int64_feature(data['timestamp'])
        
        # movie - per arm features
        , "movie_id": _bytes_feature(data['movie_id'])
        , "movie_title": _bytes_feature(data['movie_title'])
        , "movie_genres": _int64_list_feature(data['movie_genres'])
    }
    example_proto = tf.train.Example(
        features=tf.train.Features(feature=feature)
    )
    return example_proto


# ============================================
# tf data parsing functions
# ============================================
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

def parse_tfrecord(example):
    """
    Reads a serialized example from GCS and converts to tfrecord
    """
    feats = get_all_features()
    
    example = tf.io.parse_example(
        example,
        feats
        # features=feats
    )
    return example

# data loading and parsing
def full_parse(data):
    # used for interleave - takes tensors and returns a tf.dataset
    data = tf.data.TFRecordDataset(data)
    return data

# ============================================
# TF lookup dictionary
# ============================================

def get_dictionary_lookup_by_tf_data_key(key, dataset) -> Dict:
    tensor = dataset.map(lambda x: x[key])
    unique_elems = set()
    for x in tensor:
        val = x.numpy()
        if type(val) is np.ndarray: # if multi dimesnional only grab first one
            val = val[0]
        unique_elems.add(val)
    
    #return a dictionary of keys by integer values for the feature space
    return {val: i for i, val in enumerate(unique_elems)}


# ============================================
# TF-Record Writer
# ============================================
def write_tfrecords(tfrecord_file, dataset):
    with tf.io.TFRecordWriter(tfrecord_file) as writer:
        for data_row in dataset:
            example = build_example(data_row)
            writer.write(example.SerializeToString())

# ============================================
# load movielens
# ============================================
def load_movielens_ratings(
    ratings_dataset
    , num_users: int
    , num_movies: int
    , user_age_lookup_dict: dict
    , user_occ_lookup_dict: dict
    , movie_gen_lookup_dict: dict
):
    """
    > loads (wide) movielens ratings data 
    > returns ratings matrix
    """
    ratings_matrix = np.zeros([num_users, num_movies])
    
    local_data = ratings_dataset.map(
        lambda x: {
            'user_id': x['user_id']
            ,'movie_id':  x['movie_id']
            ,'user_rating':  x['user_rating']
            ,'bucketized_user_age': x['bucketized_user_age']
            ,'user_occupation_text': x['user_occupation_text']
            ,'movie_genres': x['movie_genres'][0]
        }
    )
    user_age_int = []
    user_occ_int = []
    mov_gen_int = []
    
    for row in local_data:
        ratings_matrix[
            int(row['user_id'].numpy()) - 1
            , int(row['movie_id'].numpy()) - 1
        ] = float(row['user_rating'].numpy())
        
        user_age_int.append(
            float(user_age_lookup_dict[row['bucketized_user_age'].numpy()]) + .0001
        )
        user_occ_int.append(
            float(user_occ_lookup_dict[row['user_occupation_text'].numpy()]) + .0001
        )
        mov_gen_int.append(
            float(movie_gen_lookup_dict[row['movie_genres'].numpy()]) + .0001
        ) 
    return ratings_matrix, np.array(user_age_int), np.array(user_occ_int), np.array(mov_gen_int)

Overwriting src/per_arm_rl/data_utils.py


## Prepare Movielens dataset

### load data from Tensorflow Datasets

* see [TFDS documentation](https://www.tensorflow.org/datasets/catalog/movielens#movielens100k-ratings) for more details on this dataset, feature descriptions, and other versions

In [12]:
ratings = tfds.load("movielens/100k-ratings", split="train")

for x in ratings.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([46.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,

### write dataset to TF Records

In [13]:
from src.per_arm_rl import data_utils # as data_utils

TF_RECORD_FILE = "ml-ratings-100k-full.tfrecord"

In [14]:
data_utils.write_tfrecords(TF_RECORD_FILE, ratings)

### save TF Records to GCS

In [15]:
LOCAL_TF_RECORD = f"./{TF_RECORD_FILE}"

! gsutil -q cp $LOCAL_TF_RECORD $DATA_PATH/

! gsutil ls $DATA_PATH

gs://mabv1-wortz-project-352116-bucket/data/ml-ratings-100k-full.tfrecord
gs://mabv1-wortz-project-352116-bucket/data/train/
gs://mabv1-wortz-project-352116-bucket/data/val/


In [16]:
#save to train
! gsutil -q cp $LOCAL_TF_RECORD $DATA_PATH/train/

## validate TF Records

In [21]:
train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/'):
    if '.tfrecord' in blob.name:
        train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
train_files

['gs://mabv1-wortz-project-352116-bucket/data/ml-ratings-100k-full.tfrecord']

In [22]:
train_dataset = tf.data.TFRecordDataset(train_files)

train_dataset

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [23]:
train_dataset = train_dataset.map(data_utils.parse_tfrecord)

for x in train_dataset.batch(3).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(3,), dtype=float32, numpy=array([45., 25., 18.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(3, 1), dtype=int64, numpy=
array([[7],
       [4],
       [4]])>,
 'movie_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'357', b'709', b'412'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b"One Flew Over the Cuckoo's Nest (1975)",
       b'Strictly Ballroom (1992)', b'Very Brady Sequel, A (1996)'],
      dtype=object)>,
 'timestamp': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([879024327, 875654590, 882075110])>,
 'user_gender': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'True', b'True', b'True'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'138', b'92', b'301'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([ 4,  5, 17])>,
 'user_occupation_text': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b

## Generate look-up dicts

**TODO** - use more Tensorflow native method for generating vocabs and stats

### unique IDs

In [24]:
# Get the unique movies and users
unique_movie_ids = train_dataset.map(lambda x: x["movie_id"])

unique_movie_ids = np.unique([x.numpy() for x in unique_movie_ids])

MOVIELENS_NUM_MOVIES = len(unique_movie_ids)

print(f"len(unique_movie_ids) : {len(unique_movie_ids)}")
print(f"unique_movie_ids      : {unique_movie_ids[:2]}")

len(unique_movie_ids) : 1682
unique_movie_ids      : [b'1' b'10']


In [25]:
# Get the unique movies and users
# unique_movie_ids = ratings.map(lambda x: x["movie_id"])
unique_occ_ids = train_dataset.map(lambda x: x["user_occupation_text"])

unique_occ_ids = np.unique([x.numpy() for x in unique_occ_ids])

NUM_OCCS = len(unique_occ_ids)

print(f"len(unique_occ_ids) : {len(unique_occ_ids)}")
print(f"unique_occ_ids      : {unique_occ_ids[:2]}")

len(unique_occ_ids) : 21
unique_occ_ids      : [b'administrator' b'artist']


In [26]:
# unique_user_ids = ratings.map(lambda x: x["user_id"])
unique_user_ids = train_dataset.map(lambda x: x["user_id"])

unique_user_ids = np.unique([x.numpy() for x in unique_user_ids])

MOVIELENS_NUM_USERS = len(unique_user_ids)

print(f"len(unique_user_ids) : {len(unique_user_ids)}")
print(f"unique_user_ids      : {unique_user_ids[:2]}")

len(unique_user_ids) : 943
unique_user_ids      : [b'1' b'10']


### lookup dictionaries

In [27]:
USER_AGE_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'bucketized_user_age'
    , dataset= train_dataset
)

USER_AGE_DIM = len(USER_AGE_LOOKUP)
print(f"USER_AGE_DIM: {USER_AGE_DIM}")

USER_AGE_LOOKUP

USER_AGE_DIM: 7


{1.0: 0, 35.0: 1, 45.0: 2, 18.0: 3, 50.0: 4, 56.0: 5, 25.0: 6}

In [28]:
USER_OCC_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'user_occupation_text'
    , dataset= train_dataset
)
USER_OCC_DIM = len(USER_OCC_LOOKUP)
print(f"USER_OCC_DIM: {USER_OCC_DIM}")

USER_OCC_LOOKUP

USER_OCC_DIM: 21


{b'homemaker': 0,
 b'technician': 1,
 b'engineer': 2,
 b'administrator': 3,
 b'writer': 4,
 b'retired': 5,
 b'librarian': 6,
 b'entertainment': 7,
 b'scientist': 8,
 b'doctor': 9,
 b'marketing': 10,
 b'programmer': 11,
 b'lawyer': 12,
 b'educator': 13,
 b'executive': 14,
 b'other': 15,
 b'healthcare': 16,
 b'salesman': 17,
 b'artist': 18,
 b'none': 19,
 b'student': 20}

In [None]:
MOVIE_GEN_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'movie_genres'
    , dataset= train_dataset
)
MOVIE_GEN_DIM = len(MOVIE_GEN_LOOKUP)
print(f"MOVIE_GEN_DIM: {MOVIE_GEN_DIM}")

MOVIE_GEN_LOOKUP

## create `data_config.py`

> write data config for subsequent notebooks

In [None]:
config = f'''
USER_AGE_LOOKUP       = {USER_AGE_LOOKUP}
USER_AGE_DIM          = {USER_AGE_DIM}

USER_OCC_LOOKUP       = {USER_OCC_LOOKUP}
USER_OCC_DIM          = {USER_OCC_DIM}

MOVIE_GEN_LOOKUP      = {MOVIE_GEN_LOOKUP}
MOVIE_GEN_DIM         = {MOVIE_GEN_DIM}

MOVIELENS_NUM_MOVIES  = {MOVIELENS_NUM_MOVIES}
MOVIELENS_NUM_USERS   = {MOVIELENS_NUM_USERS}
'''
# TODO - cleanup
with open(f'{REPO_DOCKER_PATH_PREFIX}/{RL_SUB_DIR}/data_config.py', 'w') as f:
    f.write(config)

### Validate creating the ratings matrix

In [None]:
from src.per_arm_rl import data_config # as data_config

In [None]:
data_config.USER_AGE_LOOKUP

In [None]:
data_config.USER_OCC_LOOKUP

In [None]:
data_config.MOVIE_GEN_LOOKUP

In [None]:
test_dataset_load = data_utils.load_movielens_ratings(
    ratings_dataset = train_dataset
    , num_users = data_config.MOVIELENS_NUM_USERS
    , num_movies = data_config.MOVIELENS_NUM_MOVIES
    , user_age_lookup_dict = data_config.USER_AGE_LOOKUP
    , user_occ_lookup_dict = data_config.USER_OCC_LOOKUP
    , movie_gen_lookup_dict = data_config.MOVIE_GEN_LOOKUP
)

test_dataset_load

In [None]:
ratings_matrix = test_dataset_load[0]
print(ratings_matrix.shape)
ratings_matrix

# Data splits

In [None]:
from src.per_arm_rl import data_utils

tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [None]:
for x in train.batch(1).take(1):
    pprint(x)

In [None]:
for x in test.batch(1).take(1):
    pprint(x)

### write TF Records

In [None]:
TF_RECORD_FILE_train = "ml-ratings-100k-train.tfrecord"
LOCAL_TF_RECORD_train = f"./{TF_RECORD_FILE_train}"

TF_RECORD_FILE_val = "ml-ratings-100k-val.tfrecord"
LOCAL_TF_RECORD_val = f"./{TF_RECORD_FILE_val}"

TRAIN_DATA_PATH = f"{DATA_PATH}/train"
VAL_DATA_PATH = f"{DATA_PATH}/val"

print(f"TRAIN_DATA_PATH  : {TRAIN_DATA_PATH}")
print(f"VAL_DATA_PATH    : {VAL_DATA_PATH}")

In [None]:
data_utils.write_tfrecords(TF_RECORD_FILE_train, train)

In [None]:
data_utils.write_tfrecords(TF_RECORD_FILE_val, test)

In [None]:
! gsutil -q cp $LOCAL_TF_RECORD_train $TRAIN_DATA_PATH/

! gsutil -q cp $LOCAL_TF_RECORD_val $VAL_DATA_PATH/

In [None]:
! gsutil ls $DATA_PATH

### validate TF Records

In [None]:
## validate

train_files = []
for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{DATA_GCS_PREFIX}/train'):
    if '.tfrecord' in blob.name:
        train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
train_files

In [None]:
train_dataset = tf.data.TFRecordDataset(train_files)

train_dataset = train_dataset.map(data_utils.parse_tfrecord)

for x in train_dataset.batch(1).take(1):
    pprint(x)

In [None]:
# Get the unique movies and users
unique_user_ratings = train_dataset.map(lambda x: x["user_rating"])

unique_user_ratings = np.unique([x.numpy() for x in unique_user_ratings])

unique_user_ratings

# EDA - TODO

* RLDS dataset utils [guide](https://colab.sandbox.google.com/github/google-research/rlds/blob/main/rlds/examples/rlds_performance.ipynb#scrollTo=nGMkkZI9gGVD)
* RLDS [examples](https://colab.sandbox.google.com/github/google-research/rlds/blob/main/rlds/examples/rlds_examples.ipynb#scrollTo=nGMkkZI9gGVD)

### size of dataset

> Just so that we know how big is the dataset we play with, lets first compute the number of episodes and steps.

In [None]:
# size of dataset
episodes = 0
steps = 0
for episode in dataset:
    episodes += 1
    steps += episode[rlds.STEPS].cardinality()

print(f'Episodes: {episodes}, steps: {steps}')

### Computing the total reward

We will experiment with RL dataset pipeline performance by trying to compute a sum of steps' rewards returned in all episodes of the example dataset. The starting point implementation is a simple Python's double loop over episodes and steps:

In [None]:
def compute_return(episode_dataset):
  result = 0
  for episode in episode_dataset:
    for step in episode[rlds.STEPS]:
      result += step[rlds.REWARD]
  return result

benchmark(compute_return, dataset)

### Prefetching
 
The double loop from the example above is very simple, while execution time is quite significant given the total number of steps in the dataset. One could expect the source of slowness is retrieval of elements from the dataset. If so, prefetching a dataset could help.

In [None]:
def compute_return(episode_dataset):
    result = 0
    for episode in episode_dataset.prefetch(2):
        for step in episode[rlds.STEPS].prefetch(2):
            result += step[rlds.REWARD]
    return result

benchmark(compute_return, dataset)

Python loop can be replaced with a [tf.data.Dataset.reduce](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#reduce) operation.

In [None]:
def episode_return_sum(episode):
    return episode[rlds.STEPS].reduce(np.float32(0), lambda x, step: step[rlds.REWARD] + x)

def compute_return(episode_dataset):
    return episode_dataset.reduce(np.float32(0), lambda x, episode: episode_return_sum(episode) + x)

benchmark(compute_return, dataset)

### Vectorized transformations

An example we analyzed so far focused on computing aggregated statistics for a given dataset. Sometimes it is required to perform custom per-step modifications of the dataset instead. For that reason RLDS provides *map_nested_steps* operation that maintains the episodic structure. In this example, we will try to implement a simple transformation ourselves with the use of [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) operators. Lets implement a transformation which changes a given episode dataset into a collection of steps with doubled reward values.

In [None]:
def double_reward(step):
    step[rlds.REWARD] *= 2
    return step

double_reward_dataset = dataset.flat_map(lambda x: x[rlds.STEPS]).map(lambda step : double_reward(step))

Lets now measure the performance of the new dataset:

In [None]:
def compute_return(step_dataset):
    return step_dataset.batch(100).reduce(np.float32(0), lambda x, step: tf.math.reduce_sum(step[rlds.REWARD]) + x)

benchmark(compute_return, double_reward_dataset)

Similarly to the previous examples, the main bottleneck is the per-step call of the *double_reward* function. We can reduce that overhead by first batching multiple steps, then applying vectorized version of the *double_reward* and un-batching the result.

In [None]:
def vectorized_double_reward(steps):
    return tf.vectorized_map(double_reward, steps)

double_reward_dataset = dataset.flat_map(lambda x: x[rlds.STEPS]).batch(100).map(vectorized_double_reward).unbatch()

benchmark(compute_return, double_reward_dataset)

# Vocab Generation - TODO

* see [working with preprocessing layers](https://www.tensorflow.org/guide/keras/preprocessing_layers)

In [49]:
EMBEDDING_SIZE = 128

for x in ratings.batch(1).take(1):
    pprint(x)

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([46.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.], dtype=float32)>,

In [50]:
str_features = [
    "movie_id", "user_id", "user_occupation_text"
    # "user_zip_code", "user_occupation_text", "movie_title"
]

int_features = [
    "timestamp", "movie_genres", 
    # "user_occupation_label", "user_gender"
]

float_features = [
    "bucketized_user_age", "user_rating",
    # "raw_user_age"
]

feature_names = str_features + int_features + float_features

print(f"str_features  : {str_features}")
print(f"int_features  : {int_features}")
print(f"float_features  : {float_features}")
print(f"feature_names : {feature_names}")

str_features  : ['movie_id', 'user_id', 'user_occupation_text']
int_features  : ['bucketized_user_age', 'timestamp', 'movie_genres']
feature_names : ['movie_id', 'user_id', 'user_occupation_text', 'bucketized_user_age', 'timestamp', 'movie_genres']


In [None]:
# vocabularies = {}

# for feature_name in feature_names:
#     vocab = ratings.batch(1_000_000).map(lambda x: x[feature_name])
#     vocabularies[feature_name] = np.unique(np.concatenate(list(vocab)))

### movie title lookup

* see [Turning categorical features into embeddings](https://www.tensorflow.org/recommenders/examples/featurization#turning_categorical_features_into_embeddings) for details

In [38]:
movie_title_lookup = tf.keras.layers.StringLookup()
movie_title_lookup

<keras.layers.preprocessing.string_lookup.StringLookup at 0x7f74c8550eb0>

In [41]:
movie_title_lookup.adapt(ratings.map(lambda x: x["movie_title"]))
# movie_title_lookup.adapt(train_dataset.map(lambda x: x["movie_title"]))

print(f"Vocabulary: {movie_title_lookup.get_vocabulary()[:3]}")

Vocabulary: ['[UNK]', 'Star Wars (1977)', 'Contact (1997)']


Once we have this we can use the layer to translate raw tokens to embedding ids:

In [42]:
movie_title_lookup(["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)"])

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([ 1, 58])>

In [43]:
movie_title_lookup.vocab_size()

1665

#### define embeddings

In [45]:
movie_title_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=movie_title_lookup.vocab_size(),
    output_dim=EMBEDDING_SIZE
)

movie_title_embedding

<keras.layers.core.embedding.Embedding at 0x7f74c8558160>

**example embeddings from movie title:**

In [46]:
movie_title_model = tf.keras.Sequential([movie_title_lookup, movie_title_embedding])

movie_title_model(["Star Wars (1977)"])

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[ 0.02504338, -0.0012939 ,  0.04811487, -0.02999126, -0.0292394 ,
         0.01156382,  0.03041414, -0.02798716,  0.00799823,  0.04248938,
        -0.04263956, -0.01784793,  0.02749519, -0.00937803, -0.01001156,
        -0.0056101 ,  0.03250836, -0.00172166, -0.0108378 , -0.04652712,
         0.04713202,  0.00197013, -0.0049839 , -0.01328664, -0.00308267,
         0.04895042, -0.04782685,  0.00406177,  0.03135257, -0.02687558,
         0.01241463, -0.04897828,  0.01407217, -0.0348866 , -0.0373833 ,
        -0.00677862,  0.01129251,  0.00500541,  0.01277152,  0.01554528,
        -0.01913993, -0.02839179, -0.00459641,  0.02090592, -0.00293946,
         0.02296934, -0.02400823, -0.03008862,  0.04921992, -0.00659008,
         0.04569877,  0.02329716,  0.00493069, -0.01955005,  0.02327912,
        -0.02437162,  0.04842639, -0.01809146, -0.01159244, -0.02352436,
         0.00771639, -0.04491389, -0.02470828,  0.02996274, -0.0112378 ,
 

### movie genres

> ragged text layer

In [None]:
# genre_text_vectorizer = tf.keras.layers.TextVectorization() - no genre text available

In [51]:
# genre_lookup = tf.keras.layers.IntegerLookup()
# genre_lookup

# MOVIE_GEN_DIM = len(MOVIE_GEN_LOOKUP)
# genre_text_vectorizer.adapt(ratings.map(lambda x: x["movie_genres"]))
# # genre_text_vectorizer.adapt(train_dataset.map(lambda x: x["movie_genres"]))

# genre_text_vectorizer

In [52]:
# genre_text_vectorizer = tf.keras.layers.TextVectorization(
#     max_tokens=max_tokens,
#     ngrams=ngrams
# )

In [53]:
# vocab = np.array([vocab_dict['track_name_pl']]).flatten()

In [None]:

# start = time.time()
# text_layer = tf.keras.layers.TextVectorization(
#     max_tokens=max_tokens,
#     ngrams=ngrams
# )
# text_layer.adapt(train_parsed.map(lambda x: tf.reshape(x[f'{feature_name}'], [-1, MAX_PLAYLIST_LENGTH, 1])))
# end = time.time()

# logging.info(f'Layer adapt elapsed time: {round((end - start), 2)} seconds')

### movie ID (action)

In [None]:
FEATURE_ACTION = 'movie_id'
FEATURE_REWARD = 'user_rating'

def process_example(example_proto):
    """
    Returns a dataset of actions for each example.
    """
    _, sequence_feature = tf.io.parse_single_sequence_example(
        example_proto,
        sequence_features={
            FEATURE_ACTION:
                # tf.io.FixedLenSequenceFeature([], dtype=tf.int64, default_value=None),
                tf.io.FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
        })
    actions = sequence_feature[FEATURE_ACTION]
    return tf.data.Dataset.from_tensor_slices(actions)


def generate_vocabulary(
    train_data_path, 
    output_vocabulary_file,
    max_items_to_process=None
):
    """
    Generate a vocabulary file for actions.
    """

    dataset_files = tf.io.gfile.glob(train_data_path)

    example_dataset = tf.data.RecordIODataset(dataset_files)
    action_dataset = example_dataset.interleave(
        process_example,
        cycle_length=16,
        block_length=16,
        num_parallel_calls=10,
        deterministic=False)

    if max_items_to_process:
        action_dataset = action_dataset.take(max_items_to_process)

    action_lookup_layer = IntegerLookup(mask_value=None, num_oov_indices=0)
    action_lookup_layer.adapt(action_dataset)

    action_vocabulary = action_lookup_layer.get_vocabulary()

    with tf.io.gfile.GFile(output_vocabulary_file, 'w') as output_file:
        output_file.write('\n'.join(str(action) for action in action_vocabulary))

### Generate Vocab - sequences - TODO

In [None]:
FEATURE_ACTION = 'movie_id'
FEATURE_REWARD = 'user_rating'

def process_example(example_proto):
    """
    Returns a dataset of actions for each example.
    """
    _, sequence_feature = tf.io.parse_single_sequence_example(
        example_proto
        , sequence_features={
            FEATURE_ACTION:
                tf.io.FixedLenSequenceFeature([], tf.int64, default_value=None)
            , FEATURE_REWARD:
                tf.io.FixedLenSequenceFeature([], tf.int64, default_value=None)
        }
    )
    actions = sequence_feature[FEATURE_ACTION]
    rewards = sequence_feature[FEATURE_REWARD]
    
    return actions, rewards

In [None]:
# %%time
dataset_files = tf.io.gfile.glob(dataset_path)
dataset_files

In [None]:
example_dataset = tf.data.RecordIODataset(dataset_files)
example_dataset

In [None]:
example_dataset = example_dataset.map(
    process_example
    , num_parallel_calls=tf.data.experimental.AUTOTUNE
)
example_dataset

In [None]:
max_items_to_process = 1000000
num_elements = 0
sequence_lengths = []
actions = []
rewards = []

start_time = time.time()
for elem in example_dataset.as_numpy_iterator():
    action, reward = elem
    sequence_lengths.append(len(action))
    actions.append(action)
    rewards.append(reward)
    num_elements += 1
    if num_elements % 10000 == 0:
        print(num_elements)
    if num_elements > max_items_to_process:
        break

print('Num sequences = ', num_elements)