# Movielens download

1. download [MovieLens 1M](https://www.tensorflow.org/datasets/catalog/movielens#movielens1m-movies)) public dataset
2. Write datasets to TF-Records
3. Generate dataset vocabulary and look-up dictionaries

## Notebook config

In [1]:
VERSION        = "v2"                       # TODO
PREFIX         = f'rec-bandits-{VERSION}'   # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [3]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"
DATA_PATH_KFP_DEMO       = "gs://rec-bandits-v2-hybrid-vertex-bucket/data/kfp_demo_data/u.data"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NA

## imports

In [67]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from pprint import pprint
import pickle as pkl
import numpy as np
import pandas as pd

# tensorflow
import tensorflow as tf

# google cloud
from google.cloud import aiplatform, storage

# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)
# bucket = storage_client.bucket(BUCKET_NAME)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=LOCATION)

sys.path.append("..")
from src.data import data_utils as data_utils

### specify data tag

## Training data preperation

The examples generation process performs the following steps:

* Downloads movielens dataset
* Groups movie rating records by `user`, and orders per-user movie rating records by `timestamp`.
* Generates TensorFlow examples with features:
  
```
 feature_description = {
    # context sequence item features
    'context_movie_id': tf.io.FixedLenFeature(shape=(MAX_CONTEXT_LENGTH), dtype=tf.string),
    'context_movie_rating': tf.io.FixedLenFeature(shape=(MAX_CONTEXT_LENGTH), dtype=tf.float32),
    'context_rating_timestamp': tf.io.FixedLenFeature(shape=(MAX_CONTEXT_LENGTH), dtype=tf.int64),
    'context_movie_genre': tf.io.FixedLenFeature(shape=(MAX_GENRE_LENGTH), dtype=tf.string),
    'context_movie_year': tf.io.FixedLenFeature(shape=(MAX_CONTEXT_LENGTH), dtype=tf.int64),
    'context_movie_title': tf.io.FixedLenFeature(shape=(MAX_CONTEXT_LENGTH), dtype=tf.string),

    # target/label item features
    'target_movie_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'target_movie_rating': tf.io.FixedLenFeature(shape=(), dtype=tf.float32),
    'target_rating_timestamp': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
    'target_movie_genres': tf.io.FixedLenFeature(shape=(data_config.MAX_GENRE_LENGTH), dtype=tf.string),
    'target_movie_year': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
    'target_movie_title': tf.io.FixedLenFeature(shape=(), dtype=tf.string),

    # user - global context features
    'user_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'user_gender': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'user_age': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
    'user_occupation_text': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'user_zip_code': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
}
```

There's case that one user activity will have multiple values for a single feature. For example, the movie genre feature in movielens dataset, each movie can have multiple genres. For this case, we suggest to concatenate all movies' genres for the activity sequence. Let's look at one example, if the user activity sequence is:

```
Star Wars: Episode IV - A New Hope (1977), Genres: Action|Adventure|Fantasy
Terminator 2: Judgment Day (1991), Genres: Action|Sci-Fi|Thriller
Jurassic Park (1993), Genres: Action|Adventure|Sci-Fi
```
The context_movie_genre feature will be

> "Action, Adventure, Fantasy, Action, Sci-Fi, Thriller, Action, Adventure, Sci-Fi"


## set download and prep config

In [52]:
EXAMPLE_GEN_GCS_PATH = "data/movielens/m1m"
LOCAL_OUTPUT_DIR     = "local_data/examples"
LOCAL_EXTRACT_DIR    = "local_data/raw"
TF_RECORD_PREFIX     = "ml-1m-gen"

MAX_CONTEXT_LENGTH   = 10
MAX_GENRE_LENGTH     = 10
MIN_TIMELINE_LENGTH  = 3
MIN_RATING           = 1

NUM_TRAIN_RECORDS    = 8
NUM_VAL_RECORDS      = 3

BUILD_VOCAB          = True

## copy these commands to command line terminal

> either navigate to `src/data` directory or add appropriate prefix to `example_gen_movielens` command

In [53]:
print("copy these commands into terminal:\n")
print(f"export PROJECT_ID={PROJECT_ID}")
print(f"export BUCKET_NAME={BUCKET_NAME}")
print(f"export EXAMPLE_GEN_GCS_PATH={EXAMPLE_GEN_GCS_PATH}")
print(f"export TF_RECORD_PREFIX={TF_RECORD_PREFIX}")
print(f"export LOCAL_EXTRACT_DIR={LOCAL_EXTRACT_DIR}")
print(f"export LOCAL_OUTPUT_DIR={LOCAL_OUTPUT_DIR}")
print(f"export MAX_GENRE_LENGTH={MAX_GENRE_LENGTH}")
print(f"export MAX_CONTEXT_LENGTH={MAX_CONTEXT_LENGTH}")
print(f"export MIN_TIMELINE_LENGTH={MIN_TIMELINE_LENGTH}")
print(f"export MIN_RATING={MIN_RATING}")
print(f"export BUILD_VOCAB={BUILD_VOCAB}")
print(f"export NUM_TRAIN_RECORDS={NUM_TRAIN_RECORDS}")
print(f"export NUM_VAL_RECORDS={NUM_VAL_RECORDS}")
# print(f"chmod +x example_gen_movielens.py")

copy these commands into terminal:

export PROJECT_ID=hybrid-vertex
export BUCKET_NAME=rec-bandits-v2-hybrid-vertex-bucket
export EXAMPLE_GEN_GCS_PATH=data/movielens/m1m
export TF_RECORD_PREFIX=ml-1m-gen
export LOCAL_EXTRACT_DIR=data/raw
export LOCAL_OUTPUT_DIR=data/examples
export MAX_GENRE_LENGTH=10
export MAX_CONTEXT_LENGTH=10
export MIN_TIMELINE_LENGTH=3
export MIN_RATING=1
export BUILD_VOCAB=True
export NUM_TRAIN_RECORDS=8
export NUM_VAL_RECORDS=3


In [54]:
data_gen_command = """python example_gen_movielens.py --project_id=$PROJECT_ID --gcs_bucket_name=$BUCKET_NAME \
--gcs_data_path_prefix=$EXAMPLE_GEN_GCS_PATH \
--tfrecord_prefix=$TF_RECORD_PREFIX \
--local_data_dir=$LOCAL_EXTRACT_DIR \
--local_output_dir=$LOCAL_OUTPUT_DIR \
--min_timeline_length=$MIN_TIMELINE_LENGTH \
--max_context_length=$MAX_CONTEXT_LENGTH \
--max_context_movie_genre_length=$MAX_GENRE_LENGTH \
--min_rating=$MIN_RATING \
--train_data_fraction=0.9 \
--build_vocabs=$BUILD_VOCAB \
--num_train_tfrecords=$NUM_TRAIN_RECORDS \
--num_test_tfrecords=$NUM_VAL_RECORDS \
"""
print(data_gen_command)

python example_gen_movielens.py --project_id=$PROJECT_ID --gcs_bucket_name=$BUCKET_NAME --gcs_data_path_prefix=$EXAMPLE_GEN_GCS_PATH --tfrecord_prefix=$TF_RECORD_PREFIX --local_data_dir=$LOCAL_EXTRACT_DIR --local_output_dir=$LOCAL_OUTPUT_DIR --min_timeline_length=$MIN_TIMELINE_LENGTH --max_context_length=$MAX_CONTEXT_LENGTH --max_context_movie_genre_length=$MAX_GENRE_LENGTH --min_rating=$MIN_RATING --train_data_fraction=0.9 --build_vocabs=$BUILD_VOCAB --num_train_tfrecords=$NUM_TRAIN_RECORDS --num_test_tfrecords=$NUM_VAL_RECORDS 


### Create `data_config.py`

In [55]:
data_config = f"""
PROJECT_ID           = "{PROJECT_ID}"
EXAMPLE_GEN_GCS_PATH = "{EXAMPLE_GEN_GCS_PATH}"
TF_RECORD_PREFIX     = "{TF_RECORD_PREFIX}"
MAX_CONTEXT_LENGTH   = {MAX_CONTEXT_LENGTH}
MAX_GENRE_LENGTH     = {MAX_GENRE_LENGTH}
"""
print(data_config)


PROJECT_ID           = "hybrid-vertex"
EXAMPLE_GEN_GCS_PATH = "data/movielens/m1m"
TF_RECORD_PREFIX     = "ml-1m-gen"
MAX_CONTEXT_LENGTH   = 10
MAX_GENRE_LENGTH     = 10



In [56]:
LOCAL_DATA_CONFIG_FILE = f"../src/local_data/data_config.py"

with open(LOCAL_DATA_CONFIG_FILE, 'w') as f:
    f.write(data_config)

In [57]:
sys.path.append("..")
from src.data import data_config as data_config

PROJECT_ID_v1           = data_config.PROJECT_ID
EXAMPLE_GEN_GCS_PATH_v1 = data_config.EXAMPLE_GEN_GCS_PATH
TF_RECORD_PREFIX_v1     = data_config.TF_RECORD_PREFIX
MAX_CONTEXT_LENGTH_v1   = data_config.MAX_CONTEXT_LENGTH
MAX_GENRE_LENGTH_v1     = data_config.MAX_GENRE_LENGTH

print(f"PROJECT_ID_v1           = {PROJECT_ID_v1}")
print(f"EXAMPLE_GEN_GCS_PATH_v1 = {EXAMPLE_GEN_GCS_PATH_v1}")
print(f"TF_RECORD_PREFIX_v1     = {TF_RECORD_PREFIX_v1}")
print(f"MAX_CONTEXT_LENGTH_v1   = {MAX_CONTEXT_LENGTH_v1}")
print(f"MAX_GENRE_LENGTH_v1     = {MAX_GENRE_LENGTH_v1}")

PROJECT_ID_v1           =hybrid-vertex
EXAMPLE_GEN_GCS_PATH_v1 =data/movielens/mv1m-gen
TF_RECORD_PREFIX_v1     =ml-1m-gen
MAX_CONTEXT_LENGTH_v1   =10
MAX_GENRE_LENGTH_v1     =10


Once complete, your local `data` dir should resemble this:

```
├── __init__.py
├── data
│   ├── examples
│   │   ├── csvs
│   │   ├── train
│   │   │   ├── ml-1m-gen-001-of-008.tfrecord
│   │   │   ├── ml-1m-gen-002-of-008.tfrecord
│   │   │   ├── ml-1m-gen-003-of-008.tfrecord
│   │   │   ├── ml-1m-gen-004-of-008.tfrecord
│   │   │   ├── ml-1m-gen-005-of-008.tfrecord
│   │   │   ├── ml-1m-gen-006-of-008.tfrecord
│   │   │   ├── ml-1m-gen-007-of-008.tfrecord
│   │   │   └── ml-1m-gen-008-of-008.tfrecord
│   │   ├── val
│   │   │   ├── ml-1m-gen-001-of-003.tfrecord
│   │   │   ├── ml-1m-gen-002-of-003.tfrecord
│   │   │   └── ml-1m-gen-003-of-003.tfrecord
│   │   └── vocabs
│   │       └── vocab_dict.pkl
│   └── raw
│       └── datasets
│           ├── ml-1m
│           │   ├── README
│           │   ├── movies.dat
│           │   ├── ratings.dat
│           │   └── users.dat
│           └── ml-1m.zip
├── data_config.py
├── data_utils.py
├── example_gen_movielens.py
```

**(optional) delete local**

> paste the following in command line terminal

# Confim tfrecords

In [60]:
GCS_DATA_PATH = f"{BUCKET_URI}/{EXAMPLE_GEN_GCS_PATH}"

print(f"GCS_DATA_PATH : {GCS_DATA_PATH}")

! gsutil ls $GCS_DATA_PATH

GCS_DATA_PATH : gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m
gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/val/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/vocabs/


In [61]:
all_files = []
for blob in storage_client.list_blobs(
    f"{BUCKET_NAME}", 
    prefix=f'{EXAMPLE_GEN_GCS_PATH}/train/', 
    # delimiter='/'
):
    if '.tfrecord' in blob.name:
        all_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
all_files

['gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-001-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-002-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-003-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-004-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-005-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-006-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-007-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-008-of-008.tfrecord']

In [62]:
for blob in storage_client.list_blobs(
    f"{BUCKET_NAME}", 
    prefix=f'{EXAMPLE_GEN_GCS_PATH}/val/', 
    # delimiter='/'
):
    if '.tfrecord' in blob.name:
        all_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
        
all_files

['gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-001-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-002-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-003-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-004-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-005-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-006-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-007-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/ml-1m-gen-008-of-008.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/val/ml-1m-gen-001-of-003.tfrecord',
 'gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/val/ml-1m-gen-002-of-003.tfrecord',
 'gs

In [63]:
mv_dataset = tf.data.TFRecordDataset(all_files)
# train_dataset = train_dataset.map(movielens_ds_utils.parse_tfrecord)
mv_dataset = mv_dataset.map(data_utils._parse_function)

for x in mv_dataset.batch(1).take(1):
    pprint(x)

{'target_movie_genres': <tf.Tensor: shape=(1, 10), dtype=string, numpy=
array([[b'Animation', b"Children's", b'Comedy', b'Musical', b'Romance',
        b'UNK', b'UNK', b'UNK', b'UNK', b'UNK']], dtype=object)>,
 'target_movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'2080'], dtype=object)>,
 'target_movie_rating': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([3.], dtype=float32)>,
 'target_movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Lady and the Tramp (1955)'], dtype=object)>,
 'target_movie_year': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([1955])>,
 'target_rating_timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([964040305])>,
 'user_age': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([18])>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'M'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'4631'], dtype=object)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dty

## Validate vocab file

In [82]:
VOCAB_FILENAME

'vocab_dict.pkl'

In [83]:
EXISTING_VOCAB_FILE = f'gs://{BUCKET_NAME}/{EXAMPLE_GEN_GCS_PATH}/vocabs/{VOCAB_FILENAME}'
print(f"Downloading vocab...")

os.system(f'gsutil -q cp {EXISTING_VOCAB_FILE} .')
print(f"Downloaded vocab from: {EXISTING_VOCAB_FILE}\n")

filehandler = open(VOCAB_FILENAME, 'rb')
vocab_dict = pkl.load(filehandler)
filehandler.close()

for key in vocab_dict.keys():
    pprint(key)

Downloading vocab...
Downloaded vocab from: gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/vocabs/vocab_dict.pkl

'movie_id'
'movie_year'
'movie_genre'
'movie_title'
'user_id'
'user_gender_vocab'
'user_age_vocab'
'user_occ_vocab'
'user_zip_vocab'
'min_timestamp'
'max_timestamp'
'timestamp_buckets'


In [88]:
# vocab_dict['movie_year']

In [94]:
MOVIELENS_NUM_MOVIES = len(vocab_dict['movie_id'])
MOVIELENS_NUM_USERS = len(vocab_dict['user_id'])

print(f"MOVIELENS_NUM_MOVIES : {MOVIELENS_NUM_MOVIES}")
print(f"MOVIELENS_NUM_USERS  : {MOVIELENS_NUM_USERS}")

MOVIELENS_NUM_MOVIES : 3884
MOVIELENS_NUM_USERS  : 6041


# Create look-up dictionaries

In [93]:
USER_AGE_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'user_age'
    , dataset = mv_dataset
)

USER_AGE_DIM = len(USER_AGE_LOOKUP)
print(f"USER_AGE_DIM: {USER_AGE_DIM}")

# USER_AGE_LOOKUP

USER_AGE_DIM: 7


{1: 0, 35: 1, 45: 2, 18: 3, 50: 4, 56: 5, 25: 6}

In [95]:
USER_OCC_LOOKUP = data_utils.get_dictionary_lookup_by_tf_data_key(
    key = 'user_occupation_text'
    , dataset= mv_dataset
)
USER_OCC_DIM = len(USER_OCC_LOOKUP)
print(f"USER_OCC_DIM: {USER_OCC_DIM}")

# USER_OCC_LOOKUP

USER_OCC_DIM: 21


In [98]:
config = f'''
USER_AGE_LOOKUP       = {USER_AGE_LOOKUP}
USER_AGE_DIM          = {USER_AGE_DIM}
USER_OCC_LOOKUP       = {USER_OCC_LOOKUP}
USER_OCC_DIM          = {USER_OCC_DIM}
MOVIELENS_NUM_MOVIES  = {MOVIELENS_NUM_MOVIES}
MOVIELENS_NUM_USERS   = {MOVIELENS_NUM_USERS}
'''
    
with open(f'../{REPO_DOCKER_PATH_PREFIX}/data/mv_lookup_dicts.py', 'w') as f:
    f.write(config)

# Clean-up

In [None]:
# (optional) delete local
# cd src/data
# rm -rf local_data

### move files with `gsutil`

In [89]:
# !gsutil cp gs://$BUCKET_NAME/data/movielens/movielens-1m/vocab_dict.pkl gs://$BUCKET_NAME/data/movielens/movielens-1m-gen/vocab_dict.pkl

**Finished**