# Create preprocessed dataset for optimized data loading

> Preparing the dataset this way mirrors the data preparation needed during online training iterations.

In this notebook we will extend the `02b-example-optimized-dataset` example and preprocess the entire dataset. To avoid storing large intermediate files locally, and prepare for MLOps, we will submit this job as a Vertex Pipeline step

**Things to consider**
* the `batch_size` used will be the `batch_size` used for downstream training steps. This and other params may need to be shared between data preprocessing and training steps. Some include:
  * embedding dimensions for global and per_arm features: `GLOBAL_DIM` and `PER_ARM_DIM`
  * reward calculation
  * feature config
* if we are using an environment to generate simulated examples, these could easily be an input
* trajectory data stored in BigQuery will not have the `BATCH` and `TIME` dimensions (e.g., `[B, T, ...]`), so we'll need to account for this when we parse the trajectories for our TF record

**orchestrate with Vertex Pipelines**

<img src="../imgs/data_pipeline.png" 
     align="center" 
     width="650"
     height="650"/>

## Notebook config

In [1]:
# Version tag that is embedded in the resource prefix below.
VERSION = "v2"  # TODO
# Prefix used to derive resource names (e.g. the GCS bucket) in this notebook.
PREFIX = "rec-bandits-" + VERSION  # TODO

print("PREFIX: " + PREFIX)

PREFIX: rec-bandits-v2


In [2]:
# Resolve the active gcloud project; `!cmd` captures the command's stdout as a
# list of lines, so the project id is the first element.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared environment file from the bucket, show it, and execute it in
# this kernel so the variables it defines become notebook globals.
# NOTE(review): exec() runs whatever is stored at this GCS path -- write access
# to the bucket should be limited to trusted users.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NAME      = "training_dataset"

REPOSITORY               = "rl-movielens-rec-bandits-v2"

DOCKERNAM

## Imports

In [3]:
import os
import sys
import time
import numpy as np
import pickle as pkl
from pprint import pprint
from typing import Callable, Dict, List, Optional, TypeVar, Any

### pipelines
import kfp
from kfp import compiler, dsl, components
from kfp.dsl import component, Metrics
from google_cloud_pipeline_components.types import artifact_types

# logging
import logging
logging.disable(logging.WARNING)

#python warning 
import warnings
warnings.filterwarnings("ignore")

# google cloud
from google.cloud import aiplatform, storage, bigquery

# SDK versions of this kernel; they are also written into pipeline_config.py
# further down in the notebook.
KFP_SDK_VERSION, GCS_SDK_VERSION, BQ_SDK_VERSION, AIP_SDK_VERSION = (
    kfp.__version__,
    storage.__version__,
    bigquery.__version__,
    aiplatform.__version__,
)

_sdk_versions = {
    "kfp version": KFP_SDK_VERSION,
    "storage SDK version": GCS_SDK_VERSION,
    "bigquery SDK version": BQ_SDK_VERSION,
    "vertex_ai SDK version": AIP_SDK_VERSION,
}
# Left-pad each label to a common width so the colons line up.
for _label, _version in _sdk_versions.items():
    print(f"{_label:<22}: {_version}")

kfp version           : 2.7.0
storage SDK version   : 2.14.0
bigquery SDK version  : 3.25.0
vertex_ai SDK version : 1.71.0


## GCP clients

In [4]:
# Vertex AI SDK defaults (project + region) for everything submitted below
aiplatform.init(location=LOCATION, project=PROJECT_ID)

# Cloud Storage and BigQuery clients, both bound to the same project
storage_client = storage.Client(project=PROJECT_ID)
bqclient = bigquery.Client(project=PROJECT_ID)

# Data preprocessing with Vertex Pipelines

## create data preprocessing dir

In [5]:
# Path pieces for the generated pipeline component modules; later cells combine
# them as ../../<REPO_SRC>/<LOCAL_PREPROCESS_DIR>/<PREPROCESS_SUBDIR>
REPO_SRC  = "src"
LOCAL_PREPROCESS_DIR = "data_preprocessor"
PREPROCESS_SUBDIR = "components"

# Show the working directory that the relative paths above are resolved against
!pwd

/home/jupyter/tf_vertex_agents/02-supervised-to-bandit-training/optimized_dataset


In [6]:
# Start from an empty components directory.
# NOTE: this deletes everything currently stored under that directory before
# recreating it; the cells below write the component modules back into it.
! rm -rf ../../$REPO_SRC/$LOCAL_PREPROCESS_DIR/$PREPROCESS_SUBDIR
! mkdir ../../$REPO_SRC/$LOCAL_PREPROCESS_DIR/$PREPROCESS_SUBDIR

In [6]:
!ls ../../$REPO_SRC/$LOCAL_PREPROCESS_DIR/$PREPROCESS_SUBDIR

pipeline_config.py   write_tf_records.py
train_validation.py  write_trajectories_to_bq.py


## custom components

In [8]:
# Container image used as `base_image` by the pipeline components defined below.
# TODO: from section 05 -- the policy-training image is reused here
# (image names: mv-gpi-pipeline | mv-gpi-train).
POLICY_TRAIN_IMAGE = "gcr.io/{}/mv-gpi-pipeline".format(PROJECT_ID)
DATA_PIPELINE_IMAGE = POLICY_TRAIN_IMAGE

DATA_PIPELINE_IMAGE

'gcr.io/hybrid-vertex/mv-gpi-pipeline'

In [9]:
# Settings that the component modules import via `pipeline_config`, rendered as
# Python source: one `NAME = "value"` line per entry, names padded to 20 columns.
_pipe_config_items = (
    ("PROJECT_ID", PROJECT_ID),
    ("REGION", REGION),
    ("PREFIX", PREFIX),
    ("BUCKET_NAME", BUCKET_NAME),
    ("DATA_PIPELINE_IMAGE", DATA_PIPELINE_IMAGE),
    ("KFP_SDK_VERSION", KFP_SDK_VERSION),
    ("GCS_SDK_VERSION", GCS_SDK_VERSION),
    ("BQ_SDK_VERSION", BQ_SDK_VERSION),
    ("AIP_SDK_VERSION", AIP_SDK_VERSION),
)
pipe_config = "".join(
    f'{_key:<20}= "{_value}"\n' for _key, _value in _pipe_config_items
)
print(pipe_config)

PROJECT_ID          = "hybrid-vertex"
REGION              = "us-central1"
PREFIX              = "rec-bandits-v2"
BUCKET_NAME         = "rec-bandits-v2-hybrid-vertex-bucket"
DATA_PIPELINE_IMAGE = "gcr.io/hybrid-vertex/mv-gpi-pipeline"
KFP_SDK_VERSION     = "2.7.0"
GCS_SDK_VERSION     = "2.14.0"
BQ_SDK_VERSION      = "3.25.0"
AIP_SDK_VERSION     = "1.71.0"



In [10]:
# Persist the rendered settings next to the component modules, which load them
# with `from . import pipeline_config`.
_pipeline_config_path = (
    f'../../{REPO_SRC}/{LOCAL_PREPROCESS_DIR}/{PREPROCESS_SUBDIR}/pipeline_config.py'
)
with open(_pipeline_config_path, 'w') as _config_file:
    _config_file.write(pipe_config)

In [11]:
!ls ../../$REPO_SRC/$LOCAL_PREPROCESS_DIR/$PREPROCESS_SUBDIR

pipeline_config.py


### Write Trajectories to BigQuery

In [12]:
%%writefile ../../{REPO_SRC}/{LOCAL_PREPROCESS_DIR}/{PREPROCESS_SUBDIR}/write_trajectories_to_bq.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.dsl import (
    component, 
    Metrics
)
from . import pipeline_config

@component(
    base_image=pipeline_config.DATA_PIPELINE_IMAGE,
    install_kfp_package=False
)
def write_trajectories_to_bq(
    project_id: str,
    location: str,
    pipeline_version: str,
    bq_dataset_name: str,
    bucket_name: str,
    example_gen_gcs_path: str,
    global_emb_size: int,
    mv_emb_size: int,
    num_oov_buckets: int,
    batch_size: int,
    dataset_size: int = 0,
    vocab_filename: str = "vocab_dict.pkl",
    is_testing: bool = False,
) -> NamedTuple('Outputs', [
    ('global_dim', int),
    ('per_arm_dim', int),
    ('tf_record_file', str),
    ('bq_table_ref', str),
    ('batch_size', int),
]):
    """Convert example TFRecords into batched trajectories and load them into BigQuery.

    Steps performed:
      1. Download the pickled vocab dict from
         `gs://{bucket_name}/{example_gen_gcs_path}/vocabs/{vocab_filename}`.
      2. Collect every `.tfrecord` blob under the `train` and `val` prefixes of
         `example_gen_gcs_path` and parse them with `data_utils._parse_function`.
      3. Build the embedding model and read the global / per-arm feature
         dimensions from a single example.
      4. For every full batch of `batch_size` examples (the remainder is dropped),
         build a single-step trajectory, convert it to a dict and append it as
         one JSON line to a local temp file.
      5. Upload the temp file to GCS and load it into the BigQuery table
         `{project_id}.{bq_dataset_name}.mv_b{batch_size}_g{global_emb_size}_a{mv_emb_size}_{pipeline_version}`.

    Args:
      project_id: GCP project used for the Vertex AI, Storage and BigQuery clients.
      location: Region passed to `aiplatform.init`.
      pipeline_version: Suffix of the generated BigQuery table / GCS folder name.
      bq_dataset_name: BigQuery dataset that receives the trajectory table.
      bucket_name: GCS bucket name (without the `gs://` scheme).
      example_gen_gcs_path: Prefix inside the bucket that holds the example
        TFRecords and the `vocabs/` folder.
      global_emb_size: Embedding size passed to the embedding model for global features.
      mv_emb_size: Embedding size passed to the embedding model for per-arm features.
      num_oov_buckets: Number of OOV buckets passed to the embedding model.
      batch_size: Number of examples per trajectory row written to BigQuery.
      dataset_size: Currently unused (only referenced by commented-out code).
      vocab_filename: File name of the pickled vocab dict inside `vocabs/`.
      is_testing: When True only the first two TFRecord files are processed.

    Returns:
      Tuple `(global_dim, per_arm_dim, tf_record_file, bq_table_ref, batch_size)`.
      `tf_record_file` is only the target path for the TFRecord file; this
      component does not write it.

    Raises:
      ValueError: If no `.tfrecord` files are found under the train/val prefixes.
    """
    import os
    import json
    import logging
    import numpy as np
    import pickle as pkl
    from google.cloud import aiplatform, bigquery, storage
    from typing import Callable, Dict, List, Optional, TypeVar, Any
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    
    # tf agents
    import tensorflow as tf
    from tf_agents import trajectories
    from tf_agents.trajectories import trajectory
    from tf_agents.bandits.policies import policy_utilities
    from tf_agents.bandits.specs import utils as bandit_spec_utils
    
    # this repo
    from src.networks import encoding_network as emb_features
    from src.data import data_utils as data_utils
    from src.data import data_config as data_config
    from src.utils import reward_factory as reward_factory
    from src.data_preprocessor import preprocess_utils

    # set client SDKs
    aiplatform.init(
        project=project_id,
        location=location,
        # experiment=experiment_name,
    )
    storage_client = storage.Client(project=project_id)
    bqclient = bigquery.Client(project=project_id)
    
    # set variables
    GCS_DATA_PATH        = f"gs://{bucket_name}/{example_gen_gcs_path}"
    NUM_GLOBAL_FEATURES  = len(data_utils.USER_FEATURE_NAMES)
    NUM_ARM_FEATURES     = len(data_utils.MOVIE_FEATURE_NAMES)
    EXPECTED_GLOBAL_DIM  = global_emb_size * NUM_GLOBAL_FEATURES
    EXPECTED_PER_ARM_DIM = mv_emb_size * NUM_ARM_FEATURES
    
    logging.info(f'GCS_DATA_PATH       : {GCS_DATA_PATH}')
    logging.info(f'NUM_GLOBAL_FEATURES : {NUM_GLOBAL_FEATURES}')
    logging.info(f'NUM_ARM_FEATURES    : {NUM_ARM_FEATURES}')
    logging.info(f'EXPECTED_GLOBAL_DIM : {EXPECTED_GLOBAL_DIM}')
    logging.info(f'EXPECTED_PER_ARM_DIM: {EXPECTED_PER_ARM_DIM}')

    # =========================================================
    # get data
    # =========================================================
    # download vocabs
    LOCAL_VOCAB_FILENAME = 'vocab_dict.pkl'
    print(f"Downloading vocab...")
    data_utils.download_blob(
        project_id = project_id,
        bucket_name = bucket_name, 
        source_blob_name = f'{example_gen_gcs_path}/vocabs/{vocab_filename}', 
        destination_file_name= LOCAL_VOCAB_FILENAME
    )
    # NOTE(security): pickle.load can execute code embedded in the file; only
    # point `vocab_filename` at objects written by trusted jobs.
    # The context manager closes the handle even if unpickling fails.
    with open(LOCAL_VOCAB_FILENAME, 'rb') as filehandler:
        vocab_dict = pkl.load(filehandler)

    # get train and val examples
    data_splits = ["train","val"]
    all_files = []
    
    for _split in data_splits:

        for blob in storage_client.list_blobs(
            f"{bucket_name}", 
            prefix=f'{example_gen_gcs_path}/{_split}'
        ):
            if '.tfrecord' in blob.name:
                # Build the URI from the raw object name; `public_url`
                # percent-encodes the name, which would yield a wrong gs:// path
                # for names containing characters that need quoting.
                all_files.append(f"gs://{bucket_name}/{blob.name}")
    print("Found these tfrecords:")
    print(all_files)

    # Fail early with a clear message instead of an opaque iterator error below.
    if not all_files:
        raise ValueError(
            f"No .tfrecord files found under gs://{bucket_name}/{example_gen_gcs_path}/ "
            f"for splits {data_splits}"
        )

    if is_testing:
        all_files = all_files[:2]
        print(f"in testing mode; only using: {all_files}")
    dataset = tf.data.TFRecordDataset(all_files)
    dataset = dataset.map(data_utils._parse_function)
    
    # =========================================================
    # get emb dims
    # =========================================================
    print(f"getting embedding dimensions...")
    # a single example is enough to read the feature dimensions
    data = next(iter(dataset.batch(1)))
        
    embs = emb_features.EmbeddingModel(
        vocab_dict = vocab_dict,
        num_oov_buckets = num_oov_buckets,
        global_emb_size = global_emb_size,
        mv_emb_size = mv_emb_size,
        max_genre_length = data_config.MAX_GENRE_LENGTH,
    )
    test_globals = embs._get_global_context_features(data)
    test_arms = embs._get_per_arm_features(data)
    GLOBAL_DIM = test_globals.shape[1]            
    PER_ARM_DIM = test_arms.shape[1]
    print(f"GLOBAL_DIM  : {GLOBAL_DIM}")
    print(f"PER_ARM_DIM : {PER_ARM_DIM}")
    
    # =========================================================
    # trajectory function
    # =========================================================
    BQ_TMP_FILE   = "tmp_bq.json"
    BQ_TABLE_NAME = f"mv_b{batch_size}_g{global_emb_size}_a{mv_emb_size}_{pipeline_version}"
    BQ_TABLE_REF  = f"{project_id}.{bq_dataset_name}.{BQ_TABLE_NAME}"
    DS_GCS_DIR_PATH = f"gs://{bucket_name}/{example_gen_gcs_path}/{BQ_TABLE_NAME}"
    TFRECORD_FILE = f"{DS_GCS_DIR_PATH}/{BQ_TABLE_NAME}.tfrecord"
    
    print(f"BQ_TMP_FILE   : {BQ_TMP_FILE}")
    print(f"BQ_TABLE_NAME : {BQ_TABLE_NAME}")
    print(f"BQ_TABLE_REF  : {BQ_TABLE_REF}")
    print(f"DS_GCS_DIR_PATH : {DS_GCS_DIR_PATH}")
    print(f"TFRECORD_FILE : {TFRECORD_FILE}")
    
    # my trajectory functions
    def my_trajectory_fn(element):
        """Converts a (batched) dataset element into a single-step trajectory."""
        global_features = embs._get_global_context_features(element)
        arm_features = embs._get_per_arm_features(element)

        observation = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY: global_features
        }
        reward = reward_factory._get_rewards(element)

        policy_info = policy_utilities.PerArmPolicyInfo(
            chosen_arm_features=arm_features,
        )
        # action and discount are zero tensors shaped like the reward
        return trajectory.single_step(
            observation=observation,
            action=tf.zeros_like(
                reward, dtype=tf.int32
            ),
            policy_info=policy_info,
            reward=reward,
            discount=tf.zeros_like(reward)
        )

    # # calculate dataset_size
    # if not dataset_size:
    #     print(f"getting size of dataset...")
    #     dataset_size = dataset.reduce(0, lambda x,_: x+1).numpy()
    
    # write to local file: one JSON line per batch of `batch_size` examples
    print(f"writting trajectories to tmp file...")
    with open(BQ_TMP_FILE, "w") as f:
        for example in dataset.batch(batch_size, drop_remainder=True): #.take(count=dataset_size):
            _trajectories = my_trajectory_fn(example)
            _traj_dict = preprocess_utils.build_dict_from_trajectory(_trajectories)
            f.write(json.dumps(_traj_dict) + "\n")

    print(f"saving tmp file to: {example_gen_gcs_path}/{BQ_TABLE_NAME}/{BQ_TMP_FILE}")
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(f"{example_gen_gcs_path}/{BQ_TABLE_NAME}/{BQ_TMP_FILE}")
    blob.upload_from_filename(BQ_TMP_FILE)
    
    print(f"loading tmp file to bigquery...")
    with open(BQ_TMP_FILE, "rb") as source_file:
        load_job = bqclient.load_table_from_file(
            source_file, 
            BQ_TABLE_REF, 
            job_config=preprocess_utils.job_config
        )
    # block until the load job has finished
    load_job.result() 
    
    # check table
    bq_table = bqclient.get_table(BQ_TABLE_REF)
    print(f"Got table: `{bq_table.project}.{bq_table.dataset_id}.{bq_table.table_id}`")
    print("Table has {} rows".format(bq_table.num_rows))

    return (
        GLOBAL_DIM,
        PER_ARM_DIM,
        TFRECORD_FILE,
        BQ_TABLE_REF,
        batch_size
    )

Writing ../src/data_preprocessor/components/write_trajectories_to_bq.py


### Write TF Records from BigQuery table

In [13]:
%%writefile ../../{REPO_SRC}/{LOCAL_PREPROCESS_DIR}/{PREPROCESS_SUBDIR}/write_tf_records.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.dsl import (
    component, 
    Metrics
)
from . import pipeline_config

@component(
    base_image=pipeline_config.DATA_PIPELINE_IMAGE,
    install_kfp_package=False
)
def write_tf_records(
    project_id: str,
    location: str,
    pipeline_version: str,
    bq_table_ref: str,
    tf_record_file: str,
    global_dim: int,
    per_arm_dim: int,
) -> NamedTuple('Outputs', [
    ('tf_record_file', str),
    ('global_dim', int),
    ('per_arm_dim', int),
    ('bq_table_ref', str),
]):
    """Read every row of a BigQuery trajectory table and write it to a TFRecord file.

    `location` and `pipeline_version` are accepted but not used in the body.
    `global_dim`, `per_arm_dim` and `bq_table_ref` are passed through unchanged
    so the next pipeline step can consume them from this component's outputs.

    Returns:
      Tuple `(tf_record_file, global_dim, per_arm_dim, bq_table_ref)`.
    """
    from google.cloud import bigquery

    # this repo
    from src.data import data_utils as data_utils
    from src.data_preprocessor import preprocess_utils

    bq_client = bigquery.Client(project=project_id)

    # look up the table and report its size
    print("getting bq table iterator...")
    source_table = bq_client.get_table(bq_table_ref)
    print(
        f"Got table: `{source_table.project}.{source_table.dataset_id}.{source_table.table_id}`"
    )
    print(f"Table has {source_table.num_rows} rows")

    # hand the row iterator to the repo helper that writes the TFRecord file
    row_iterator = bq_client.list_rows(source_table)
    print("writting bq to tf records...")
    preprocess_utils.write_tfrecords(tf_record_file, row_iterator)
    print(f"tf record complete: {tf_record_file}")

    outputs = (f'{tf_record_file}', global_dim, per_arm_dim, bq_table_ref)
    return outputs

Writing ../src/data_preprocessor/components/write_tf_records.py


### Validate dataset with train job

> Here we'll create a simple train job to validate that our dataset is formatted correctly and that the trajectories can be consumed with `agent.train(...)`.

In [14]:
%%writefile ../../{REPO_SRC}/{LOCAL_PREPROCESS_DIR}/{PREPROCESS_SUBDIR}/train_validation.py
import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.dsl import (
    component, 
    Metrics
)
from . import pipeline_config

@component(
    base_image=pipeline_config.DATA_PIPELINE_IMAGE,
    install_kfp_package=False
)
def train_validation(
    project_id: str,
    location: str,
    pipeline_version: str,
    bucket_name: str,
    bq_table_ref: str,
    tf_record_file: str,
    batch_size: int,
    num_actions: int,
    global_dim: int,
    per_arm_dim: int,
    experiment_name: str,
    num_epochs: int = 2,
) -> NamedTuple('Outputs', [
    ('log_dir', str),
    ('tf_record_file', str),
]):
    """Train a small per-arm bandit agent on the TFRecord file to validate its format.

    Builds an `epsGreedy` agent from `agent_factory`, reads `tf_record_file`,
    rebuilds one `Trajectory` per record and calls `agent.train(...)` on it for
    `num_epochs` passes over the file. Summaries are written to
    `gs://{bucket_name}/{experiment_name}/run-<timestamp>/logs`.

    Args:
      project_id: GCP project passed to `aiplatform.init`.
      location: Region passed to `aiplatform.init`.
      pipeline_version: Not used in the body.
      bucket_name: GCS bucket (no scheme) that receives the run's output folder.
      bq_table_ref: Not used in the body.
      tf_record_file: Path of the TFRecord file to train on.
      batch_size: Batch size of the stored trajectories; used for the reward
        spec, the metric and the dummy policy-info tensors.
      num_actions: Number of actions (arms) in the action / observation specs.
      global_dim: Size of the global feature vector.
      per_arm_dim: Size of each per-arm feature vector.
      experiment_name: Folder name under the bucket for this run's outputs.
      num_epochs: Number of passes over the TFRecord file.

    Returns:
      Tuple `(log_dir, tf_record_file)`.
    """
    import os
    import time
    import numpy as np
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    from google.cloud import aiplatform, storage
    from typing import Dict, List, Any
    
    import tensorflow as tf
    from tf_agents.specs import array_spec
    from tf_agents.specs import tensor_spec
    from tf_agents.policies import policy_saver
    from tf_agents import trajectories
    from tf_agents.trajectories import time_step as ts
    from tf_agents.bandits.policies import policy_utilities
    from tf_agents.bandits.specs import utils as bandit_spec_utils
    from tf_agents.metrics import tf_metrics
    
    # this repo
    from src.utils import train_utils as train_utils
    from src.data_preprocessor import preprocess_utils
    from src.agents import agent_factory as agent_factory
    
    # set experiment config for tracking
    invoke_time       = time.strftime("%Y%m%d-%H%M%S")
    RUN_NAME          = f'run-{invoke_time}'
    BASE_OUTPUT_DIR   = f"gs://{bucket_name}/{experiment_name}/{RUN_NAME}"
    LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
    ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"
    
    print(f"BASE_OUTPUT_DIR : {BASE_OUTPUT_DIR}")
    print(f"LOG_DIR         : {LOG_DIR}")
    print(f"ARTIFACTS_DIR   : {ARTIFACTS_DIR}")
    
    aiplatform.init(project=project_id, location=location)
    
    # tensorboard = aiplatform.Tensorboard.create(
    #     display_name=experiment_name
    #     , project=project_id
    #     , location=location
    # )
    # TB_RESOURCE_NAME = tensorboard.resource_name
    # TB_ID = TB_RESOURCE_NAME.split('/')[-1]
    
    # set agent config
    AGENT_TYPE      = 'epsGreedy' # 'LinUCB' | 'LinTS |, 'epsGreedy' | 'NeuralLinUCB'
    AGENT_ALPHA     = 0.1
    EPSILON         = 0.01
    LR              = 0.05
    ENCODING_DIM    = 1
    EPS_PHASE_STEPS = 1000
    # each tower narrows from the full feature dim to a quarter of it
    GLOBAL_LAYERS   = [global_dim, int(global_dim/2), int(global_dim/4)]
    ARM_LAYERS      = [per_arm_dim, int(per_arm_dim/2), int(per_arm_dim/4)]
    FIRST_COMMON_LAYER = GLOBAL_LAYERS[-1] + ARM_LAYERS[-1]
    COMMON_LAYERS = [
        int(FIRST_COMMON_LAYER),
        int(FIRST_COMMON_LAYER/4)
    ]
    NETWORK_TYPE = "commontower"
    
    # set tensor specs
    observation_spec = {
        'global': tf.TensorSpec([global_dim], tf.float32),
        'per_arm': tf.TensorSpec([num_actions, per_arm_dim], tf.float32) #excluding action dim here
    }
    action_spec = tensor_spec.BoundedTensorSpec(
        shape=[], 
        dtype=tf.int32,
        minimum=tf.constant(0),            
        maximum=num_actions-1, # n degrees of freedom and will dictate the expected mean reward spec shape
        name="action_spec"
    )
    time_step_spec = ts.time_step_spec(observation_spec = observation_spec)

    reward_spec = {
        "reward": array_spec.ArraySpec(
            shape=[batch_size], 
            dtype=np.float32, name="reward"
        )
    }
    reward_tensor_spec = train_utils.from_spec(reward_spec)
    
    # create agent
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = agent_factory.PerArmAgentFactory._get_agent(
        agent_type = AGENT_TYPE,
        network_type = NETWORK_TYPE,
        time_step_spec = time_step_spec,
        action_spec = action_spec,
        observation_spec=observation_spec,
        global_layers = GLOBAL_LAYERS,
        arm_layers = ARM_LAYERS,
        common_layers = COMMON_LAYERS,
        agent_alpha = AGENT_ALPHA,
        learning_rate = LR,
        epsilon = EPSILON,
        train_step_counter = global_step,
        output_dim = ENCODING_DIM,
        eps_phase_steps = EPS_PHASE_STEPS,
        summarize_grads_and_vars = False,
        debug_summaries = True
    )
    agent.initialize()
    print(f'agent: {agent.name}')
    
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        f"{LOG_DIR}", flush_millis=10 * 1000
    )
    train_summary_writer.set_as_default()
    saver = policy_saver.PolicySaver(
        agent.policy, 
        train_step=global_step
    )
    metrics = [
        # tf_metrics.NumberOfEpisodes(),
        # tf_metrics.AverageEpisodeLengthMetric(batch_size=batch_size),
        tf_metrics.AverageReturnMetric(batch_size=batch_size)
    ]
    # create dataset
    raw_dataset = tf.data.TFRecordDataset([tf_record_file])
    parsed_dataset = raw_dataset.map(
        preprocess_utils._parse_record
    ).prefetch(
        tf.data.experimental.AUTOTUNE
    )
    # trajectory function
    def _build_trajectory_from_tfrecord(
        parsed_record: Dict[str, tf.Tensor],
        batch_size: int,
        num_actions: int,
        # policy_info: policies.utils.PolicyInfo
    ) -> trajectories.Trajectory:
        """
        Builds a `trajectories.Trajectory` object from `parsed_record`.

        Every stored tensor is de-serialized with `tf.io.parse_tensor` and gets
        a length-1 axis inserted at position 1 (presumably the time dimension
        `T` that the stored data lacks -- confirm against the writer).

        Args:
          parsed_record: A dict mapping feature names to values as `tf.Tensor`
            objects of type string containing serialized protos.
          batch_size: Leading dimension of the dummy policy-info tensors.
          num_actions: Last dimension of the dummy predicted-rewards tensor.

        Returns:
          A `trajectories.Trajectory` object that contains values as de-serialized
          `tf.Tensor` objects from `parsed_record`.
        """
        # placeholder values: the stored records carry no predicted rewards
        dummy_rewards = tf.zeros([batch_size, 1, num_actions])

        global_features = tf.expand_dims(
            tf.io.parse_tensor(parsed_record["observation"], out_type=tf.float32),
            axis=1
        )
        observation = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY: global_features
        }

        arm_features = tf.expand_dims(
            tf.io.parse_tensor(parsed_record["chosen_arm_features"], out_type=tf.float32),
            axis=1
        )

        policy_info = policy_utilities.PerArmPolicyInfo(
            chosen_arm_features=arm_features,
            predicted_rewards_mean=dummy_rewards,
            bandit_policy_type=tf.zeros([batch_size, 1, 1], dtype=tf.int32)
        )

        return trajectories.Trajectory(
            step_type=tf.expand_dims(
                tf.io.parse_tensor(parsed_record["step_type"], out_type=tf.int32),
                axis=1
            ),
            observation = observation,
            action=tf.expand_dims(
                tf.io.parse_tensor(parsed_record["action"], out_type=tf.int32),
                axis=1
            ),
            policy_info=policy_info,
            next_step_type=tf.expand_dims(
                tf.io.parse_tensor(
                    parsed_record["next_step_type"], out_type=tf.int32),
                axis=1
            ),
            reward=tf.expand_dims(
                tf.io.parse_tensor(parsed_record["reward"], out_type=tf.float32),
                axis=1
            ),
            discount=tf.expand_dims(
                tf.io.parse_tensor(parsed_record["discount"], out_type=tf.float32),
                axis=1
            )
        )
    
    # train job
    list_o_loss = []
    # Reset the train step
    agent.train_step_counter.assign(0)

    print(f"starting train job...")
    start_time = time.time()
    # tf.profiler.experimental.start(LOG_DIR)
    for i in range(num_epochs):

        print(f"epoch: {i+1}")

        for parsed_record in parsed_dataset:

            _trajectories = _build_trajectory_from_tfrecord(
                parsed_record, batch_size, num_actions
            )

            # step count before this update; used for logging and summaries
            step = agent.train_step_counter.numpy()
            loss = agent.train(experience=_trajectories)
            list_o_loss.append(loss.loss.numpy())

            # Export with the train step rather than the epoch index: using
            # the epoch index would tag every batch of an epoch with the same
            # summary step.
            train_utils._export_metrics_and_summaries(
                step=int(step), 
                metrics=metrics
            )

            # print step loss
            if step % 100 == 0:
                print(
                    'step = {0}: train loss = {1}'.format(
                        step, round(loss.loss.numpy(), 2)
                    )
                )
    # tf.profiler.experimental.stop()
    runtime_mins = int((time.time() - start_time) / 60)
    print(f"train runtime_mins: {runtime_mins}")

    # Make sure buffered summaries are written before the component exits;
    # the writer otherwise only flushes every `flush_millis`.
    train_summary_writer.flush()
    
    # # one time upload
    # aiplatform.upload_tb_log(
    #     tensorboard_id=TB_ID,
    #     tensorboard_experiment_name=experiment_name,
    #     logdir=LOG_DIR,
    #     experiment_display_name=experiment_name,
    #     run_name_prefix=RUN_NAME,
    #     # description=description,
    # )
    
    print(f"LOG_DIR: {LOG_DIR}")
    print(f"tf_record_file: {tf_record_file}")
    
    return (
        LOG_DIR,
        tf_record_file
    )

Writing ../src/data_preprocessor/components/train_validation.py


# Create Vertex Pipeline

In [15]:
import sys
sys.path.append("../..")
from src.data_preprocessor.components import (
    write_trajectories_to_bq,
    train_validation,
    write_tf_records,
)

In [16]:
# Pipeline version tag and experiment name; the display name joins the two and
# swaps underscores for dashes.
PIPE_VERSION = "v8"
EXPERIMENT_NAME = "movielens-rl-data-pipeline"

_raw_display_name = "-".join((EXPERIMENT_NAME, PIPE_VERSION))
DISPLAY_NAME = _raw_display_name.replace("_", "-")
print("DISPLAY_NAME: {}".format(DISPLAY_NAME))

DISPLAY_NAME: movielens-rl-data-pipeline-v8


In [17]:
@dsl.pipeline(
    name=f"{DISPLAY_NAME}",
)
def data_preprocess_pipeline(
    project_id: str,
    location: str,
    pipeline_version: str,
    experiment_name: str,
    bq_dataset_name: str,
    bucket_name: str,
    example_gen_gcs_path: str,
    batch_size: int,
    num_actions: int,
    global_emb_size: int,
    mv_emb_size: int,
    num_oov_buckets: int,
    dataset_size: int = 0,
    num_epochs: int = 2,
    vocab_filename: str = "vocab_dict.pkl",
    is_testing: bool = True
):
    """Three-step data preprocessing pipeline.

    1. "Write to BQ": turn the example TFRecords into trajectories in BigQuery.
    2. "Write TF Records": export that BigQuery table to a TFRecord file.
    3. "Test w/ agent": train a small agent on the TFRecord file to validate it.

    Each step consumes the previous step's outputs (table reference, TFRecord
    path and feature dimensions), which also fixes the execution order.
    Caching is enabled on every step.
    """
    write_trajectories_op = (
        write_trajectories_to_bq.write_trajectories_to_bq(
            project_id=project_id,
            location=location,
            pipeline_version=pipeline_version,
            bq_dataset_name=bq_dataset_name,
            bucket_name=bucket_name,
            example_gen_gcs_path=example_gen_gcs_path,
            global_emb_size=global_emb_size,
            mv_emb_size=mv_emb_size,
            num_oov_buckets=num_oov_buckets,
            batch_size=batch_size,
            dataset_size=dataset_size,
            # forward the pipeline parameter instead of a hard-coded file name,
            # so a caller-supplied `vocab_filename` actually takes effect
            vocab_filename=vocab_filename,
            is_testing=is_testing,
        )
        .set_display_name("Write to BQ")
        .set_caching_options(True)
    )
    
    write_tf_records_op = (
        write_tf_records.write_tf_records(
            project_id=project_id,
            location=location,
            pipeline_version=pipeline_version,
            bq_table_ref=write_trajectories_op.outputs['bq_table_ref'],
            tf_record_file=write_trajectories_op.outputs['tf_record_file'],
            global_dim=write_trajectories_op.outputs['global_dim'],
            per_arm_dim=write_trajectories_op.outputs['per_arm_dim'],
        )
        .set_display_name("Write TF Records")
        .set_caching_options(True)
    )
    
    train_validation_op = (
        train_validation.train_validation(
            project_id=project_id,
            location=location,
            pipeline_version=pipeline_version,
            bucket_name=bucket_name,
            bq_table_ref=write_tf_records_op.outputs['bq_table_ref'],
            tf_record_file=write_tf_records_op.outputs['tf_record_file'],
            batch_size=batch_size,
            num_actions=num_actions,
            global_dim=write_tf_records_op.outputs['global_dim'],
            per_arm_dim=write_tf_records_op.outputs['per_arm_dim'],
            experiment_name=experiment_name,
            num_epochs=num_epochs,
        )
        .set_display_name("Test w/ agent")
        .set_caching_options(True)
    )

In [18]:
# Local file name of the compiled pipeline spec
PIPELINE_YAML_FILENAME = "preprocess_pipeline.yaml"

# Remove any previously compiled spec before compiling again
! rm -f $PIPELINE_YAML_FILENAME

# Compile the pipeline function into the local YAML file
compiler.Compiler().compile(
    pipeline_func=data_preprocess_pipeline, 
    package_path=PIPELINE_YAML_FILENAME
)
# GCS folder passed as `pipeline_root` when the job is created; a copy of the
# compiled spec is stored there as well
PIPELINE_ROOT = f"gs://{BUCKET_NAME}/data-preprocess-pipelines/{EXPERIMENT_NAME}"
PIPELINES_FILEPATH = f"{PIPELINE_ROOT}/{PIPELINE_YAML_FILENAME}"

# Upload the compiled spec next to the pipeline root
!gsutil cp $PIPELINE_YAML_FILENAME $PIPELINES_FILEPATH

Copying file://preprocess_pipeline.yaml [Content-Type=application/octet-stream]...
/ [1 files][ 31.2 KiB/ 31.2 KiB]                                                
Operation completed over 1 objects/31.2 KiB.                                     


In [19]:
sys.path.append("../..")
from src.data import data_config as data_config
from src.data import data_utils as data_utils

# True -> the first pipeline step only reads the first two TFRecord files
IS_TESTING = False
NUM_EPOCHS = 2

# dataset / embedding settings passed to the pipeline
BATCH_SIZE = 256
NUM_ACTIONS = 2
NUM_OOV_BUCKETS = 1
GLOBAL_EMBEDDING_SIZE = 12
MV_EMBEDDING_SIZE = 16
EXAMPLE_GEN_GCS_PATH = data_config.EXAMPLE_GEN_GCS_PATH

# expected feature dims = embedding size x number of features of that kind
NUM_GLOBAL_FEATURES = len(data_utils.USER_FEATURE_NAMES)
NUM_ARM_FEATURES = len(data_utils.MOVIE_FEATURE_NAMES)
EXPECTED_GLOBAL_DIM = GLOBAL_EMBEDDING_SIZE * NUM_GLOBAL_FEATURES
EXPECTED_PER_ARM_DIM = MV_EMBEDDING_SIZE * NUM_ARM_FEATURES

_run_param_report = (
    ("BATCH_SIZE", BATCH_SIZE),
    ("NUM_ACTIONS", NUM_ACTIONS),
    ("EXAMPLE_GEN_GCS_PATH", EXAMPLE_GEN_GCS_PATH),
    ("NUM_OOV_BUCKETS", NUM_OOV_BUCKETS),
    ("GLOBAL_EMBEDDING_SIZE", GLOBAL_EMBEDDING_SIZE),
    ("MV_EMBEDDING_SIZE", MV_EMBEDDING_SIZE),
    ("EXPECTED_GLOBAL_DIM", EXPECTED_GLOBAL_DIM),
    ("EXPECTED_PER_ARM_DIM", EXPECTED_PER_ARM_DIM),
)
# names are left-padded to a common width so the colons line up
for _param_name, _param_value in _run_param_report:
    print(f"{_param_name:<22}: {_param_value}")

BATCH_SIZE            : 256
NUM_ACTIONS           : 2
EXAMPLE_GEN_GCS_PATH  : data/movielens/m1m
NUM_OOV_BUCKETS       : 1
GLOBAL_EMBEDDING_SIZE : 12
MV_EMBEDDING_SIZE     : 16
EXPECTED_GLOBAL_DIM   : 72
EXPECTED_PER_ARM_DIM  : 64


In [20]:
!ls ../../$REPO_SRC/$LOCAL_PREPROCESS_DIR/$PREPROCESS_SUBDIR

__pycache__	    train_validation.py  write_trajectories_to_bq.py
pipeline_config.py  write_tf_records.py


In [21]:
# Runtime arguments for `data_preprocess_pipeline`, keyed by parameter name
pipeline_params = dict(
    project_id=PROJECT_ID,
    location=LOCATION,
    pipeline_version=PIPE_VERSION,
    experiment_name=EXPERIMENT_NAME,
    bq_dataset_name=BIGQUERY_DATASET_NAME,
    bucket_name=BUCKET_NAME,
    example_gen_gcs_path=EXAMPLE_GEN_GCS_PATH,
    batch_size=BATCH_SIZE,
    num_actions=NUM_ACTIONS,
    global_emb_size=GLOBAL_EMBEDDING_SIZE,
    mv_emb_size=MV_EMBEDDING_SIZE,
    num_oov_buckets=NUM_OOV_BUCKETS,
    dataset_size=0,
    num_epochs=NUM_EPOCHS,
    vocab_filename="vocab_dict.pkl",
    is_testing=IS_TESTING,
)

# Create the job from the locally compiled spec; caching is enabled and the
# failure policy is 'fast'.
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    location=LOCATION,
    template_path=PIPELINE_YAML_FILENAME,
    pipeline_root=PIPELINE_ROOT,
    failure_policy='fast',
    parameter_values=pipeline_params,
    enable_caching=True,
)

# Submit under the Vertex service account
# (optional arguments left disabled: experiment=EXPERIMENT_NAME, sync=False)
job.submit(service_account=VERTEX_SA)

**Finished**