# Simulate Generalized Policy Iteration (GPI) for Contextual Bandits

> Orchestrate GPI jobs with Vertex Pipelines to simulate iterative policy evaluation and improvement

### GPI for Contextual Bandits

**Generalized Policy Iteration (GPI)** consists of two simultaneous, interacting processes that eventually converge to the optimal policy and value functions as the environment is interacted with:

1. **Policy evaluation** updates the value function to make it more consistent with the current policy and environment
2. **Policy improvement** updates the policy to make it more greedy with respect to the current value function

<img src="imgs/gpi.png" 
     align="center" 
     width="850"
     height="850"/>

**Relationship between model and policy:**
* Model: estimate value of a particular item (or slate)
* Policy: use the model to generate a decision response (e.g., recommendation, ranking)

In the diagram below, we are not pushing the "model" itself, but rather a *policy* based on the model. 

An RL agent includes:
* **policy**: mapping function determining the agent's behavior
* **value function**: tells the agent how “good” each state is
* **model**: the agent’s representation of the environment

<img src="imgs/policy_improvement_and_eval.png" 
     align="center" 
     width="850"
     height="850"/>
     
*As duration of the GPI cycle decreases, we approach fully continuous, online learning*

## Notebook config

In [1]:
# Version tag embedded in PREFIX below.
VERSION        = "v2"                       # TODO: set to the version used for your deployment
# Naming prefix; the GCS bucket name in the next cell is derived from it.
PREFIX         = f'rec-bandits-{VERSION}'   # TODO: set to your own resource prefix

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [2]:
# Ask gcloud for the active project; the first captured output line is used as the project id.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared environment file from the bucket, echo it, then execute it so
# its assignments become notebook globals.
# NOTE(review): exec() runs whatever is stored at config/notebook_env.py with the
# notebook's full privileges -- only run this against a bucket you control.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NAME      = "training_dataset"

REPOSITORY               = "rl-movielens-rec-bandits-v2"

DOCKERNAM

In [3]:
import os
import sys
import time
import numpy as np
import pickle as pkl
from pprint import pprint
import matplotlib.pyplot as plt
from typing import Callable, Dict, List, Optional, TypeVar, Any

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

### pipelines
import kfp
from kfp import compiler, dsl, components
from kfp.dsl import component, Metrics
from google_cloud_pipeline_components.types import artifact_types

# logging
import logging
logging.disable(logging.WARNING)

#python warning 
import warnings
warnings.filterwarnings("ignore")

# tensorflow
import tensorflow as tf

# tf exceptions and vars
if tf.__version__[0] != "2":
    raise Exception("The trainer only runs with TensorFlow version 2.")

T = TypeVar("T")
import tf_agents
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.bandits.policies import policy_utilities
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.metrics import tf_metrics
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.trajectories import trajectory
from tf_agents import trajectories

from tf_agents.policies import py_tf_eager_policy
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import policy_saver
from tf_agents.metrics import export_utils
from tf_agents.eval import metric_utils
from tf_agents.utils import common

TF_VERSION = tf.__version__
TFA_VERSION = tf_agents.__version__
print(f'tensorflow version : {TF_VERSION}')
print(f'tf-agents version  : {TFA_VERSION}')

# google cloud
from google.cloud import aiplatform, storage, bigquery

# Initialise the Vertex AI SDK plus the Cloud Storage and BigQuery clients
# for the configured project.
aiplatform.init(project=PROJECT_ID, location=LOCATION)
storage_client = storage.Client(project=PROJECT_ID)
bqclient = bigquery.Client(project=PROJECT_ID)

# SDK versions are kept as globals; they are written into pipeline_config.py later.
KFP_SDK_VERSION = kfp.__version__
GCS_SDK_VERSION = storage.__version__
BQ_SDK_VERSION  = bigquery.__version__
AIP_SDK_VERSION = aiplatform.__version__

for _label, _version in (
    ("kfp version", KFP_SDK_VERSION),
    ("storage SDK version", GCS_SDK_VERSION),
    ("bigquery SDK version", BQ_SDK_VERSION),
    ("vertex_ai SDK version", AIP_SDK_VERSION),
):
    # Left-pad the label to 22 columns so the colons line up.
    print(f'{_label:<22}: {_version}')

tensorflow version : 2.13.0
tf-agents version  : 0.17.0
kfp version           : 2.3.0
storage SDK version   : 2.16.0
bigquery SDK version  : 3.11.4
vertex_ai SDK version : 1.46.0


In [None]:
# pip install --upgrade requests-toolbelt
# pip install --force-reinstall -v "requests-toolbelt==0.10.1"

In [4]:
# this repo
sys.path.append("..")
# from src.utils import train_utils
from src.data import data_utils, data_config
# from src.trainer import train_batched_ds

In [5]:
# # GPU
# from numba import cuda 
# import gc

# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# device = cuda.get_current_device()
# device.reset()
# gc.collect()

# Policy improvement pipeline

In [6]:
# Path pieces of the generated component package, used below as
# ../<REPO_SRC>/<POLICY_PIPE_DIR>/<POLICY_PIPE_SUBDIR>.
REPO_SRC  = "src"
POLICY_PIPE_DIR = "policy_pipeline"
POLICY_PIPE_SUBDIR = "components"

!pwd

/home/jupyter/tf_vertex_agents/05-online-learning


In [7]:
! rm -rf ../$REPO_SRC/$POLICY_PIPE_DIR/$POLICY_PIPE_SUBDIR
# ! mkdir ../$REPO_SRC/$POLICY_PIPE_DIR
! mkdir ../$REPO_SRC/$POLICY_PIPE_DIR/$POLICY_PIPE_SUBDIR

In [8]:
!ls ../$REPO_SRC/$POLICY_PIPE_DIR/$POLICY_PIPE_SUBDIR

## custom components

In [9]:
# Container image used as `base_image` by the pipeline components.
POLICY_PIPE_IMAGE = f"gcr.io/{PROJECT_ID}/mv-gpi-pipeline"
# Presumably the Dockerfile that builds the image above -- the build step is not shown here.
DOCKERNAME_GPI_PIPE  = "Dockerfile_gpi_pipe"

# Container image passed to the pipeline as `gpi_image_name` for the training job.
POLICY_TRAIN_IMAGE = f"gcr.io/{PROJECT_ID}/mv-gpi-train"
# Presumably the Dockerfile that builds the training image -- the build step is not shown here.
DOCKERNAME_GPI_TRAIN  = "Dockerfile_gpi_train"

In [10]:
# Settings the generated components read at import time via `pipeline_config`.
# Insertion order is the order the lines appear in the written file.
_pipe_settings = {
    "PROJECT_ID": PROJECT_ID,
    "REGION": REGION,
    "PREFIX": PREFIX,
    "BUCKET_NAME": BUCKET_NAME,
    "POLICY_PIPE_IMAGE": POLICY_PIPE_IMAGE,
    "POLICY_TRAIN_IMAGE": POLICY_TRAIN_IMAGE,
    "KFP_SDK_VERSION": KFP_SDK_VERSION,
    "GCS_SDK_VERSION": GCS_SDK_VERSION,
    "BQ_SDK_VERSION": BQ_SDK_VERSION,
    "AIP_SDK_VERSION": AIP_SDK_VERSION,
    "TF_VERSION": TF_VERSION,
    "TFA_VERSION": TFA_VERSION,
}

# One `NAME = "value"` assignment per line, names left-padded to 20 columns.
pipe_config = "".join(
    f'{_name:<20}= "{_value}"\n' for _name, _value in _pipe_settings.items()
)
print(pipe_config)

PROJECT_ID          = "hybrid-vertex"
REGION              = "us-central1"
PREFIX              = "rec-bandits-v2"
BUCKET_NAME         = "rec-bandits-v2-hybrid-vertex-bucket"
POLICY_PIPE_IMAGE   = "gcr.io/hybrid-vertex/mv-gpi-pipeline"
POLICY_TRAIN_IMAGE  = "gcr.io/hybrid-vertex/mv-gpi-train"
KFP_SDK_VERSION     = "2.3.0"
GCS_SDK_VERSION     = "2.16.0"
BQ_SDK_VERSION      = "3.11.4"
AIP_SDK_VERSION     = "1.46.0"
TF_VERSION          = "2.13.0"
TFA_VERSION         = "0.17.0"



In [11]:
# Persist the rendered settings as a module next to the component sources.
_pipeline_config_path = f'../{REPO_SRC}/{POLICY_PIPE_DIR}/{POLICY_PIPE_SUBDIR}/pipeline_config.py'
with open(_pipeline_config_path, 'w') as _config_file:
    _config_file.write(pipe_config)

In [12]:
!ls ../$REPO_SRC/$POLICY_PIPE_DIR/$POLICY_PIPE_SUBDIR

pipeline_config.py


### prepare eval dataset

In [13]:
%%writefile ../{REPO_SRC}/{POLICY_PIPE_DIR}/{POLICY_PIPE_SUBDIR}/prep_eval_ds.py
import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp import dsl
from . import pipeline_config

@dsl.component(
    base_image=pipeline_config.POLICY_PIPE_IMAGE,
    install_kfp_package=False
)
def prep_eval_ds(
    project_id: str,
    location: str,
    pipeline_version: str,
    bucket_name: str,
    example_gen_gcs_path: str,
    eval_ds: dsl.Output[dsl.Dataset],
    ds_skip: int = 0,
    ds_take: int = 0,
) -> NamedTuple('Outputs', [
    ('num_eval_samples', int),
    ('total_eval_rewards', float),
    ('ds_skip', int),
    ('ds_take', int),
]):
    """Measure the evaluation slice: number of examples and total logged reward.

    Reads every `.tfrecord` blob under `<example_gen_gcs_path>/val` in the
    bucket, parses it with `data_utils._parse_function`, batches by 1 and
    applies the optional skip/take window.

    Args:
        project_id: GCP project used for the SDK clients.
        location: Vertex AI location passed to `aiplatform.init`.
        pipeline_version: Not used by this component.
        bucket_name: GCS bucket (name only) holding the validation records.
        example_gen_gcs_path: Blob prefix under which the `val` records live.
        eval_ds: Output dataset artifact. NOTE: nothing is written to this
            artifact here; only the scalar outputs carry information.
        ds_skip: Number of leading examples to skip; applied only when > 0.
        ds_take: Maximum number of examples to keep; applied only when > 0.

    Returns:
        `(num_eval_samples, total_eval_rewards, ds_skip, ds_take)`.
    """
    import logging
    import os

    from google.cloud import aiplatform, storage
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    import tensorflow as tf

    # this repo
    from src.data import data_utils

    # set client SDKs
    aiplatform.init(
        project=project_id,
        location=location,
    )
    storage_client = storage.Client(project=project_id)

    # get eval tf-records
    val_files = [
        blob.public_url.replace("https://storage.googleapis.com/", "gs://")
        for blob in storage_client.list_blobs(
            f"{bucket_name}", prefix=f'{example_gen_gcs_path}/val'
        )
        if '.tfrecord' in blob.name
    ]

    val_dataset = tf.data.TFRecordDataset(val_files)
    val_dataset = val_dataset.map(
        data_utils._parse_function, num_parallel_calls=tf.data.AUTOTUNE
    )

    # Named `eval_slice` so the `eval_ds` output-artifact parameter is not shadowed.
    eval_slice = val_dataset.batch(1)

    if ds_skip > 0:
        eval_slice = eval_slice.skip(ds_skip)
        logging.info(f"setting dataset skip: {ds_skip}")

    if ds_take > 0:
        eval_slice = eval_slice.take(ds_take)
        logging.info(f"setting dataset take: {ds_take}")

    # Single pass over the slice: collect each example's reward, then derive
    # both the example count and the reward total from that one list.
    val_rewards = [
        example[f"{data_utils.TARGET_FEATURE_NAME}"][0].numpy()
        for example in eval_slice
    ]

    NUM_EVAL_SAMPLES = len(val_rewards)
    logging.info(f"NUM_EVAL_SAMPLES : {NUM_EVAL_SAMPLES}")

    TOTAL_EVAL_REWARD = tf.reduce_sum(val_rewards).numpy().tolist()
    logging.info(f"TOTAL_EVAL_REWARD : {TOTAL_EVAL_REWARD}")

    return (
        NUM_EVAL_SAMPLES,
        TOTAL_EVAL_REWARD,
        ds_skip,
        ds_take,
    )

Writing ../src/policy_pipeline/components/prep_eval_ds.py


### Train agent

In [14]:
%%writefile ../{REPO_SRC}/{POLICY_PIPE_DIR}/{POLICY_PIPE_SUBDIR}/train_agent.py
import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp import dsl
from . import pipeline_config

@dsl.component(
    base_image=pipeline_config.POLICY_PIPE_IMAGE,
    install_kfp_package=False
)
def train_agent(
    project_id: str,
    location: str,
    pipeline_version: str,
    bucket_name: str,
    example_gen_gcs_path: str,
    tfrecord_name: str,
    hparams: str,
    experiment_name: str,
    experiment_run_tag: str,
    tensorboard_resource_name: str,
    service_account: str,
    # train job
    num_epochs: int,
    log_interval: int,
    total_train_take: int,
    total_train_skip: int,
    gpi_image_name: str, # TODO
    # train compute
    replica_count: int,
    machine_type: str,
    accelerator_count: int, 
    accelerator_type: str,
) -> NamedTuple('Outputs', [
    ('base_output_dir', str),
    ('log_dir', str),
    ('artifacts_dir', str),
]):
    """Submit a Vertex AI CustomJob that trains the agent, and wait for it to finish.

    Args:
        project_id: GCP project for the Vertex AI SDK.
        location: Vertex AI location.
        pipeline_version: Not used by this component.
        bucket_name: GCS bucket (name only) for inputs and outputs.
        example_gen_gcs_path: Blob prefix containing the training records.
        tfrecord_name: Training file is read from
            `<example_gen_gcs_path>/<tfrecord_name>/<tfrecord_name>.tfrecord`.
        hparams: Hyperparameters, forwarded verbatim to the trainer as `--hparams`.
        experiment_name: Vertex experiment name; also part of the output paths.
        experiment_run_tag: Prefix of the run name; a timestamp is appended.
        tensorboard_resource_name: TensorBoard resource attached to the job.
        service_account: Service account the job runs as.
        num_epochs: Forwarded as `--num_epochs`.
        log_interval: Forwarded as `--log_interval`.
        total_train_take: Forwarded as `--total_train_take`.
        total_train_skip: Forwarded as `--total_train_skip`.
        gpi_image_name: Training image URI without tag; `:latest` is appended.
        replica_count: Passed to `train_utils.prepare_worker_pool_specs`.
        machine_type: Passed to `train_utils.prepare_worker_pool_specs`.
        accelerator_count: Passed to `train_utils.prepare_worker_pool_specs`.
        accelerator_type: Passed to `train_utils.prepare_worker_pool_specs`.

    Returns:
        `(base_output_dir, log_dir, artifacts_dir)` GCS URIs for this run.
    """
    # imports
    # NOTE: logging is intentionally left enabled; disabling WARNING-and-below
    # would also drop every logging.info() call made in this function.
    import logging
    import time

    # this repo
    from src.utils import train_utils

    from google.cloud import aiplatform

    # GCP clients
    aiplatform.init(
        project=project_id,
        location=location,
        experiment=experiment_name,
    )

    # dataset
    TFRECORD_FILE = (
        f"gs://{bucket_name}/{example_gen_gcs_path}/{tfrecord_name}/{tfrecord_name}.tfrecord"
    )
    # experiment: the timestamp gives each invocation its own output directory,
    # while the checkpoint directory is shared per experiment
    invoke_time       = time.strftime("%Y%m%d-%H%M%S")
    RUN_NAME          = f'{experiment_run_tag}-{invoke_time}'
    CHECKPT_DIR       = f"gs://{bucket_name}/{experiment_name}/chkpoint"
    BASE_OUTPUT_DIR   = f"gs://{bucket_name}/{experiment_name}/{RUN_NAME}"
    LOG_DIR           = f"{BASE_OUTPUT_DIR}/logs"
    ARTIFACTS_DIR     = f"{BASE_OUTPUT_DIR}/artifacts"

    # job config
    JOB_NAME = f'train-{experiment_name}-{experiment_run_tag}'
    logging.info(f'JOB_NAME: {JOB_NAME}')

    TF_GPU_THREAD_COUNT   = '4'      # '1' | '4' | '8'

    # command-line flags handed to the training container
    WORKER_ARGS = [
        f"--project={project_id}",
        f"--location={location}",
        f"--bucket_name={bucket_name}",
        f"--experiment_name={experiment_name}",
        f"--experiment_run={RUN_NAME}",
        f"--log_dir={LOG_DIR}",
        f"--artifacts_dir={ARTIFACTS_DIR}",
        f"--chkpoint_dir={CHECKPT_DIR}",
        f"--hparams={hparams}",
        ### job config
        f"--num_epochs={num_epochs}",
        f"--tf_record_file={TFRECORD_FILE}",
        f"--log_interval={log_interval}",
        f"--total_train_take={total_train_take}",
        f"--total_train_skip={total_train_skip}",
        ### performance
        f"--tf_gpu_thread_count={TF_GPU_THREAD_COUNT}",
        "--use_gpu",
        "--cache_train_data",
    ]

    WORKER_POOL_SPECS = train_utils.prepare_worker_pool_specs(
        image_uri=f"{gpi_image_name}:latest",
        args=WORKER_ARGS,
        replica_count=replica_count,
        machine_type=machine_type,
        accelerator_count=accelerator_count,
        accelerator_type=accelerator_type,
        reduction_server_count=0,
        reduction_server_machine_type="n1-highcpu-16",
    )
    logging.info(f'WORKER_POOL_SPECS: {WORKER_POOL_SPECS}')

    job = aiplatform.CustomJob(
        display_name=JOB_NAME,
        worker_pool_specs=WORKER_POOL_SPECS,
        base_output_dir=BASE_OUTPUT_DIR,
        staging_bucket=f"{BASE_OUTPUT_DIR}/staging",
    )
    logging.info('Submitting train job to Vertex AI...')
    # Submitted asynchronously, then blocked on explicitly below.
    job.run(
        tensorboard=tensorboard_resource_name,
        service_account=f'{service_account}',
        restart_job_on_worker_restart=False,
        enable_web_access=True,
        sync=False,
    )
    # wait for job to complete
    job.wait()

    train_job_dict = job.to_dict()
    logging.info(f'train_job_dict: {train_job_dict}')

    return (
        BASE_OUTPUT_DIR,
        LOG_DIR,
        ARTIFACTS_DIR,
    )

Writing ../src/policy_pipeline/components/train_agent.py


### Eval agent policy

In [15]:
%%writefile ../{REPO_SRC}/{POLICY_PIPE_DIR}/{POLICY_PIPE_SUBDIR}/eval_agent_policy.py
import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp import dsl
from . import pipeline_config

@dsl.component(
    base_image=pipeline_config.POLICY_PIPE_IMAGE,
    install_kfp_package=False
)
def eval_agent_policy(
    project_id: str,
    location: str,
    pipeline_version: str,
    bucket_name: str,
    example_gen_gcs_path: str,
    # agent
    hparams: str,
    arftifacts_dir: str,
    # data
    ds_skip: int,
    ds_take: int,
    num_eval_samples: int, 
    total_eval_rewards: float,
    eval_ds: dsl.Input[dsl.Dataset],
    metrics: dsl.Output[dsl.Metrics]
):
    """Evaluate a saved policy on the validation slice and log reward metrics.

    Args:
        project_id: GCP project for the SDK clients.
        location: Vertex AI location.
        pipeline_version: Not used by this component.
        bucket_name: GCS bucket (name only) holding vocab and validation data.
        example_gen_gcs_path: Blob prefix containing `vocabs/` and `val` records.
        hparams: JSON-encoded hyperparameter dict; the keys read here are
            `eval_batch_size`, `per_arm_dim`, `global_dim`, `global_emb_size`
            and `arm_emb_size`.
        arftifacts_dir: Directory of the saved policy to load. (The spelling
            is part of the component interface and is kept for callers.)
        ds_skip: Number of leading examples to skip; applied only when > 0.
        ds_take: Maximum number of examples to keep; applied only when > 0.
        num_eval_samples: Not used by this component.
        total_eval_rewards: Reward total of the slice, compared with predictions.
        eval_ds: Input dataset artifact. Not read; the slice is rebuilt from
            GCS using `ds_skip` / `ds_take`.
        metrics: Output artifact receiving `total_eval_ds_rewards`,
            `total_predicted_rewards`, `reward_%_diff` and `val_loss`.
    """
    # imports
    import json
    import os
    import pickle as pkl
    import time
    import logging
    from pprint import pprint

    import numpy as np
    from google.cloud import aiplatform, storage
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # tf
    import tensorflow as tf
    from tf_agents.policies import py_tf_eager_policy

    # this repo
    from src.trainer import eval_perarm
    from src.data import data_utils

    # convert hparam dict
    HPARAMS = json.loads(hparams)
    pprint(HPARAMS)

    # set client SDKs
    aiplatform.init(
        project=project_id,
        location=location,
    )
    storage_client = storage.Client(project=project_id)
    # =========================================================
    # download vocabs
    # =========================================================
    LOCAL_VOCAB_FILENAME = 'vocab_dict.pkl'
    print("Downloading vocab...")
    data_utils.download_blob(
        project_id = project_id,
        bucket_name = bucket_name, 
        source_blob_name = f'{example_gen_gcs_path}/vocabs/{LOCAL_VOCAB_FILENAME}', 
        destination_file_name= LOCAL_VOCAB_FILENAME
    )
    # NOTE(review): unpickling executes code embedded in the file; only point
    # this at a bucket whose contents you trust.
    with open(LOCAL_VOCAB_FILENAME, 'rb') as vocab_file:
        vocab_dict = pkl.load(vocab_file)

    # =========================================================
    # get eval tf-records
    # =========================================================
    val_files = [
        blob.public_url.replace("https://storage.googleapis.com/", "gs://")
        for blob in storage_client.list_blobs(
            f"{bucket_name}", prefix=f'{example_gen_gcs_path}/val'
        )
        if '.tfrecord' in blob.name
    ]
    val_dataset = tf.data.TFRecordDataset(val_files)
    val_dataset = val_dataset.map(
        data_utils._parse_function, num_parallel_calls=tf.data.AUTOTUNE
    )

    # Named `eval_slice` so the `eval_ds` input-artifact parameter is not shadowed.
    eval_slice = val_dataset.batch(1)

    if ds_skip > 0:
        eval_slice = eval_slice.skip(ds_skip)
        logging.info(f"setting dataset skip: {ds_skip}")

    if ds_take > 0:
        eval_slice = eval_slice.take(ds_take)
        logging.info(f"setting dataset take: {ds_take}")

    # =========================================================
    # load policy
    # =========================================================
    my_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
        arftifacts_dir, load_specs_from_pbtxt=True
    )

    # =========================================================
    # run policy eval on val dataset
    # =========================================================
    print("evaluating loaded policy...")
    start_time = time.time()

    val_loss, preds, tr_rewards = eval_perarm._run_bandit_eval(
        policy = my_policy,
        data = eval_slice,
        eval_batch_size = HPARAMS['eval_batch_size'],
        per_arm_dim = HPARAMS['per_arm_dim'],
        global_dim = HPARAMS['global_dim'],
        vocab_dict = vocab_dict,
        num_oov_buckets = 1,
        global_emb_size = HPARAMS['global_emb_size'],
        mv_emb_size = HPARAMS['arm_emb_size'],
    )
    runtime_mins = int((time.time() - start_time) / 60)
    print(f"post-train val_loss     : {val_loss}")
    print(f"post-train eval runtime : {runtime_mins}")

    # =========================================================
    # log metrics
    # =========================================================
    total_pred_rewards = round(tf.reduce_sum(preds).numpy().tolist(), 2)
    reward_diff = round(abs(total_eval_rewards - total_pred_rewards), 2)
    avg_reward_vals = np.average([total_pred_rewards, total_eval_rewards])
    # Percentage difference relative to the mean of the two totals. It is
    # undefined when that mean is zero (e.g. both totals are zero), so report
    # 0.0 instead of dividing by zero and logging nan/inf.
    if avg_reward_vals == 0:
        reward_percentage_diff = 0.0
    else:
        reward_percentage_diff = round((reward_diff / avg_reward_vals) * 100.0, 2)

    print(f"total_eval_rewards : {total_eval_rewards}")
    print(f"total_pred_rewards : {total_pred_rewards}")
    print(f"reward_diff        : {reward_diff}")
    print(f"avg_reward_vals    : {avg_reward_vals}")
    print(f"reward % diff      : {reward_percentage_diff}%")

    metrics.log_metric("total_eval_ds_rewards", total_eval_rewards)
    metrics.log_metric("total_predicted_rewards", total_pred_rewards)
    metrics.log_metric("reward_%_diff", reward_percentage_diff)
    metrics.log_metric("val_loss", round(val_loss.numpy().tolist(), 2))

Writing ../src/policy_pipeline/components/eval_agent_policy.py


# Create Vertex Pipeline

In [16]:
# Rebuild the validation dataset locally from the bucket's `val` TFRecords
BUCKET_NAME = "rec-bandits-v2-hybrid-vertex-bucket"
example_gen_gcs_path = "data/movielens/m1m"

# Convert each blob's public URL into its gs:// form
val_files = [
    blob.public_url.replace("https://storage.googleapis.com/", "gs://")
    for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'{example_gen_gcs_path}/val')
    if '.tfrecord' in blob.name
]

val_dataset = tf.data.TFRecordDataset(val_files).map(
    data_utils._parse_function, num_parallel_calls=tf.data.AUTOTUNE
)

# One example per batch
eval_ds = val_dataset.batch(1)

In [17]:
# Count the examples by materialising the whole dataset (batch size is 1, so batches == examples).
len(list(eval_ds))

99417

In [18]:
import sys
sys.path.append("..")
from src.policy_pipeline.components import (
    prep_eval_ds,
    train_agent,
    eval_agent_policy,
)

In [19]:
PIPE_VERSION = "v2"
EXPERIMENT_NAME = "gpi-simulation-pipe"

# Used both as the pipeline name and the PipelineJob display name.
# Underscores are swapped for hyphens -- presumably to satisfy pipeline naming rules; confirm.
DISPLAY_NAME = f"{EXPERIMENT_NAME}-{PIPE_VERSION}".replace("_","-")
print(f"DISPLAY_NAME: {DISPLAY_NAME}") 

DISPLAY_NAME: gpi-simulation-pipe-v2


In [20]:
@dsl.pipeline(
    name=f"{DISPLAY_NAME}",
)
def gpi_pipeline(
    project_id: str,
    location: str,
    pipeline_version: str,
    bucket_name: str,
    example_gen_gcs_path: str,
    tfrecord_name: str,
    hparams: str,
    experiment_name: str,
    experiment_run_tag: str,
    tensorboard_resource_name: str,
    service_account: str,
    # train job
    num_epochs: int,
    gpi_image_name: str, # TODO
    replica_count: int,
    machine_type: str,
    accelerator_count: int, 
    accelerator_type: str,
):
    """Simulate three GPI iterations: train on successive data slices, evaluate each policy.

    One "Prepare eval slice" step measures a fixed validation slice. Then, for
    each of three iterations, a policy is trained on the next 2,000-example
    window of the training data (skip 0, 2,000, 4,000) and evaluated against
    that same validation slice.

    No ordering is declared between iterations, so the three train steps
    depend only on the pipeline inputs.

    Args:
        project_id: GCP project passed to every component.
        location: Vertex AI location passed to every component.
        pipeline_version: Passed to every component.
        bucket_name: GCS bucket (name only) holding data and outputs.
        example_gen_gcs_path: Blob prefix of the generated examples.
        tfrecord_name: Name of the training TFRecord.
        hparams: JSON-encoded hyperparameters.
        experiment_name: Vertex experiment name.
        experiment_run_tag: Prefix for each training run name.
        tensorboard_resource_name: TensorBoard resource for the training jobs.
        service_account: Service account the training jobs run as.
        num_epochs: Epochs per training job.
        gpi_image_name: Training image URI (without tag).
        replica_count: Worker replica count for each training job.
        machine_type: Machine type for each training job.
        accelerator_count: Accelerators per replica.
        accelerator_type: Accelerator type for each training job.
    """
    # Compile-time constants describing the simulated iterations.
    NUM_ITERATIONS = 3
    TRAIN_SLICE_SIZE = 2_000

    prepare_eval_data_op = (
        prep_eval_ds.prep_eval_ds(
            project_id=project_id,
            location=location,
            pipeline_version=pipeline_version,
            bucket_name=bucket_name,
            example_gen_gcs_path=example_gen_gcs_path,
            ds_skip = 0,
            ds_take = 50_000,
        )
        .set_display_name("Prepare eval slice")
        .set_caching_options(True)
    )

    # This loop runs while the pipeline is compiled, so it simply emits one
    # train task and one eval task per iteration, in order.
    for iteration in range(1, NUM_ITERATIONS + 1):
        train_agent_policy_op = (
            train_agent.train_agent(
                project_id=project_id,
                location=location,
                pipeline_version=pipeline_version,
                bucket_name=bucket_name,
                example_gen_gcs_path=example_gen_gcs_path,
                tfrecord_name=tfrecord_name,
                hparams=hparams,
                experiment_name=experiment_name,
                experiment_run_tag=experiment_run_tag,
                tensorboard_resource_name=tensorboard_resource_name,
                service_account=service_account,
                # train job: each iteration trains on the next slice
                num_epochs=num_epochs,
                log_interval=100,
                total_train_take=TRAIN_SLICE_SIZE,
                total_train_skip=(iteration - 1) * TRAIN_SLICE_SIZE,
                gpi_image_name=gpi_image_name,
                # train compute: taken from the pipeline parameters rather
                # than hard-coded, so the values given at submit time apply
                replica_count=replica_count,
                machine_type=machine_type,
                accelerator_count=accelerator_count,
                accelerator_type=accelerator_type,
            )
            .set_display_name(f"Train policy v{iteration}")
            .set_caching_options(True)
        )

        (
            eval_agent_policy.eval_agent_policy(
                project_id=project_id,
                location=location,
                pipeline_version=pipeline_version,
                bucket_name=bucket_name,
                example_gen_gcs_path=example_gen_gcs_path,
                hparams=hparams,
                arftifacts_dir=train_agent_policy_op.outputs["artifacts_dir"],
                ds_skip=prepare_eval_data_op.outputs["ds_skip"],
                ds_take=prepare_eval_data_op.outputs["ds_take"],
                num_eval_samples=prepare_eval_data_op.outputs["num_eval_samples"],
                total_eval_rewards=prepare_eval_data_op.outputs["total_eval_rewards"],
                eval_ds=prepare_eval_data_op.outputs["eval_ds"],
            )
            .set_display_name(f"Eval policy v{iteration}")
            .set_caching_options(True)
        )

## Compile Pipeline

In [21]:
# Local file name of the compiled pipeline spec
PIPELINE_YAML_FILENAME = "gpi_pipeline.yaml"

# Remove any previous spec before compiling
! rm -f $PIPELINE_YAML_FILENAME

# Compile the pipeline function into the YAML spec
compiler.Compiler().compile(
    pipeline_func=gpi_pipeline, 
    package_path=PIPELINE_YAML_FILENAME
)
# GCS root for this pipeline (passed as `pipeline_root` at submit time) and the
# location where a copy of the compiled spec is stored
PIPELINE_ROOT = f"gs://{BUCKET_NAME}/gpi-pipelines/{DISPLAY_NAME}"
PIPELINES_FILEPATH = f"{PIPELINE_ROOT}/{PIPELINE_YAML_FILENAME}"

!gsutil cp $PIPELINE_YAML_FILENAME $PIPELINES_FILEPATH

Copying file://gpi_pipeline.yaml [Content-Type=application/octet-stream]...
/ [1 files][ 59.5 KiB/ 59.5 KiB]                                                
Operation completed over 1 objects/59.5 KiB.                                     


## Define pipeline params

In [22]:
# Blob prefix of the generated examples, taken from the repo's data config
EXAMPLE_GEN_GCS_PATH = data_config.EXAMPLE_GEN_GCS_PATH

# Training TFRecord: stored as <name>/<name>.tfrecord under the examples prefix
TFRECORD_NAME = "mv_b128_g12_a16_v6"
TFRECORD_FILE = (
    f"{BUCKET_URI}/{EXAMPLE_GEN_GCS_PATH}/{TFRECORD_NAME}/{TFRECORD_NAME}.tfrecord"
)

# Embedding width per feature for the global (user) and per-arm (movie) inputs
GLOBAL_EMBEDDING_SIZE = 12
MV_EMBEDDING_SIZE = 16

# sanity check: input dims are (embedding size) x (number of features)
N_GLOBAL_FEATURES = len(data_utils.USER_FEATURE_NAMES)
N_ARM_FEATURES    = len(data_utils.MOVIE_FEATURE_NAMES)
GLOBAL_DIM        = GLOBAL_EMBEDDING_SIZE * N_GLOBAL_FEATURES
PER_ARM_DIM       = MV_EMBEDDING_SIZE * N_ARM_FEATURES

print(f"TF Record : {TFRECORD_FILE}")
print(f"GLOBAL_DIM : {GLOBAL_DIM}")
print(f"PER_ARM_DIM : {PER_ARM_DIM}")

# NOTE(review): prints the same value as the "TF Record" line above
print(f"TFRECORD_FILE : {TFRECORD_FILE}")

TF Record : gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/mv_b128_g12_a16_v6/mv_b128_g12_a16_v6.tfrecord
GLOBAL_DIM : 72
PER_ARM_DIM : 64
TFRECORD_FILE : gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/mv_b128_g12_a16_v6/mv_b128_g12_a16_v6.tfrecord


In [23]:
# Hyperparameters, collected into the HPARAMS dict at the end of this cell
BATCH_SIZE      = 128
NUM_ACTIONS     = 2

EVAL_BATCH_SIZE = 1

AGENT_TYPE      = 'epsGreedy' # 'LinUCB' | 'LinTS' | 'epsGreedy' | 'NeuralLinUCB'
AGENT_ALPHA     = 0.1
EPSILON         = 0.01
LR              = 0.05
ENCODING_DIM    = 1
EPS_PHASE_STEPS = 1000

# Tower widths: each tower starts at its input dim, then halves and quarters it
GLOBAL_LAYERS   = [GLOBAL_DIM, int(GLOBAL_DIM/2), int(GLOBAL_DIM/4)]
ARM_LAYERS      = [PER_ARM_DIM, int(PER_ARM_DIM/2), int(PER_ARM_DIM/4)]
# Common tower starts at the sum of both towers' last widths
# (alternative: min(GLOBAL_LAYERS[-1], ARM_LAYERS[-1]))
FIRST_COMMON_LAYER = GLOBAL_LAYERS[-1] + ARM_LAYERS[-1]
COMMON_LAYERS = [
    int(FIRST_COMMON_LAYER),
    # int(FIRST_COMMON_LAYER/2),
    int(FIRST_COMMON_LAYER/4)
]

NETWORK_TYPE = "commontower" # 'commontower' | 'dotproduct'
# NeuralLinUCB forces the common-tower network and takes its encoding dim
# from the last common layer
if AGENT_TYPE == 'NeuralLinUCB':
    NETWORK_TYPE = 'commontower'
    ENCODING_DIM = COMMON_LAYERS[-1] 
# dotproduct requires both towers to start at the same width
if NETWORK_TYPE == 'dotproduct':
    assert GLOBAL_LAYERS[0] == ARM_LAYERS[0]

HPARAMS = {
    "batch_size": BATCH_SIZE,
    "eval_batch_size" : EVAL_BATCH_SIZE,
    "num_actions": NUM_ACTIONS,
    "agent_type": AGENT_TYPE,
    "network_type": NETWORK_TYPE,
    "global_emb_size": GLOBAL_EMBEDDING_SIZE,
    "arm_emb_size": MV_EMBEDDING_SIZE,
    "global_dim": GLOBAL_DIM,
    "per_arm_dim": PER_ARM_DIM,
    "agent_alpha": AGENT_ALPHA,
    "global_layers": GLOBAL_LAYERS,
    "per_arm_layers": ARM_LAYERS,
    "common_layers": COMMON_LAYERS,
    "learning_rate": LR,
    "epsilon": EPSILON,
    "encoding_dim": ENCODING_DIM,
    "eps_phase_steps": EPS_PHASE_STEPS,
    "summarize_grads_and_vars" : True,
    "debug_summaries": True,
}
pprint(HPARAMS)

{'agent_alpha': 0.1,
 'agent_type': 'epsGreedy',
 'arm_emb_size': 16,
 'batch_size': 128,
 'common_layers': [34, 8],
 'debug_summaries': True,
 'encoding_dim': 1,
 'eps_phase_steps': 1000,
 'epsilon': 0.01,
 'eval_batch_size': 1,
 'global_dim': 72,
 'global_emb_size': 12,
 'global_layers': [72, 36, 18],
 'learning_rate': 0.05,
 'network_type': 'commontower',
 'num_actions': 2,
 'per_arm_dim': 64,
 'per_arm_layers': [64, 32, 16],
 'summarize_grads_and_vars': True}


### Create Managed TensorBoard

In [24]:
NEW_TENSORBOARD = True

In [25]:
if NEW_TENSORBOARD:
    # Provision a new managed TensorBoard named after the experiment
    TENSORBOARD_DISPLAY_NAME = f"{EXPERIMENT_NAME}"
    tensorboard = aiplatform.Tensorboard.create(
        display_name=TENSORBOARD_DISPLAY_NAME,
        project=PROJECT_ID,
        location=REGION,
    )
    TB_RESOURCE_NAME = tensorboard.resource_name
else:
    # Reuse an existing instance. TODO: replace with your own resource name
    TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/7845855491065380864'
    tensorboard = aiplatform.Tensorboard(tensorboard_name=TB_RESOURCE_NAME)

print(f"TB_RESOURCE_NAME: {TB_RESOURCE_NAME}")
print(f"TB display name: {tensorboard.display_name}")

TB_RESOURCE_NAME: projects/934903580331/locations/us-central1/tensorboards/6306398474690625536
TB display name: gpi-simulation-pipe


### Vertex Experiments

In [26]:
# Tag passed to the pipeline as `experiment_run_tag`; the train component
# appends a timestamp to it to form each run name.
RUN_NAME          = f'run-{PIPE_VERSION}'

# Checkpoint, log and artifact locations are not set here: the train_agent
# component derives them from the bucket, experiment name and run name.

print(f"EXPERIMENT_NAME   : {EXPERIMENT_NAME}")
print(f"RUN_NAME          : {RUN_NAME}\n")

EXPERIMENT_NAME   : gpi-simulation-pipe
RUN_NAME          : run-v2



In [27]:
import json

# Serialize the HPARAMS dict to a JSON string. This string (rather than the
# dict itself) is what gets passed as the "hparams" pipeline parameter.
dumped_hparams = json.dumps(HPARAMS)
dumped_hparams  # last expression: shown as the cell output

'{"batch_size": 128, "eval_batch_size": 1, "num_actions": 2, "agent_type": "epsGreedy", "network_type": "commontower", "global_emb_size": 12, "arm_emb_size": 16, "global_dim": 72, "per_arm_dim": 64, "agent_alpha": 0.1, "global_layers": [72, 36, 18], "per_arm_layers": [64, 32, 16], "common_layers": [34, 8], "learning_rate": 0.05, "epsilon": 0.01, "encoding_dim": 1, "eps_phase_steps": 1000, "summarize_grads_and_vars": true, "debug_summaries": true}'

In [28]:
# Parse the JSON string back into a dict; the displayed result lets you confirm
# the hyperparameters round-trip (JSON true/false come back as True/False).
loaded_hparams = json.loads(dumped_hparams)
loaded_hparams  # last expression: shown as the cell output

{'batch_size': 128,
 'eval_batch_size': 1,
 'num_actions': 2,
 'agent_type': 'epsGreedy',
 'network_type': 'commontower',
 'global_emb_size': 12,
 'arm_emb_size': 16,
 'global_dim': 72,
 'per_arm_dim': 64,
 'agent_alpha': 0.1,
 'global_layers': [72, 36, 18],
 'per_arm_layers': [64, 32, 16],
 'common_layers': [34, 8],
 'learning_rate': 0.05,
 'epsilon': 0.01,
 'encoding_dim': 1,
 'eps_phase_steps': 1000,
 'summarize_grads_and_vars': True,
 'debug_summaries': True}

In [29]:
# Define the Vertex AI pipeline run from the pipeline template file.
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=PIPELINE_YAML_FILENAME,
    pipeline_root=PIPELINE_ROOT,
    location=LOCATION,
    # Reuse cached step results when available.
    enable_caching=True,
    # Fail fast: stop scheduling new tasks once any task has failed.
    failure_policy='fast',
    parameter_values={
        # --- project / storage ---
        "project_id": PROJECT_ID,
        "location": LOCATION,
        "pipeline_version": PIPE_VERSION,
        "bucket_name": BUCKET_NAME,
        # --- data ---
        "example_gen_gcs_path": EXAMPLE_GEN_GCS_PATH,
        "tfrecord_name": TFRECORD_NAME,  # TFRECORD_FILE,
        # --- hyperparameters: JSON string, not the HPARAMS dict ---
        "hparams": dumped_hparams,  # HPARAMS,
        # --- experiment tracking ---
        "experiment_name": EXPERIMENT_NAME,
        "experiment_run_tag": RUN_NAME,
        "tensorboard_resource_name": TB_RESOURCE_NAME,
        "service_account": VERTEX_SA,
        # --- train job ---
        "num_epochs": 3,
        "gpi_image_name": POLICY_TRAIN_IMAGE,
        "replica_count": 1,
        "machine_type": "n1-highmem-16",
        "accelerator_count": 1,
        "accelerator_type": "NVIDIA_TESLA_T4",
    },
)

# Submit the run under the Vertex service account.
# Optional arguments left disabled: experiment=EXPERIMENT_NAME, sync=False
job.submit(service_account=VERTEX_SA)

**Finished**

# Stash

In [None]:
# %%writefile ../{REPO_SRC}/{POLICY_PIPE_DIR}/{POLICY_PIPE_SUBDIR}/define_agent_specs.py
# import kfp
# from typing import Any, Callable, Dict, NamedTuple, Optional, List
# from kfp import dsl
# from . import pipeline_config

# @dsl.component(
#     base_image=pipeline_config.POLICY_PIPE_IMAGE,
#     install_kfp_package=False
# )
# def define_agent_specs(
#     project_id: str,
#     location: str,
#     pipeline_version: str,
#     bucket_name: str,
#     example_gen_gcs_path: str,
#     hparams: dict,
# ) -> NamedTuple('Outputs', [
#     ('agent_spec_dict', dict),
# ]):
#     # imports
#     import os
#     os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    
#     import tensorflow as tf
    
#     # tf-agents
#     from tf_agents.specs import array_spec
#     from tf_agents.specs import tensor_spec
#     from tf_agents.trajectories import time_step as ts
    
#     # this repo
#     from src.utils import train_utils as train_utils
    
#     # ====================================================
#     # define agent
#     # ====================================================
#     observation_spec = {
#         'global': tf.TensorSpec([hparams['global_dim']], tf.float32),
#         'per_arm': tf.TensorSpec([hparams['num_actions'], hparams['per_arm_dim']], tf.float32)
#     }
#     action_spec = tensor_spec.BoundedTensorSpec(
#         shape=[], 
#         dtype=tf.int32,
#         minimum=tf.constant(0),            
#         maximum=hparams['num_actions']-1,
#         name="action_spec"
#     )
#     time_step_spec = ts.time_step_spec(observation_spec = observation_spec)
#     reward_spec = {
#         "reward": array_spec.ArraySpec(
#             shape=[hparams['batch_size']], 
#             dtype=np.float32, 
#             name="reward"
#         )
#     }
#     reward_tensor_spec = train_utils.from_spec(reward_spec)
    
#     agent_spec_dict = {}
#     agent_spec_dict['observation_spec'] = observation_spec
#     agent_spec_dict['action_spec'] = action_spec
#     agent_spec_dict['time_step_spec'] = time_step_spec
#     agent_spec_dict['reward_spec'] = reward_spec
#     agent_spec_dict['reward_tensor_spec'] = reward_tensor_spec
    
#     return (
#         agent_spec_dict
#     )