In [None]:
# Install the required packages when running locally or in a fresh environment.
# `%pip` is used instead of `!pip` / `!pip3` so the install targets the
# environment of the running kernel. Uncomment the lines you need.
# %pip install google-cloud-pipeline-components
# %pip install kfp
# %pip install google-cloud-aiplatform
# %pip install tensorboard

import math
import json
import pandas as pd
from kfp import compiler
from google_cloud_pipeline_components.preview.llm import rlhf_pipeline
import google.cloud.aiplatform as aiplatform
from utils import authenticate, print_d

In [None]:
# Define a path to the yaml file
RLHF_PIPELINE_PKG_PATH = "rlhf_pipeline.yaml"

# Execute the compile function
compiler.Compiler().compile(
    pipeline_func=rlhf_pipeline,
    package_path=RLHF_PIPELINE_PKG_PATH
)

# Verify the file was created
!head rlhf_pipeline.yaml

In [None]:
# --- Reward model: number of training steps ---

# Number of passes over the preference dataset
REWARD_NUM_EPOCHS = 30

# Preference dataset size and the batch size used for training
PREF_DATASET_SIZE, BATCH_SIZE = 3000, 64

# A partially filled final batch still costs one step, hence the ceiling
REWARD_STEPS_PER_EPOCH = math.ceil(PREF_DATASET_SIZE / BATCH_SIZE)

# Total steps = epochs x steps per epoch
reward_model_train_steps = REWARD_NUM_EPOCHS * REWARD_STEPS_PER_EPOCH
print(f"Reward Model Train Steps: {reward_model_train_steps}")

In [None]:
# --- Reinforcement learning: number of training steps ---

# Number of passes over the prompt dataset
RL_NUM_EPOCHS = 10

# Prompt dataset size and the batch size used for RL training
# (the batch size is the same value as in the reward-model cell)
PROMPT_DATASET_SIZE, BATCH_SIZE = 2000, 64

# A partially filled final batch still costs one step, hence the ceiling
RL_STEPS_PER_EPOCH = math.ceil(PROMPT_DATASET_SIZE / BATCH_SIZE)

# Total steps = epochs x steps per epoch
reinforcement_learning_train_steps = RL_NUM_EPOCHS * RL_STEPS_PER_EPOCH
print(f"Reinforcement Learning Train Steps: {reinforcement_learning_train_steps}")

In [None]:
# Arguments handed to the RLHF pipeline when the job is created.
parameter_values = {
    # Datasets: JSONL files in Cloud Storage, matched by glob
    "preference_dataset": "gs://vertex-ai/generative-ai/rlhf/text_small/summarize_from_feedback_tfds/comparisons/train/*.jsonl",
    "prompt_dataset": "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/train/*.jsonl",
    "eval_dataset": "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/val/*.jsonl",
    # Base model to tune
    "large_model_reference": "llama-2-7b",
    # Step counts computed in the cells above
    "reward_model_train_steps": reward_model_train_steps,
    "reinforcement_learning_train_steps": reinforcement_learning_train_steps,
    # Learning-rate multipliers for the two training stages
    "reward_model_learning_rate_multiplier": 1.0,
    "reinforcement_learning_rate_multiplier": 1.0,
    # KL coefficient; per the original author's note, raised to reduce reward hacking
    "kl_coeff": 0.1,
    # Instruction text supplied to the pipeline
    "instruction": "Summarize in less than 50 words",
}

In [None]:
# `authenticate()` (from the local `utils` module) yields the credentials,
# the project id and the staging bucket used by the rest of the notebook.
credentials, PROJECT_ID, STAGING_BUCKET = authenticate()

# Region passed to the Vertex AI SDK as its location
REGION = "europe-west4"

# Initialize the Vertex AI SDK with that project, region and credentials
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
)

In [None]:
# Settings for the pipeline job: the compiled template, the bucket used as
# pipeline root, and the parameter values defined earlier.
pipeline_job_kwargs = dict(
    display_name="tutorial-rlhf-tuning",
    pipeline_root=STAGING_BUCKET,
    template_path=RLHF_PIPELINE_PKG_PATH,
    parameter_values=parameter_values,
)

# Create the pipeline job from those settings
job = aiplatform.PipelineJob(**pipeline_job_kwargs)

# Run the pipeline job
job.run()

--- 
### Model Evaluation
The following cells launch TensorBoard to inspect the training logs and then compare the outputs of the tuned and untuned models.

In [None]:
# Load the TensorBoard notebook extension; it provides the `%tensorboard`
# magic used in the cells below.
%load_ext tensorboard

In [None]:
# Run TensorBoard for the reward logs.
# `%env PORT1` returns the value of the PORT1 environment variable; it is
# captured in `port` and interpolated into the magic below via `$port`.
port = %env PORT1
# Serve the `reward-logs` directory on that port; `--bind_all` makes the
# server listen on all network interfaces.
%tensorboard --logdir reward-logs --port $port --bind_all

In [None]:
# Run TensorBoard for the reinforcer logs.
# `%env PORT2` returns the value of the PORT2 environment variable; it is
# captured in `port` and interpolated into the magic below via `$port`.
port = %env PORT2
# Serve the `reinforcer-logs` directory on that port; `--bind_all` makes the
# server listen on all network interfaces.
%tensorboard --logdir reinforcer-logs --port $port --bind_all

In [None]:
# Run TensorBoard for the full-data logs.
# `%env PORT3` returns the value of the PORT3 environment variable; it is
# captured in `port` and interpolated into the magic below via `$port`.
port = %env PORT3
# Serve the `reinforcer-fulldata-logs` directory on that port; `--bind_all`
# makes the server listen on all network interfaces.
%tensorboard --logdir reinforcer-fulldata-logs --port $port --bind_all

In [None]:
# Load evaluation results (tuned vs. untuned model)
eval_tuned_path = 'eval_results_tuned.jsonl'
eval_untuned_path = 'eval_results_untuned.jsonl'


def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Location of a file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(path, encoding='utf-8') as f:
        # Skip blank lines (e.g. an extra newline at the end of the file)
        # rather than letting json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]


eval_data_tuned = load_jsonl(eval_tuned_path)
eval_data_untuned = load_jsonl(eval_untuned_path)

# Print the first record of each file to verify the load.
# `print_d` is imported together with the other dependencies in the
# notebook's first cell, so it is not re-imported here.
print("Tuned Sample:")
print_d(eval_data_tuned[0])
print("\nUntuned Sample:")
print_d(eval_data_untuned[0])

In [None]:
# Build a side-by-side comparison table of both models' outputs

# Prompt text is read from the tuned-model records
prompts = [record['inputs']['inputs_pretokenized'] for record in eval_data_tuned]

# Model outputs: untuned (base) first, then tuned
untuned_completions = [record['prediction'] for record in eval_data_untuned]
tuned_completions = [record['prediction'] for record in eval_data_tuned]

# Show full cell contents instead of truncating long text columns
pd.set_option('display.max_colwidth', None)

# One column per list, in this order: prompt, base_model, tuned_model
comparison_columns = {
    'prompt': prompts,
    'base_model': untuned_completions,
    'tuned_model': tuned_completions,
}
results = pd.DataFrame(data=comparison_columns)
results