# Proximal Policy Optimization example for T5 text summarization model
### Based on [this notebook](https://github.com/sinanuozdemir/pearson-gpt-training-engineer/blob/main/notebooks/rl_flan_t5_summaries.ipynb) by [Sinan Ozdemir](https://github.com/sinanuozdemir) with minor changes

In [1]:
# Import necessary dependencies
import warnings  # Import Warnings to suppress unnecessary warnings

# Suppress warning messages
warnings.filterwarnings("ignore")

# Import datasets module and print its version
import datasets
print(datasets.__version__)

# Import the load_dataset function from datasets
from datasets import load_dataset

# Import other essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import the load_dataset function from datasets (again, redundant)
from datasets import load_dataset

# Import tokenizer and pipeline from transformers
from transformers import AutoTokenizer, pipeline

# Import components related to reinforcement learning from TRL (Text-to-Text Reinforcement Learning) library
from trl import PPOTrainer, PPOConfig, create_reference_model, AutoModelForSeq2SeqLMWithValueHead
import torch

# Import tqdm for progress bars
from tqdm.auto import tqdm

# Import tokenizer and model for sequence classification from transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification


2.20.0


In [2]:
# Create a PyTorch device object that represents the selected CUDA device(s).
device = torch.device("cuda")

# Print the selected CUDA device to the console.
print(device)

cuda


In [3]:
# Define parameters
N_EPOCHS = 5 # number of epochs for training
BATCH_SIZE = 16 # batch size
TRY_SAMPLE_DATA = False # If True, try a small sample of data to accelerate the pipeline
INITIAL_MODEL = "google/flan-t5-small" # Initial model
SAVED_MODEL = "flan-t5-small-with-ppo" # Saved model location
EVAL_SUMMARIES = 50 # Number of summaries produced during evaluation

# Create reward pipeline

In [4]:
# Create a sentiment analysis pipeline using the 'cardiffnlp/twitter-roberta-base-sentiment' model.
sentiment_pipeline = pipeline('text-classification', 'cardiffnlp/twitter-roberta-base-sentiment')

# Define a function to extract neutral sentiment scores from a list of texts.
def get_neutral_scores(texts):
    scores = []  # Initialize an empty list to store the neutral sentiment scores.

    # Perform sentiment analysis on the input texts, requesting raw logits as results.
    results = sentiment_pipeline(texts, function_to_apply='none', top_k=None)

    # Iterate through the results for each input text.
    for result in results:
        # Iterate through the labels and their corresponding scores in the result.
        for label in result:
            # Check if the label corresponds to neutral sentiment ('LABEL_1').
            if label['label'] == 'LABEL_1':
                # Append the neutral sentiment score to the 'scores' list.
                scores.append(label['score'])
    
    # Return the list of neutral sentiment scores.
    return scores

# Test the 'get_neutral_scores' function with a list of example texts.
neutral_scores = get_neutral_scores(['hello', 'I love you!', 'I hate you'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [5]:
neutral_scores

[0.8519183397293091, -0.7468031644821167, -0.5696877837181091]

In [6]:
# Importing necessary libraries and modules
cola_tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")  # Load the tokenizer for RoBERTa model
cola_model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")  # Load the pre-trained RoBERTa model

# Creating a text classification pipeline using the loaded model and tokenizer
cola_pipeline = pipeline('text-classification', model=cola_model, tokenizer=cola_tokenizer)

# Define a function to get CoLA scores for a list of input texts
def get_cola_scores(texts):
    scores = []  # Initialize an empty list to store scores
    
    # Use the pipeline to generate scores for the input texts
    # Set function_to_apply to 'none' to get logits, which can be negative (desired)
    # Set top_k to None to include all possible labels
    results = cola_pipeline(texts, function_to_apply='none', top_k=None)
    
    # Iterate through the results
    for result in results:
        for label in result:
            if label['label'] == 'LABEL_1':  # Check if the label corresponds to 'LABEL_1' (good grammar)
                scores.append(label['score'])  # Append the score to the scores list
    
    return scores  # Return the list of scores for 'LABEL_1'

Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [7]:
# The following code defines a string variable 'test' that contains a news report.
test = """German police arrest 29-year-old man suspected of plotting truck attack on ice rink in Berlin, 
killing 12 people, Reuters-German police say: 'It's a shame to have been arrested,' 
a German police spokesman said in a statement."""

# The 'get_cola_scores' function is used to analyze the linguistic acceptability of the text.
# It assesses how grammatically correct and well-structured the sentence is.
cola_scores = get_cola_scores([test])
print("CoLA Scores (Linguistic Acceptability):", cola_scores)

# The 'get_neutral_scores' function is used to evaluate the sentiment of the text.
# It determines whether the text has a neutral, positive, or negative sentiment.
neutral_scores = get_neutral_scores([test])
print("Sentiment Analysis Scores (Neutral, Positive, Negative):", neutral_scores)

CoLA Scores (Linguistic Acceptability): [-0.03150612488389015]
Sentiment Analysis Scores (Neutral, Positive, Negative): [0.30984699726104736]


In [10]:
# Reinforcement Learning Configuration

# Create a configuration object for the PPO (Proximal Policy Optimization) algorithm.
config = PPOConfig(
    reward_model_path=INITIAL_MODEL,           # Specify the name of the pre-trained model to use.
    batch_size=BATCH_SIZE,              # Set the batch size for training.
    mini_batch_size=4,
    learning_rate=2e-5,                 # Define the learning rate for the optimizer.
    remove_unused_columns=False,        # Keep unused data columns in the training dataset.
    log_with="wandb",                  
    gradient_accumulation_steps=1,      # Number of gradient accumulation steps before updating the model.
)

# Set a random seed for reproducibility.
np.random.seed(42)

TypeError: PPOConfig.__init__() got an unexpected keyword argument 'log_with'

In [31]:
# Initialize a FLAN-T5 model for sequence-to-sequence tasks with a value head
t5_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    config.model_name
)  # This model will be updated during training

# Create a reference model based on the initial FLAN-T5 model
t5_model_ref = create_reference_model(t5_model)  # This reference model is never updated

# Initialize a tokenizer for the FLAN-T5 model
t5_tokenizer = AutoTokenizer.from_pretrained(config.model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [32]:
# Load the "argilla/news-summary" dataset using the load_dataset function.
# This function fetches the dataset from the Hugging Face Datasets Hub.
dataset = load_dataset("argilla/news-summary")

Downloading readme:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/31.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20417 [00:00<?, ? examples/s]

In [33]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'id', 'metadata', 'status', 'event_timestamp', 'metrics'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'id', 'metadata', 'status', 'event_timestamp', 'metrics'],
        num_rows: 20417
    })
})

In [34]:
# In this code, we are using the `map` function to process a dataset.

dataset = dataset.map(
    # Within the `map` function, we use a lambda function to transform each element in the dataset.
    lambda x: {
        "input_ids": t5_tokenizer.encode('summarize: ' + x["text"], return_tensors="pt")
        # We are adding a key "input_ids" to the dataset element's dictionary.
        # The value is generated by encoding the text with the T5 tokenizer.
        # The "summarize: " prefix is added to the input text, which is a common format for summarization tasks.
        # The `return_tensors="pt"` argument specifies that the output should be PyTorch tensors.
    },
    # The `batched` parameter is set to False, indicating that we process one element at a time.
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/20417 [00:00<?, ? examples/s]

In [35]:
# Remove unnecessary columns from the dataset.
dataset = dataset.remove_columns(['metadata', 'status', 'event_timestamp', 'metrics', 'prediction', 'prediction_agent', 'annotation'])

# Set the dataset format to PyTorch, which is commonly used for deep learning tasks.
dataset.set_format("pytorch")

In [36]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'annotation_agent', 'id', 'input_ids'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'annotation_agent', 'id', 'input_ids'],
        num_rows: 20417
    })
})

In [37]:
# Define a function called 'collator' that takes a list of dictionaries as input.
def collator(data):
    # Create a dictionary comprehension to aggregate values from each dictionary.
    # The 'key' variable iterates through the keys of the first dictionary in 'data'.
    # This assumes that 'data' is not an empty list.
    return dict((key, [d[key] for d in data]) for key in data[0])

# The function 'collator' takes a list of dictionaries and returns a new dictionary
# where keys are taken from the first dictionary in 'data', and the values are lists
# containing values associated with each key from all dictionaries in 'data'.

In [38]:
# Load our reference FLAN-T5 Model

# Import the AutoTokenizer class from the Hugging Face Transformers library to load the tokenizer for the INITIAL_MODEL model.
t5_tokenizer = AutoTokenizer.from_pretrained(INITIAL_MODEL)

# Define generation hyperparameters

# Set the minimum length of the generated output to 64 tokens.
generation_kwargs = {
    "min_length": 64,

    # Configure the number of beams for beam search. Higher values lead to more diverse but slower generation.
    "num_beams": 5,  # lookahead parameter

    # Control the repetition of n-grams in the generated text. A value of 5 reduces repetitive phrases.
    "no_repeat_ngram_size": 5,  # presence penalty

    # Enable sampling during generation to introduce randomness in the output.
    "do_sample": True,

    # Specify the padding token ID to use when generating sequences.
    "pad_token_id": t5_tokenizer.pad_token_id,

    # Set the maximum length of the generated output to 256 tokens.
    "max_length": 256,

    # Define the end-of-sequence token ID to signal the end of the generated text.
    "eos_token_id": t5_tokenizer.eos_token_id,
}

In [39]:
if TRY_SAMPLE_DATA:
    dataset['train'] = dataset['train'].select([*range(64)])
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'annotation_agent', 'id', 'input_ids'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'annotation_agent', 'id', 'input_ids'],
        num_rows: 20417
    })
})

In [40]:
# Create a PPOTrainer instance for training a Proximal Policy Optimization (PPO) model.
ppo_trainer = PPOTrainer(
    config,  # Configuration settings for the trainer.
    t5_model,  # The primary T5 model used for training.
    t5_model_ref,  # A reference T5 model for comparison or other purposes.
    t5_tokenizer,  # Tokenizer used to process input data.
    dataset['train'],  # Training dataset used to train the PPO model.
    data_collator=collator  # Data collator for processing training data batches.
)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msuicune[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Train and validate the model using PPO

In [42]:
# Loop through training epochs

for epoch in tqdm(range(N_EPOCHS)):
    avg_rewards = []
    # Iterate through batches in the data loader
    for batch in tqdm(ppo_trainer.dataloader):
        # Create a dictionary to store game data for this batch
        game_data = dict()

        # Prepend the 'summarize:' token to each text in the batch
        game_data["query"] = ['summarize: ' + b for b in batch["text"]]

        # Generate responses from the updated t5 model
        input_tensors = [_.squeeze() for _ in batch["input_ids"]]
        response_tensors = []
        for query in input_tensors:
            # Generate a response using PPO with specified generation parameters
            response = ppo_trainer.generate(query.squeeze(), **generation_kwargs)
            response_tensors.append(response.squeeze())

        # Decode the response tensors to obtain the generated text
        game_data["response"] = [t5_tokenizer.decode(r.squeeze(), skip_special_tokens=False) for r in response_tensors]

        # Calculate and store clean responses (without special tokens)
        game_data["clean_response"] = [t5_tokenizer.decode(r.squeeze(), skip_special_tokens=True) for r in response_tensors]

        # Calculate cola_scores and neutral_scores
        game_data['cola_scores'] = get_cola_scores(game_data["clean_response"])
        game_data['neutral_scores'] = get_neutral_scores(game_data["clean_response"])

        # Calculate rewards based on neutral_scores
        rewards = game_data['neutral_scores']

        # Combine cola_scores and neutral_scores into a single reward score
        transposed_lists = zip(game_data['cola_scores'], game_data['neutral_scores'])
        rewards = [1 * values[0] + 0.5 * values[1] for values in transposed_lists]
        rewards = [torch.tensor([_]) for _ in rewards]
        
        # Calculate batch average reward before the PPO training
        avg_reward = np.mean([r.cpu().numpy() for r in rewards])
        print(f"Average reward for the batch is {avg_reward:.4f}")
        
        # Append batch reward to avg rewards
        avg_rewards.append(avg_reward)

        # Run Proximal Policy Optimization (PPO) training
        stats = ppo_trainer.step(input_tensors, response_tensors, rewards)

        # Calculate and log the mean reward for this batch
        stats['env/reward'] = np.mean([r.cpu().numpy() for r in rewards])

        # Log training statistics, game data, and rewards
        ppo_trainer.log_stats(stats, game_data, rewards)
    # Print average reward at the end of each run
    print(f">>>>> Average reward for epoch {epoch} is: {np.mean(avg_rewards):.4f}" )

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

[tensor([    0,  1013,  9677,  2095,  2762, 10319, 11741,    21,  6285, 21078,
           18,   107,    32,     6, 13404,    13,  6627,   354,    77,  1531,
            6,    30,  3991,    13, 10564,    13,  2019,   826,    70, 12732,
          139,  1449,   161,    44,   112,   629,     6,     3,     9,  2095,
         2314,   243,    30,  2089,     6,     3, 18844, 16854,     3,     9,
            3,     7, 18461,    21,  1013,  7054,     3,     7,     3,     7,
        18461,    38,  2145,     5,     1], device='cuda:0'), tensor([    0, 10021, 13074,   851,    18, 10806, 20855, 10213,  1883,   160,
          525,   981, 12508,   607,     6,  6013,  3049,     5,  1914,  1230,
            6,    12,     8,   915,     6,    28,     3,     9,   580,    12,
         8994,  7459,  2523,    12,   143,   347,   112,  2055,  1104,  5146,
            5,    96,   634,  1176,   794,    21,  7459,  2523,    19,   823,
            3,    88,    56, 13859,    12,     8, 17875,  2348,    57,   334,
 

RuntimeError: No active exception to reraise

In [19]:
# Save the T5 model to the specified directory.
t5_model.save_pretrained(SAVED_MODEL)

# Save the T5 tokenizer to the same directory as the model for future use.
t5_tokenizer.save_pretrained(SAVED_MODEL)

('flan-t5-small-with-ppo/tokenizer_config.json',
 'flan-t5-small-with-ppo/special_tokens_map.json',
 'flan-t5-small-with-ppo/spiece.model',
 'flan-t5-small-with-ppo/added_tokens.json',
 'flan-t5-small-with-ppo/tokenizer.json')