# Tune T5 Paraphrase model to generate better Jeopardy question prompts

## Setup experiment

### Import dependencies

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import wandb
import time
import os
from tqdm import tqdm
import numpy as np

import re
import hashlib
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from datasets import load_dataset, load_metric, concatenate_datasets
from trl.t5 import T5HeadWithValueModel, respond_to_batch
from trl.ppo import PPOTrainer

### Configuration

In [None]:
# Experiment configuration. The whole dict is attached to the W&B run and is
# also forwarded as keyword arguments to the PPO trainer.
config = {
    # Checkpoints: policy model, reference model, QA model and tokenizer
    "lm_name": "Vamsi/T5_Paraphrase_Paws",
    "ref_lm_name": "Vamsi/T5_Paraphrase_Paws",
    "cls_model_name": "vblagoje/bert-base-searchqa",
    "tk_name": "t5-base",

    # Training schedule: the loop runs ceil(steps / batch_size) iterations and
    # splits forward passes into chunks of forward_batch_size
    "steps": 25600,
    "batch_size": 256,
    "forward_batch_size": 16,
    "ppo_epochs": 4,

    # Query / response lengths, in tokens
    "txt_in_len": 5,
    "txt_out_len": 15,

    # Remaining PPO hyper-parameters (consumed by the trainer via **config)
    "lr": 1.41e-5,
    "init_kl_coef": 0.2,
    "target": 6,
    "horizon": 10000,
    "gamma": 1,
    "lam": 0.95,
    "cliprange": 0.2,
    "cliprange_value": 0.2,
    "vf_coef": 0.1,
}

You can see that we load a T5 paraphrase model, `Vamsi/T5_Paraphrase_Paws`, both as the model to optimise (`lm_name`) and as the reference model (`ref_lm_name`), together with the `t5-base` tokenizer. A BERT question-answering model, `vblagoje/bert-base-searchqa`, is used to compute the reward signal. The other parameters are mostly taken from the original paper ["Fine-Tuning Language Models from Human Preferences"](https://arxiv.org/pdf/1909.08593.pdf). The T5 model as well as the BERT model is available in the Huggingface model zoo [here](https://huggingface.co/models). The following code should automatically download the models.

### Initialize W&B logger
We use `wandb` to log all the metrics during training.

In [None]:
# Start a W&B run and attach the whole `config` dict to it
wandb.init(name='run-42', project='t5-aqa', config=config)

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Offline run mode, not syncing to the cloud.
[34m[1mwandb[0m: W&B syncing is set to `offline` in this directory.  Run `wandb online` to enable cloud syncing.


## Load data and models

### Load SearchQA and convert it to SQuAD format
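Each SearchQA example provides a Jeopardy question, its answer and a list of search-result snippets. The helpers below join the first 10 snippets into a single context, clean it up, look for the answer inside it and emit a SQuAD-style record with the fields `id`, `title`, `question`, `answers` and `context`.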

In [None]:
def strip_html(txt):
  """Return `txt` with every `<...>` span replaced by a single space."""
  # Non-greedy match so each tag is replaced on its own
  tag_pattern = re.compile("<.*?>")
  return tag_pattern.sub(" ", txt)

def remove_special_chars(txt):
  """Drop every character that is not an ASCII letter, a digit, whitespace or
  one of . , ! ? / : ; " ' and then remove newlines.

  Args:
    txt: input string.

  Returns:
    The filtered string with all newline characters removed.
  """
  # The range must be A-Z: the former A-z also spans the ASCII characters
  # between 'Z' and 'a' ([, \, ], ^, _ and `), which were silently kept.
  pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
  result = re.sub(pat, "", txt)
  # \s in the whitelist keeps newlines, so they are removed explicitly here
  return result.replace("\n", "")


def clean(txt):
  """Strip HTML tags from `txt`, then drop special characters and newlines.

  Args:
    txt: raw text, possibly containing HTML markup.

  Returns:
    The cleaned string.
  """
  # Tags have to be stripped first: remove_special_chars() deletes '<' and '>',
  # so running it before strip_html() leaves nothing for the tag pattern to
  # match and the tag names end up in the text (e.g. "<b>x</b>" -> "bx/b").
  return remove_special_chars(strip_html(txt))

def convertSearchQAExampleToSquadExample(example):
  """Convert one SearchQA record into a SQuAD-style record.

  The first 10 non-null search snippets are concatenated and cleaned to form
  the context. Every occurrence of the answer surrounded by single spaces is
  then located in that cleaned context.

  Args:
    example: a SearchQA record; the keys read here are "question", "answer"
      and "search_results" (with a "snippets" list).

  Returns:
    A dict with the keys "id" (SHA-1 hex digest of the question), "title",
    "question", "answers" and "context". "answers" holds "answer_start" and
    "text" lists when the answer was found, and is an empty dict otherwise.
  """
  snippets = [s for s in example["search_results"]["snippets"][:10] if s is not None]
  # Clean BEFORE searching: the offsets must index into the context that is
  # actually returned, and cleaning removes/replaces characters.
  context = clean("".join(snippets))

  answer = example["answer"]
  answers = {}
  # An empty answer would turn the pattern into two spaces and produce bogus hits
  if answer:
    answer_for_match = ' ' + re.escape(answer) + ' '
    # +1 skips the leading space that is part of the pattern
    answer_start = [match.start() + 1 for match in re.finditer(answer_for_match, context)]
    if answer_start:
      answers = {
          'answer_start': answer_start,
          'text': [answer] * len(answer_start)
      }

  example_id = hashlib.sha1(example["question"].encode()).hexdigest()
  return {"id": example_id,
          "title": example["question"],
          "question": example["question"],
          "answers": answers,
          "context": context}

## Load SearchQA and convert it to SQuAD

In [None]:
# Load the `train_test_val` configuration of SearchQA
search_qa = load_dataset("search_qa", "train_test_val")

# Re-shape every example into the SQuAD layout and drop the original SearchQA columns
searchqa_columns = search_qa["train"].column_names
squad_qa = search_qa.map(convertSearchQAExampleToSquadExample, remove_columns=searchqa_columns)


def _has_answer(example):
  """True when an answer position was recorded for `example`."""
  # NOTE(review): relies on unanswered examples (converted with an empty
  # `answers` dict) surfacing with `answer_start` set to None — confirm.
  return example["answers"]["answer_start"] is not None


# Discard the questions whose answer was not found in the context
squad_qa = squad_qa.filter(_has_answer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3067.0, style=ProgressStyle(description…



Downloading and preparing dataset search_qa/train_test_val (download: 2.93 GiB, generated: 6.99 GiB, post-processed: Unknown size, total: 9.93 GiB) to /Users/vblagoje/.cache/huggingface/datasets/search_qa/train_test_val/1.0.0/a2a9f2281af3826aaca532a2214573f11c1979499ac14b5639c7f02ac3ff0c63...

Dataset search_qa downloaded and prepared to /Users/vblagoje/.cache/huggingface/datasets/search_qa/train_test_val/1.0.0/a2a9f2281af3826aaca532a2214573f11c1979499ac14b5639c7f02ac3ff0c63. Subsequent calls will reuse this data.








HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1778.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Downloading', max=1.0, style=ProgressSt…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=151295.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=43228.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21613.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=152.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=44.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))

### Show an example entry item in the dataset

In [None]:
# Peek at one converted training record (index 6)
squad_qa["train"][6]

{'answers': {'answer_start': [18], 'text': ['Jesse James']},
 'context': "In the Wild West, Jesse James was legendary  a Robin Hoodlike figure who the public loved and lawmakers hated. The outlaw's notorious bank robbing spree...Jesse James, one of America's most notorious outlaws, is shot to death by Robert Ford, a member of his gang who hoped to collect the bounty on Jesse's head.",
 'id': 'b64e19f081a8b60c3a2b6742bd66a7275d46b5a3',
 'question': 'Outlaw: "Murdered by a traitor and a coward whose name is not worthy to appear here"',
 'title': 'Outlaw: "Murdered by a traitor and a coward whose name is not worthy to appear here"'}

### Load QA pipeline


In [None]:
# QA model and tokenizer both come from the `cls_model_name` checkpoint
qa_model = AutoModelForQuestionAnswering.from_pretrained(config["cls_model_name"])
q_tokenizer = AutoTokenizer.from_pretrained(config["cls_model_name"])
# Tell the pipeline which device to run on. `qa_model` is moved to the GPU
# further down; a pipeline created without `device` would keep placing its
# inputs on the CPU and fail against a CUDA model. 0 = first GPU, -1 = CPU.
qa_device = 0 if torch.cuda.is_available() else -1
qa = pipeline("question-answering", model=qa_model, tokenizer=q_tokenizer, device=qa_device)

The pipeline returns the predicted answer span (`answer`, `start`, `end`) together with a confidence `score`. We will use this output as a reward signal for the language model.

In [None]:
# Sanity check: run the QA pipeline on one converted training example
training_example_id = 6
sample = squad_qa["train"][training_example_id]
context, q = sample["context"], sample["question"]
qa(question=q, context=context)

{'score': 0.8468492031097412, 'start': 18, 'end': 29, 'answer': 'Jesse James'}

NameError: name 'sentiment_model' is not defined

The resulting reward signal:

### Load pre-trained T5 paraphrase models

We load the T5 paraphrase model with a value head and the tokenizer. We load the model twice; the first model is optimized while the second model serves as a reference to calculate the KL-divergence from the starting point. This serves as an additional reward signal in the PPO training to make sure the optimized model does not deviate too much from the original language model.

In [None]:
# Model to be optimised: T5 with a value head, loaded from `lm_name`
paraphrase_model = T5HeadWithValueModel.from_pretrained(config['lm_name'])
# Second instance, loaded from `ref_lm_name`, kept as the reference model
paraphrase_model_ref = T5HeadWithValueModel.from_pretrained(config['ref_lm_name'])
# Tokenizer comes from `tk_name`, not from the paraphrase checkpoint itself
tokenizer = AutoTokenizer.from_pretrained(config['tk_name'])

Some weights of T5HeadWithValueModel were not initialized from the model checkpoint at Vamsi/T5_Paraphrase_Paws and are newly initialized: ['v_head.state_representation.weight', 'v_head.state_representation.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5HeadWithValueModel were not initialized from the model checkpoint at Vamsi/T5_Paraphrase_Paws and are newly initialized: ['v_head.state_representation.weight', 'v_head.state_representation.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…






HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…

### Watch model with wandb
This wandb magic logs the gradients and weights of the model during training.

In [None]:
# Register the model being optimised with wandb; log='all' requests gradients and weights
wandb.watch(paraphrase_model, log='all')

[<wandb.wandb_torch.TorchGraph at 0x7facd213f610>]

### Move models to GPU

If `cuda` is available move the computations to the GPU.

In [None]:
# Pick the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Move all three models to the selected device.
# The results are assigned to `_` so the cell does not display a model repr.
_ = paraphrase_model.to(device)
_ = qa_model.to(device)
_ = paraphrase_model_ref.to(device)

AttributeError: 'str' object has no attribute 'to'

### Tokenize IMDB reviews

We tokenize all IMDB reviews in advance to avoid tokenizing twice. In the first step we encode the queries and slice the first `txt_in_len` tokens. In a second step we decode these tokens back to text for later display.

In [None]:
# Encode each review and keep only its first `txt_in_len` token ids (on `device`).
# NOTE(review): `df` and `gpt2_tokenizer` are not defined in any cell shown in this
# notebook, and `progress_apply` presumably needs `tqdm.pandas()` to be registered
# first — TODO confirm / port to the SearchQA data and the T5 `tokenizer`.
df['tokens'] = df['review'].progress_apply(lambda x: gpt2_tokenizer.encode(x, return_tensors="pt").to(device)[0, :config['txt_in_len']])

In [None]:
# Decode the truncated token ids back to text so the queries can be displayed later.
# NOTE(review): `df` and `gpt2_tokenizer` are not defined in any cell shown in this
# notebook — TODO confirm / port.
df['query'] = df['tokens'].progress_apply(lambda x: gpt2_tokenizer.decode(x))

## Optimize model

**Steps**

The training loop consists of the following steps:
1. Get a batch of queries
2. Get the query responses from the policy
3. Join query and responses and tokenize for BERT analysis
4. Get sentiments for query/responses from BERT
5. Optimize policy with PPO using the (query, response, reward) triplet
6. Log all the training statistics

**Forward batching**

Since the models can be fairly big and we want to roll out large PPO batches, this can lead to out-of-memory errors when doing the forward passes for text generation and sentiment analysis. We introduce the parameter `forward_batch_size` to split the forward passes into smaller batches. Although this hurts performance a little, this is negligible compared to the computations of the backward passes when optimizing the model. The same parameter is used in the `PPOTrainer` when doing forward passes. The `batch_size` should be a multiple of `forward_batch_size`.

**Training time**

This step takes **~2h** on a P6000 GPU with the above specified settings.

In [None]:
# PPO training loop: sample queries, generate responses, score them, run one
# PPO step on the (query, response, reward) triplets and log the statistics.
#
# NOTE(review): this cell uses `gpt2_model`, `gpt2_model_ref`, `gpt2_tokenizer`,
# `df`, `sentiment_model`, `sentiment_tokenizer` and `build_bert_batch_from_txt`.
# None of these is defined or imported in the cells shown in this notebook, which
# load `paraphrase_model`, `paraphrase_model_ref`, `tokenizer` and the `qa`
# pipeline instead — TODO port before running.
# The trainer receives the whole config dict as keyword arguments.
ppo_trainer = PPOTrainer(gpt2_model, gpt2_model_ref, **config)
fbs = config['forward_batch_size']

# One iteration per PPO batch: ceil(steps / batch_size) iterations in total
for epoch in tqdm(range(int(np.ceil(config["steps"]/config['batch_size'])))):
    torch.cuda.empty_cache()
    # Per-iteration containers: everything in `logs` is sent to wandb at the end
    logs = dict()
    game_data = dict()
    timing = dict()
    t0 = time.time()
    
    #### get a batch from the dataset
    df_batch = df.sample(config['batch_size'])
    game_data['query'] = df_batch['query'].tolist()
    query_tensors = torch.stack(df_batch['tokens'].tolist())
    
    #### get response from gpt2
    t = time.time()
    # NOTE(review): `total_length` is not read anywhere else in this cell
    total_length = config['txt_in_len']+config['txt_out_len']
    response_tensors = []
    # Generate in chunks of `fbs` queries; the chunk count is truncated to an
    # int, so `batch_size` is expected to be a multiple of `forward_batch_size`
    for i in range(int(config['batch_size']/fbs)):
        response  = respond_to_batch(gpt2_model, query_tensors[i*fbs:(i+1)*fbs],
                                        txt_len=config['txt_out_len'])
        response_tensors.append(response)
    response_tensors = torch.cat(response_tensors)
    game_data['response'] = [gpt2_tokenizer.decode(response_tensors[i, :]) for i in range(config['batch_size'])]
    timing['time/get_response'] = time.time()-t

    #### tokenize text for sentiment analysis
    t = time.time()
    # Each scored text is the query string directly followed by the response string
    texts = [q + r for q,r in zip(game_data['query'], game_data['response'])]
    sentiment_inputs, attention_masks = build_bert_batch_from_txt(texts, sentiment_tokenizer, device)    
    timing['time/build_input_sentiment'] = time.time()-t

    #### get sentiment score
    t = time.time()
    rewards = []
    for i in range(int(config['batch_size']/fbs)):
        # Reward = column 1 of the first model output, detached from the graph
        # (presumably the positive-class logit — TODO confirm)
        res = sentiment_model.forward(sentiment_inputs[i*fbs:(i+1)*fbs],
                                      attention_masks[i*fbs:(i+1)*fbs])[0][:, 1].detach()
        rewards.append(res)
    rewards = torch.cat(rewards)
    timing['time/get_sentiment_preds'] = time.time()-t

    #### Run PPO training 
    t = time.time()
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    timing['time/optimization'] = time.time()-t
     
    #### Log everything
    timing['time/epoch'] = time.time()-t0
    # One table row per sample: [query text, response text, reward]
    table_rows = [list(r) for r in zip(game_data['query'], game_data['response'], rewards.cpu().tolist())]
    logs.update({'game_log':wandb.Table(
        columns=['query', 'response', 'reward'],
        rows=table_rows)})
    logs.update(timing)
    logs.update(stats)
    logs['env/reward_mean'] = torch.mean(rewards).cpu().numpy()
    logs['env/reward_std'] = torch.std(rewards).cpu().numpy()
    logs['env/reward_dist'] = rewards.cpu().numpy()
    wandb.log(logs)

### Training progress
If you are tracking the training progress with Weights&Biases you should see a plot similar to the one below. Check out the interactive sample report on wandb.ai: [link](https://app.wandb.ai/lvwerra/trl-showcase/runs/1jtvxb1m/).

<div style="text-align: center">
<img src='images/gpt2_tuning_progress.png' width='800'>
<p style="text-align: center;"> <b>Figure:</b> Reward mean and distribution evolution during training. </p>
</div>

One can observe how the model starts to generate more positive outputs after a few optimisation steps.

> Note: Investigating the KL-divergence will probably show that at this point the model has not yet converged to the target KL-divergence. Getting there would require longer training or starting with a higher initial coefficient.

## Model inspection
Let's inspect some examples from the IMDB dataset. We can use `gpt2_model_ref` to compare the tuned model `gpt2_model` against the model before optimisation.

In [None]:
# Compare responses and rewards of the reference model ("before") and the
# optimised model ("after") on the same small batch of queries.
#
# NOTE(review): `df`, `gpt2_model`, `gpt2_model_ref`, `gpt2_tokenizer`,
# `sentiment_model`, `sentiment_tokenizer` and `build_bert_batch_from_txt` are not
# defined in any cell shown in this notebook, and `pd` is not imported in the
# import cell — TODO port before running.

#### get a batch from the dataset
bs = 16
game_data = dict()
df_batch = df.sample(bs)
game_data['query'] = df_batch['query'].tolist()
query_tensors = torch.stack(df_batch['tokens'].tolist())

#### get response from gpt2 and gpt2_ref
# NOTE(review): `total_length` is not read anywhere else in this cell
total_length = config['txt_in_len']+config['txt_out_len']
response_tensors_ref  = respond_to_batch(gpt2_model_ref, query_tensors, txt_len=config['txt_out_len'])
game_data['response (before)'] = [gpt2_tokenizer.decode(response_tensors_ref[i, :]) for i in range(bs)]

response_tensors  = respond_to_batch(gpt2_model, query_tensors, txt_len=config['txt_out_len'])
game_data['response (after)'] = [gpt2_tokenizer.decode(response_tensors[i, :]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
# Same scoring as in the training loop: query text + response text, then
# column 1 of the first model output
texts = [q + r for q,r in zip(game_data['query'], game_data['response (before)'])]
sentiment_inputs, attention_masks = build_bert_batch_from_txt(texts, sentiment_tokenizer, device)    
rewards = sentiment_model.forward(sentiment_inputs, attention_masks)[0][:, 1].detach()
game_data['rewards (before)'] = rewards.cpu().numpy()

texts = [q + r for q,r in zip(game_data['query'], game_data['response (after)'])]
sentiment_inputs, attention_masks = build_bert_batch_from_txt(texts, sentiment_tokenizer, device)    
rewards = sentiment_model.forward(sentiment_inputs, attention_masks)[0][:, 1].detach()
game_data['rewards (after)'] = rewards.cpu().numpy()

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results

Looking at the reward mean/median of the generated sequences we observe a significant difference.

In [None]:
# Mean and median of the reward columns before/after optimisation.
# numeric_only=True restricts the aggregates to the numeric columns:
# `df_results` also holds the query/response text columns, on which recent
# pandas versions raise instead of silently skipping them.
print('mean:')
display(df_results.mean(numeric_only=True))
print()
print('median:')
display(df_results.median(numeric_only=True))

## Save model
Finally, we save the model to disk for later usage.

In [None]:
# Save the model and tokenizer to a local directory.
# NOTE(review): `gpt2_model` / `gpt2_tokenizer` are not defined in any cell shown
# in this notebook (the loaded objects are `paraphrase_model` and `tokenizer`) —
# confirm what should be saved and under which directory name.
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError when the directory is left over from a previous run.
os.makedirs('gpt2-imdb-pos', exist_ok=True)
gpt2_model.save_pretrained('gpt2-imdb-pos')
gpt2_tokenizer.save_pretrained('gpt2-imdb-pos')