# Experimental implementation of Informal-GPT

This notebook will finetune GPT-2 medium model to talk in informal way.

Will use

- [s-nlp/roberta-base-formality-ranker](https://huggingface.co/s-nlp/roberta-base-formality-ranker) as a reward model
- [robowaifudev/megatron-gpt2-345m](https://huggingface.co/robowaifudev/megatron-gpt2-345m) as a autoregressive text generator
- [wikitext](https://huggingface.co/datasets/wikitext) as a query source

## Preparation & Configuration

In [1]:
import datetime
import os
from typing import Any, Dict, List, Optional, Union

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

# Hugging Face transformers
from transformers import pipeline, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
from datasets import load_dataset, Dataset

# TRL library
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

# IDs of models to use
REWARD_MODEL_ID = "s-nlp/roberta-base-formality-ranker"
TEXT_GENERATION_MODEL_ID = "robowaifudev/megatron-gpt2-345m"
# IDs of dataset to use
DATASET_ID = "wikitext"

PPO Configurations

In [2]:
GPT_BATCH_SIZE = 300
BERT_BATCH_SIZE = 16

# os.environ["WANDB_PROJECT"] = "informal-gpt"
ppo_config = PPOConfig(
    batch_size=GPT_BATCH_SIZE,
    model_name=TEXT_GENERATION_MODEL_ID,
    learning_rate=1.41e-5,  # NOTE: This parameter is taken from OpenAI's paper: "Fine-Tuning Language Models from Human Preferences"
    log_with="wandb",
)
datetime_str = str(datetime.datetime.now()).replace(" ", "_").replace(":", "-")
checkpoint_dir_name = f"megatron-gpt2-medium-and-bert-based-formality-ranker-{datetime_str}"

bert_kwargs = {
    "return_all_scores": True,  # make BERT return scores for all classes, not only the most likely one
    "function_to_apply": "none",
    "batch_size": BERT_BATCH_SIZE,
}

fatal: No names found, cannot describe anything.


connect to wandb

In [3]:
import wandb
wandb.init(
    project="informal-gpt",
    name=f"megatron-gpt2-medium-and-bert-based-formality-ranker-{datetime_str}",
)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mt0d4[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Prepare **wikitext** dataset

define function for dataset preparation

In [4]:
def build_dataset(
    dataset_id: str,
    dataset_size: int,
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    config_name: Optional[str] = None,
    input_min_text_length: int = 4,
    input_max_text_length: int = 8,
) -> DataLoader:
    
    if config_name:
        ds = load_dataset(dataset_id, config_name, split="train")
    else:
        ds = load_dataset(
            path=dataset_id,
            split="train",  # retrieve only train split
        )
    ds = ds.filter(
        function=lambda item: len(item["text"]) > 150,  # only use sentences whose length is more than 150 characters
        batched=False,
    )
    
    ds = ds.shuffle()
    ds = ds[:dataset_size]
    ds = Dataset.from_dict(ds)  # ds becomes `dict` when sliced, so change it back to Dataset
    print(ds)
    
    def tokenize(item):
        """
            item["input_ids"] will be a sequence of first `get_input_size()` tokens converted from the input text.
            item["query"] will be a human-readable raw text decoded from item["input_ids"]
        """
        get_input_size = LengthSampler(
            min_value=input_min_text_length,
            max_value=input_max_text_length,
        )
        item["input_ids"] = tokenizer.encode(text=item["text"])[:get_input_size()]
        item["query"] = tokenizer.decode(token_ids=item["input_ids"])
        return item
    
    ds = ds.map(
        function=tokenize,
        batched=False,
        num_proc=8,
    )
    ds.set_format(type="torch")
    return ds

instantiate tokenizer and do preprocessing

In [5]:
# reference: imdb dataset's train split contains 24895 rows
DATASET_SIZE = 24895

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=ppo_config.model_name,
)
tokenizer.pad_token = tokenizer.eos_token  # use <EOS> token for padding

dataset = build_dataset(
    dataset_id=DATASET_ID,
    dataset_size = DATASET_SIZE,
    config_name="wikitext-103-raw-v1",
    tokenizer=tokenizer,
)


def dataset_collator(data: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
    # Data is like list of dictionaries.
    # We format them as a batch
    return dict((key, [d[key] for d in data]) for key in data[0])



Dataset({
    features: ['text'],
    num_rows: 40000
})


Map (num_proc=8):   0%|          | 0/40000 [00:00<?, ? examples/s]

## Load pre-trained text generation models

In [7]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    pretrained_model_name_or_path=ppo_config.model_name,
)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    pretrained_model_name_or_path=ppo_config.model_name,
)

## Initialize PPOTrainer

In [8]:
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=dataset_collator,
    # TODO: use learning rate scheduler
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667026193366231, max=1.0)…

In [9]:
# for batch in ppo_trainer.dataloader:
#     print(batch["input_ids"][0])
#     print(ppo_trainer.generate(batch["input_ids"][0]))
#     break

## Load BERT-based classification model to use as Reward Model

In [10]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    if torch.cuda.is_available():
        device = 0
    else:
        device = "cpu"  # although this is not feasible
formality_pipe = pipeline(
    task="text-classification",
    model=REWARD_MODEL_ID,
    device=device,
)

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


## Optimize model

output directory setting

In [None]:
checkpoint_dirpath = os.path.join("checkpoints", checkpoint_dir_name)
os.makedirs(checkpoint_dirpath, exist_ok=False)

define generation settings

In [11]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 50,  # NOTE: Here's room for optimization
    "top_p": .95,  # NOTE: Here's room for optimization
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id
}

### Define training loop

In [None]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(
    min_value=output_min_length,
    max_value=output_max_length,
)

batch_count = len(ppo_trainer.dataloader)
save_interval = batch_count // 5
for batch_pos, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    queries_tensor = batch["input_ids"]
    
    # retrieve response from text generator (faster implementation than tutorial)
    # generation_kwargs["max_new_tokens"] = output_max_length
    # completions = ppo_trainer.generate(queries_tensor, **generation_kwargs)
    # responses_tensor = []
    # for i in range(completions.shape[0]):
    #     generation_len = output_length_sampler()
    #     responses_tensor.append(completions[i, :generation_len])
    
    completions_tensor = []
    for query in queries_tensor:
        generation_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = generation_len
        completion = ppo_trainer.generate(query, **generation_kwargs)
        completions_tensor.append(completion.squeeze()[-generation_len:])
    batch["response"] = [tokenizer.decode(c.squeeze()) for c in completions_tensor]
    
    # compute informality score
    completed_texts = [q + c for q, c in zip(batch["query"], batch["response"])]
    pipe_outputs = formality_pipe(completed_texts, **bert_kwargs)
    # output[0] is a score for "informal"experiments/checkpoints/megatron-gpt2-medium-and-bert-based-formality-ranker-2023-07-19_10-20-47.357590/checkpoint-last
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]

    # execute PPO to tune parameter
    stats = ppo_trainer.step(
        queries=queries_tensor,
        responses=completions_tensor,
        scores=rewards,
    )
    ppo_trainer.log_stats(
        stats=stats,
        batch=batch,
        rewards=rewards,
    )

    if batch_pos % save_interval == 0:
        model.save_pretrained(
            save_directory=os.path.join(
                checkpoint_dirpath, f"checkpoint-at-step-{batch_pos}"
            )
        )
        tokenizer.save_pretrained(
            save_directory=os.path.join(
                checkpoint_dirpath, f"checkpoint-at-step-{batch_pos}"
            )
        )

save the model at the end of training

In [None]:
model.save_pretrained(
    save_directory=os.path.join(checkpoint_dirpath, "checkpoint-last")
)
tokenizer.save_pretrained(
    save_directory=os.path.join(checkpoint_dirpath, "checkpoint-last")
)

## try prediction

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from trl import AutoModelForCausalLMWithValueHead
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    pretrained_model_name_or_path="checkpoints/megatron-gpt2-medium-and-bert-based-formality-ranker-2023-07-19_10-20-47.380135/checkpoint-last"
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="checkpoints/megatron-gpt2-medium-and-bert-based-formality-ranker-2023-07-19_10-20-47.380135/checkpoint-last"
)

gpt_generation_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    top_p=0.95,
    top_k=50,
    max_length=50,
)

Some weights of the model checkpoint at checkpoints/megatron-gpt2-medium-and-bert-based-formality-ranker-2023-07-19_10-20-47.380135/checkpoint-last were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'AutoModelForCausalLMWithValueHead' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCaus

In [14]:
gpt_generation_pipe("My laptop is")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My laptop is just a little bit bit bit bit bit bit bit of a big-by-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a-a'}]