In [2]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.15.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet
%pip install urllib3==1.26.17
# Reinforcement Learning library
%pip install git+https://github.com/lvwerra/trl.git@25fa1bd    

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pathos 0.3.2 requires dill>=0.3.8, but you have dill 0.3.7 which is incompatible.
pathos 0.3.2 requires multiprocess>=0.70.16, but you have multiprocess 0.70.15 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Collecting urllib3==1.26.17
  Using cached urllib3-1.26.17-py2.py3-none-any.whl.metadata (48 kB)
Using cached urllib3-1.26.17-py2.py3-none-any.whl (143 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.18
    Uninstalling urllib3-1.26.18:
      Successfully uninstalled urllib3-1.26.18
[31mERROR: pip's dependency resolver does not currently tak

In [4]:
import pandas as pd
import numpy as np
import evaluate
import time
import torch

from datasets import load_dataset

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import GenerationConfig

from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

from tqdm import tqdm
tqdm.pandas()

In [60]:
model_name = 'google/flan-t5-base'
huggingface_dataset_name = "knkarthick/dialogsum"
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
not_hate_index = 0

In [40]:
def get_dataset_from_huggingface(dataset_name: str):
    dataset = load_dataset(dataset_name)
    return dataset


def load_s2s_pertained_model(model_name: str, is_float_16: bool = True):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if is_float_16:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
        return model, tokenizer
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        return model, tokenizer
    
def tokenize_function(record, tokenizer):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in record["dialogue"]]
    record['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    record['labels'] = tokenizer(record["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return record

In [71]:
def get_peft_trainer(base_model, lora_config: LoraConfig, peft_training_args:TrainingArguments, train_dataset):
    train_peft_model = get_peft_model(base_model, lora_config)
    peft_trainer = Trainer(
        model=train_peft_model,
        args=peft_training_args,
        train_dataset=train_dataset,
    )
    return peft_trainer


def get_ppo_model_from_adapter(base_model, lora_config, adapter_path: str):
    peft_model = PeftModel.from_pretrained(base_model, 
                                           adapter_path,
                                           torch_dtype=torch.bfloat16,
                                           lora_config=lora_config,
                                           device_map="auto",                                       
                                           is_trainable=True)
    ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,                                                               
                                                                   torch_dtype=torch.bfloat16,
                                                                   is_trainable=True)
    return ppo_model
                                           

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])


def evaluate_toxicity(model, toxicity_evaluator, tokenizer, dataset, num_samples: int):

    max_new_tokens=100

    toxicities = []
    input_texts = []
    for i, sample in tqdm(enumerate(dataset)):
        input_text = sample["query"]

        if i > num_samples:
            break
            
        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids       
        generation_config = GenerationConfig(max_new_tokens=max_new_tokens, top_k=0.0, top_p=1.0, do_sample=True)
        response_token_ids = model.generate(input_ids=input_ids,
                                            generation_config=generation_config)
        
        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)        
        toxicity_score = toxicity_evaluator.compute(predictions=[(input_text + " " + generated_text)])
        toxicities.extend(toxicity_score["toxicity"])
       
    return  np.mean(toxicities), np.std(toxicities)


def get_ppo_trainer(ppo_model, ppo_config, tokenizer, dataset, data_collator):
    ref_model = create_reference_model(ppo_model)
    ppo_trainer = PPOTrainer(config=ppo_config, 
                             model=ppo_model, 
                             ref_model=ref_model, 
                             tokenizer=tokenizer, 
                             dataset=dataset, 
                             data_collator=data_collator)
    
    return ppo_trainer


def build_ppo_dataset(model_name,
                      dataset_name,
                      tokenizer,
                      input_min_text_length, 
                      input_max_text_length):

    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.
    - input_min_text_length (int): Minimum length of the dialogues.
    - input_max_text_length (int): Maximum length of the dialogues.
        
    Returns:
    - dataset_splits (datasets.dataset_dict.DatasetDict): Preprocessed dataset containing train and test parts.
    """
    
    # load dataset (only "train" part will be enough for this lab).
    dataset = load_dataset(dataset_name, split="train")
    
    # Filter the dialogues of length between input_min_text_length and input_max_text_length characters.
    dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length, batched=False)

    # Prepare tokenizer. Setting device_map="auto" allows to switch between GPU and CPU automatically.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")
    
    def tokenize(sample):
        
        # Wrap each dialogue with the instruction.
        prompt = f"""
Summarize the following conversation.

{sample["dialogue"]}

Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)
        
        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")
    
    # Split the dataset into train and test parts.
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits  

    
def train_ppo_model(ppo_trainer, tokenizer, sentiment_pipe, generation_kwargs, reward_kwargs):
    output_min_length = 100
    output_max_length = 400
    output_length_sampler = LengthSampler(output_min_length, output_max_length)
    max_ppo_steps = 10
    
    for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
        if step >= max_ppo_steps:
            break   

        prompt_tensors = batch["input_ids"]

        # Get response from FLAN-T5/PEFT LLM.
        summary_tensors = []

        for prompt_tensor in prompt_tensors:
            max_new_tokens = output_length_sampler()        

            generation_kwargs["max_new_tokens"] = max_new_tokens
            summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)

            summary_tensors.append(summary.squeeze()[-max_new_tokens:])

        # This needs to be called "response".
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

        # Compute reward outputs.
        query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]    
        rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

        reward_tensors = [torch.tensor(reward[not_hate_index]["score"]) for reward in rewards]    

        # Run PPO step.
        stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
        ppo_trainer.log_stats(stats, batch, reward_tensors)
    
    return ppo_trainer
    


In [31]:
dataset = get_dataset_from_huggingface(model_name)
base_model, base_model_tokenizer = load_s2s_pertained_model(huggingface_dataset_name)


In [41]:
tokenized_datasets = dataset.map(lambda example: tokenize_function(example, base_model_tokenizer), batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 20 == 0, with_indices=True)

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [43]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

output_dir = "./tmp"
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    optim="adamw_torch",
    learning_rate=5e-4, # Higher learning rate than full fine-tuning.
    logging_steps=1,
    max_steps=25   
)

peft_trainer = get_peft_trainer(base_model, lora_config, peft_training_args, tokenized_datasets["train"])


In [44]:
peft_trainer.train()

Step,Training Loss
1,50.75
2,48.5
3,46.0
4,44.75
5,42.5
6,40.0
7,38.25
8,37.25
9,34.0
10,33.5


TrainOutput(global_step=25, training_loss=32.42, metrics={'train_runtime': 1285.8023, 'train_samples_per_second': 0.156, 'train_steps_per_second': 0.019, 'total_flos': 139125797683200.0, 'train_loss': 32.42, 'epoch': 0.02})

In [87]:
## save peft model to local checkpoint

peft_model_checkpoint_path="./checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_checkpoint_path)
base_model_tokenizer.save_pretrained(peft_model_checkpoint_path)

('./checkpoint-local/tokenizer_config.json',
 './checkpoint-local/special_tokens_map.json',
 './checkpoint-local/tokenizer.json')

In [82]:
ppo_model = get_ppo_model_from_adapter(base_model, lora_config, "./checkpoint-local")
ref_model = create_reference_model(ppo_model)

# reward model
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map="auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map="auto")

toxicity_evaluator = evaluate.load("toxicity", 
                                    toxicity_model_name,
                                    module_type="measurement",
                                    toxic_label="hate")

Detected kernel version 4.14.336, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [73]:
ppo_dataset = build_ppo_dataset(model_name=model_name,
                        dataset_name=huggingface_dataset_name,
                        tokenizer=base_model_tokenizer,
                        input_min_text_length=200, 
                        input_max_text_length=1000)

In [76]:
learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(
    model_name=model_name,    
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

ppo_trainer = get_ppo_trainer(ppo_model, config, base_model_tokenizer, ppo_dataset['train'], collator)



Detected kernel version 4.14.336, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [77]:
device = 0 if torch.cuda.is_available() else "cpu"
sentiment_pipe = pipeline("sentiment-analysis", 
                          model=toxicity_model_name, 
                          device=device)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 16
}

ppo_trainer =  train_ppo_model(ppo_trainer, base_model_tokenizer, sentiment_pipe, generation_kwargs, reward_kwargs)


10it [14:23, 86.40s/it]


In [78]:
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model, 
                                                                        toxicity_evaluator=toxicity_evaluator, 
                                                                        tokenizer=base_model_tokenizer, 
                                                                        dataset=ppo_dataset["test"], 
                                                                        num_samples=10)
print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

11it [00:14,  1.30s/it]

toxicity [mean, std] after detox: [0.01364099031674083, 0.023025788777843587]





In [84]:
# code for comparing ppo model (after trained by reward model) and ref model (before trained by reward model)

batch_size = 20
compare_results = {}

output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

df_batch = ppo_dataset["test"][0:batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

summary_tensors_ref = []
summary_tensors = []

# Get response from ppo and base model.
for i in tqdm(range(batch_size)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len
    
    summary = ref_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors_ref.append(summary)

    summary = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors.append(summary)

# Decode responses.
compare_results["response_before"] = [base_model_tokenizer.decode(summary_tensors_ref[i]) for i in range(batch_size)]
compare_results["response_after"] = [base_model_tokenizer.decode(summary_tensors[i]) for i in range(batch_size)]

# Sentiment analysis of query/response pairs before/after.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [reward[not_hate_index]["score"] for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [reward[not_hate_index]["score"] for reward in rewards_after]

100%|██████████| 20/20 [01:04<00:00,  3.22s/it]


In [None]:
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(compare_results)
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
df_compare_results_sorted = df_compare_results.sort_values(by=['reward_diff'], ascending=False).reset_index(drop=True)
df_compare_results_sorted

In [90]:
rlhf_model_checkpoint_path="./rlhf-checkpoint-local"
Path(rlhf_model_checkpoint_path).mkdir(parents=True, exist_ok=True)
ppo_trainer.model.save_pretrained(rlhf_model_checkpoint_path)
base_model_tokenizer.save_pretrained(rlhf_model_checkpoint_path)

('./rlhf-checkpoint-local/tokenizer_config.json',
 './rlhf-checkpoint-local/special_tokens_map.json',
 './rlhf-checkpoint-local/tokenizer.json')

In [91]:
!aws s3 cp --recursive ./rlhf-checkpoint-local/ s3://llm-demo-george/models/rlhf:latest

upload: rlhf-checkpoint-local/adapter_config.json to s3://llm-demo-george/models/rlhf:latest/adapter_config.json
upload: rlhf-checkpoint-local/special_tokens_map.json to s3://llm-demo-george/models/rlhf:latest/special_tokens_map.json
upload: rlhf-checkpoint-local/tokenizer_config.json to s3://llm-demo-george/models/rlhf:latest/tokenizer_config.json
upload: rlhf-checkpoint-local/pytorch_model.bin to s3://llm-demo-george/models/rlhf:latest/pytorch_model.bin
upload: rlhf-checkpoint-local/tokenizer.json to s3://llm-demo-george/models/rlhf:latest/tokenizer.json
upload: rlhf-checkpoint-local/adapter_model.bin to s3://llm-demo-george/models/rlhf:latest/adapter_model.bin
