# Reinforcement Learning from human feedback

In [1]:
!pip install trl

Collecting trl
  Obtaining dependency information for trl from https://files.pythonhosted.org/packages/ce/b2/b0e9c7a15d666aebe83ed72b6a3bec869be88246ddf22d8953f3eee61e22/trl-0.7.2-py3-none-any.whl.metadata
  Downloading trl-0.7.2-py3-none-any.whl.metadata (10 kB)
Collecting tyro>=0.5.7 (from trl)
  Obtaining dependency information for tyro>=0.5.7 from https://files.pythonhosted.org/packages/f0/ee/72cb2647dc72ef6412e5cbca8bbbae7f757c872ca047e421a2d78d7b890f/tyro-0.5.12-py3-none-any.whl.metadata
  Downloading tyro-0.5.12-py3-none-any.whl.metadata (7.5 kB)
Collecting docstring-parser>=0.14.1 (from tyro>=0.5.7->trl)
  Downloading docstring_parser-0.15-py3-none-any.whl (36 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.7->trl)
  Obtaining dependency information for shtab>=1.5.6 from https://files.pythonhosted.org/packages/86/69/3a4873b36d65a1b8f4ee606f5a785b5babb9960385802de60d8455e2f8b6/shtab-1.6.4-py3-none-any.whl.metadata
  Downloading shtab-1.6.4-py3-none-any.whl.metadata (7.3 kB)
Download

In [1]:
from transformers import pipeline,AutoTokenizer,AutoModelForSequenceClassification,AutoModelForSeq2SeqLM,GenerationConfig
from datasets import load_dataset
from peft import PeftModel,PeftConfig,LoraConfig,TaskType

from trl import PPOTrainer,PPOConfig,AutoModelForSeq2SeqLMWithValueHead,create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

### Load FLAN-T5 Model, Prepare Reward Model and Toxicity Evaluator

In [2]:
# Checkpoint name of the base model and Hub identifier of the dialogue/summary dataset.
model_name = 'google/flan-t5-base'
huggingface_dataset_name = 'knkarthick/dialogsum'

# Load all splits; the bare expression on the last line displays the DatasetDict.
dataset_original = load_dataset(huggingface_dataset_name)
dataset_original

Found cached dataset csv (C:/Users/tusha/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
def build_dataset(model_name,dataset_name,input_min_text_length,input_max_text_length):
    """Prepare the prompt dataset: load, length-filter, tokenize, then split.

    Args:
        model_name: checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        dataset_name: dataset identifier passed to ``load_dataset``; only its
            ``train`` split is used.
        input_min_text_length: exclusive lower bound on ``len(dialogue)``.
        input_max_text_length: inclusive upper bound on ``len(dialogue)``.

    Returns:
        The result of ``train_test_split``: an unshuffled 80/20 train/test
        split whose records gain ``input_ids`` (the encoded prompt) and
        ``query`` (that encoding decoded back to text).
    """
    def has_allowed_length(record):
        # Measured on the raw dialogue string, i.e. in characters rather than tokens.
        dialogue_length = len(record['dialogue'])
        return input_min_text_length < dialogue_length <= input_max_text_length

    filtered = load_dataset(dataset_name,split="train").filter(has_allowed_length, batched=False)
    tokenizer = AutoTokenizer.from_pretrained(model_name,device_map="auto")

    def tokenize(sample):
        # Wrap the dialogue in the summarization instruction before encoding it.
        prompt = f"""
Summarize the following conversation. 
{sample['dialogue']}

summary:
"""
        token_ids = tokenizer.encode(prompt)
        sample['input_ids'] = token_ids
        sample['query'] = tokenizer.decode(token_ids)
        return sample

    tokenized = filtered.map(tokenize,batched=False)
    tokenized.set_format(type='torch')

    # shuffle=False keeps the original record order in both partitions.
    return tokenized.train_test_split(test_size=0.2,shuffle=False,seed=42)

# Keep dialogues whose length is > 200 and <= 1000 characters, then show the resulting train/test splits.
dataset = build_dataset(model_name = model_name,dataset_name= huggingface_dataset_name,input_min_text_length=200,input_max_text_length=1000)
print(dataset)

Found cached dataset csv (C:/Users/tusha/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at C:\Users\tusha\.cache\huggingface\datasets\knkarthick___csv\knkarthick--dialogsum-cd36827d3490488d\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-d904df43a047dc8b.arrow
Loading cached processed dataset at C:\Users\tusha\.cache\huggingface\datasets\knkarthick___csv\knkarthick--dialogsum-cd36827d3490488d\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-722906dcf7d23a9b.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


In [4]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of ``model``'s parameters are trainable.

    Despite its name this function prints nothing; it returns the report as a
    string so the caller can embed it in its own message.

    Args:
        model: any object exposing ``named_parameters()`` whose parameters
            provide ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage.

    Raises:
        ZeroDivisionError: if the model has no parameters at all.
    """
    # Collect (element count, trainable?) once, then aggregate twice.
    param_sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_model_params = sum(size for size, _ in param_sizes)
    trainable_model_params = sum(size for size, is_trainable in param_sizes if is_trainable)
    return f" trainable model parameters: {trainable_model_params}\n all model parameters: {all_model_params}\n percentage of trainable model: {trainable_model_params*100/all_model_params}"

In [None]:
# LoRA settings: rank 32, alpha 32, 5% dropout, no bias terms, applied to the modules named 'q' and 'v'.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=['q','v'],
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Base seq2seq model, loaded in bfloat16.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.bfloat16)

# Attach the adapter stored in the local checkpoint directory to the base model.
# is_trainable=True is passed so the loaded adapter can be trained further.
# NOTE(review): confirm that PeftModel.from_pretrained actually consumes the `lora_config`
# keyword; the adapter configuration may instead be read from the checkpoint directory.
peft_model = PeftModel.from_pretrained(model,
                                       './peft-dialogue-summary-checkpoint-from-s3/',
                                       lora_config=lora_config,
                                       torch_dtype=torch.bfloat16,
                                       device_map="auto",
                                       is_trainable=True)
print(f'PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}\n')

In [None]:
# Wrap the PEFT model in TRL's value-head model class; this is the policy that PPO updates.
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,torch_dtype=torch.bfloat16,is_trainable=True)
# Message typo fixed: "VlaueHead" -> "ValueHead".
print(f'PPO model parameters to be updated (ValueHead + 769 params) : \n {print_number_of_trainable_model_parameters(ppo_model)}')
# Show the structure of the added value head.
print(ppo_model.v_head)

In [None]:
# Build the reference model from the PPO model; it is later handed to PPOTrainer as `ref_model`.
ref_model = create_reference_model(ppo_model)
# The printed counts show how many of the reference model's parameters are trainable.
print(f'Reference model parameters to be updated :\n{print_number_of_trainable_model_parameters(ref_model)}')

## Prepare Reward model
We will use Facebook's RoBERTa-based hate speech model as the reward model. This model outputs logits and then predicts probabilities across two classes: `nothate` and `hate`. The logit of the `nothate` output is taken as a positive reward. The model is then fine-tuned with PPO using those reward values.

In [5]:
# Hate-speech classifier used as the reward model, together with its tokenizer.
toxicity_model_name = 'facebook/roberta-hate-speech-dynabench-r4-target'
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name,device_map="auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name,device_map="auto")
# id2label maps class indices to label names; it fixes which logit is the 'nothate' one.
print(toxicity_model.config.id2label)

{0: 'nothate', 1: 'hate'}


In [6]:
# Sanity-check the reward model by hand on a harmless sentence.
non_toxic_text = 'i want to kiss you'
toxicity_input_ids = toxicity_tokenizer(non_toxic_text,return_tensors='pt').input_ids
logits = toxicity_model(input_ids = toxicity_input_ids).logits
print(f'logits [not hate,hate]: {logits.tolist()[0]}')
# Softmax over the class dimension turns the two logits into probabilities.
probabilities = logits.softmax(dim=1).tolist()[0]
print(f'probabilities [not hate,hate]: {probabilities}')
# Index 0 is the 'nothate' class (see the id2label mapping), so its raw logit is used as the reward.
not_hate_index = 0
nothate_reward = (logits[:,not_hate_index]).tolist()
print(f'reward (high): {nothate_reward}')

logits [not hate,hate]: [2.307953357696533, -2.0024046897888184]
probabilities [not hate,hate]: [0.9867492318153381, 0.013250798918306828]
reward (high): [2.307953357696533]


In [7]:
# Use CUDA device 0 when available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Same hate-speech model, wrapped in a pipeline so raw strings can be scored directly.
sentiment_pipe = pipeline("sentiment-analysis",
                          model=toxicity_model_name,
                          device=device)

# top_k=None returns a score for every label; "none" leaves the scores as raw logits
# (the output below matches the logits computed by hand in the previous cell).
reward_logits_kwargs = {
    "top_k":None,
    "function_to_apply":"none",
    "batch_size":16
}

# Same call, but with softmax applied so the scores are probabilities.
reward_probabilities_kwargs = {
    "top_k":None,
    "function_to_apply":"softmax",
    "batch_size":16
}

print("Reward model output for non-toxic text:")
print(sentiment_pipe(non_toxic_text,**reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text,**reward_probabilities_kwargs))

Reward model output for non-toxic text:
[{'label': 'nothate', 'score': 2.307953357696533}, {'label': 'hate', 'score': -2.0024046897888184}]
[{'label': 'nothate', 'score': 0.9867492318153381}, {'label': 'hate', 'score': 0.013250799849629402}]


# Evaluating the toxicity score (0-1)
A score closer to 1 means the text is more toxic.

In [8]:
# Load the `toxicity` measurement from `evaluate`, backed by the same hate-speech model.
# toxic_label="hate" names the class whose score is reported as the toxicity value.
toxicity_evaluator = evaluate.load("toxicity",
                                   toxicity_model_name,
                                   module_type="measurement",
                                   toxic_label="hate")

In [9]:
# Score the harmless sentence; compute() returns a dict whose 'toxicity' entry holds one score per prediction.
toxicity_score = toxicity_evaluator.compute(predictions=[non_toxic_text])
print("Toxicity score for non-toxic text: ",toxicity_score['toxicity'])

Toxicity score for non-toxic text:  [0.013250799849629402]


In [10]:
def evaluate_toxicity(model,toxicity_evaluator,tokenizer,dataset,num_samples):
    """Estimate the toxicity of summaries sampled from ``model``.

    For each of the first ``num_samples`` records of ``dataset`` the record's
    ``'query'`` prompt is tokenized, one completion of at most 100 new tokens
    is sampled, and ``toxicity_evaluator`` scores the text
    ``"<query> <completion>"``.

    Args:
        model: object exposing ``generate(input_ids=..., generation_config=...)``.
        toxicity_evaluator: object whose ``compute(predictions=[...])`` returns
            a dict with a ``'toxicity'`` list of scores.
        tokenizer: tokenizer used to encode the prompts and decode the generations.
        dataset: iterable of records that each contain a ``'query'`` string.
        num_samples: number of records to evaluate.

    Returns:
        Tuple ``(mean, std)`` of the collected toxicity scores.
    """
    max_new_tokens = 100
    # The generation settings do not depend on the sample, so build them once.
    # top_k is an integer option; 0 together with top_p=1.0 requests sampling
    # without top-k / nucleus truncation (see the GenerationConfig docs).
    generation_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                         top_k=0,
                                         top_p=1.0,
                                         do_sample=True)
    toxicities = []
    for i, sample in tqdm(enumerate(dataset)):
        # `>=` so that exactly `num_samples` records are scored; the previous
        # `>` comparison let one extra record through.
        if i >= num_samples:
            break
        input_text = sample['query']
        input_ids = tokenizer(input_text,return_tensors='pt',padding=True).input_ids
        response_token_ids = model.generate(input_ids=input_ids,generation_config=generation_config)
        generated_text = tokenizer.decode(response_token_ids[0],skip_special_tokens=True)
        # Score prompt and completion together, separated by a single space.
        toxicity_score=toxicity_evaluator.compute(predictions=[(input_text+" "+generated_text)])
        toxicities.extend(toxicity_score['toxicity'])

    mean = np.mean(toxicities)
    std = np.std(toxicities)
    return mean,std

In [None]:
# Tokenizer of the base model, shared by the evaluation and PPO cells below.
tokenizer = AutoTokenizer.from_pretrained(model_name,device_map="auto")
# Baseline toxicity of the reference (pre-PPO) model on the held-out split.
# The keyword must be `num_samples` to match evaluate_toxicity's signature;
# the previous `num_sample` spelling raised a TypeError.
mean_before_detoxification,std_before_detoxification = evaluate_toxicity(model=ref_model,
                                                                         toxicity_evaluator=toxicity_evaluator,
                                                                         tokenizer=tokenizer,
                                                                         dataset=dataset['test'],
                                                                         num_samples=10)
print(f'toxicity [mean,std] before detox: [{mean_before_detoxification},{std_before_detoxification}]')

## Perform fine tuning to detoxify the summaries
Optimize an RL policy against the reward model using Proximal Policy Optimization (PPO).

In [11]:
# PPO hyperparameters, collected here so they can be tuned in one place.
learning_rate = 1.41e-5
max_ppo_epochs = 1
mini_batch_size = 4
batch_size=16

# Trainer configuration built from the values above; it is passed to PPOTrainer later.
config = PPOConfig(
    model_name=model_name,
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

def collator(data):
    """Turn a list of per-sample dicts into one dict of per-key lists.

    The keys (and their order) are taken from the first sample, and every
    sample is expected to provide each of them.

    Args:
        data: non-empty list of dicts sharing the same keys.

    Returns:
        Dict mapping each key to the list of that key's values, in sample order.
    """
    first_sample = data[0]
    return {field: [sample[field] for sample in data] for field in first_sample}
# Quick demonstration of the collator on a single-sample batch.
test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

Collator input: [{'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}]
Collator output: {'key1': ['value1'], 'key2': ['value2'], 'key3': ['value3']}


In [None]:
# Bundle the policy, reference model, tokenizer, training split and collator into the trainer.
# Its `dataloader` attribute is what the training loop iterates over.
ppo_trainer = PPOTrainer(config=config, 
                         model=ppo_model, 
                         ref_model=ref_model, 
                         tokenizer=tokenizer, 
                         dataset=dataset["train"], 
                         data_collator=collator)

In [None]:
# Each response gets its own generation budget, drawn from LengthSampler(100, 400).
output_length_sampler = LengthSampler(100,400)

# Sampling settings shared by every generate() call (also reused by the comparison cell).
# "do_sample" was previously misspelled "so_sample", which is not a generation option.
generation_kwargs = {
    "min_length":5,
    "top_k":0,
    "top_p":1.0,
    "do_sample":True
}

# Ask the reward pipeline for the raw logits of every label.
reward_kwargs = {
    "top_k":None,
    "function_to_apply":"none",
    "batch_size":16
}

max_ppo_steps = 10

for step,batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Stop after a fixed number of PPO steps.
    if step >= max_ppo_steps:
        break
    prompt_tensors = batch['input_ids']

    # Generate one summary per prompt with the current policy.
    summary_tensors = []
    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()

        generation_kwargs['max_new_tokens'] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        # Keep at most the last `max_new_tokens` token ids of the generation.
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    batch['response'] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Pair each query with its response. The two lists must be passed to zip()
    # separately; the previous code indexed `batch` with a tuple key instead.
    query_response_pairs = [q+r for q,r in zip(batch['query'],batch['response'])]
    rewards = sentiment_pipe(query_response_pairs,**reward_kwargs)

    # The reward is the 'nothate' logit of each query/response pair.
    reward_tensors = [torch.tensor(reward[not_hate_index]['score']) for reward in rewards]

    # One PPO optimisation step on this batch, followed by logging.
    stats = ppo_trainer.step(prompt_tensors,summary_tensors,reward_tensors)
    ppo_trainer.log_stats(stats,batch,reward_tensors)

    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    # Separator line of 99 dashes (same output as the previous join over 100 empty strings).
    print('-' * 99)


In [None]:
# Repeat the toxicity evaluation on the same held-out split, now with the PPO-updated model.
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model, 
                                                                        toxicity_evaluator=toxicity_evaluator, 
                                                                        tokenizer=tokenizer, 
                                                                        dataset=dataset["test"], 
                                                                        num_samples=10)
print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

In [None]:
# Relative change of the toxicity statistics: (before - after) / before.
# Positive values mean the statistic decreased after PPO.
mean_improvement = (mean_before_detoxification - mean_after_detoxification) / mean_before_detoxification
std_improvement = (std_before_detoxification - std_after_detoxification) / std_before_detoxification

print(f'Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement*100:.2f}%')
print(f'std: {std_improvement*100:.2f}%')

In [None]:
# Number of test prompts to compare.
# NOTE(review): this rebinds the `batch_size` global that was used to build PPOConfig earlier;
# a distinct name would avoid surprises if earlier cells are re-run.
batch_size = 20
compare_results = {}

# Slicing the dataset yields a dict of columns for the first `batch_size` test records.
df_batch = dataset["test"][0:batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

summary_tensors_ref = []
summary_tensors = []

# Get response from ppo and base model.
for i in tqdm(range(batch_size)):
    # One sampled length budget per prompt, shared by both models.
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len
    
    # Reference model: keep at most the last `gen_len` generated token ids.
    summary = ref_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors_ref.append(summary)

    # PPO-updated model, same prompt and same settings.
    summary = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors.append(summary)

# Decode responses.
compare_results["response_before"] = [tokenizer.decode(summary_tensors_ref[i]) for i in range(batch_size)]
compare_results["response_after"] = [tokenizer.decode(summary_tensors[i]) for i in range(batch_size)]

# Sentiment analysis of query/response pairs before/after.
# The reward is the 'nothate' score of the concatenated query + response text.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [reward[not_hate_index]["score"] for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [reward[not_hate_index]["score"] for reward in rewards_after]

In [None]:
# Raise the column display width to 500 so long queries and responses are shown in the table.
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(compare_results)
# reward_diff > 0 means the 'nothate' score rose after PPO for that prompt.
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
# Largest improvements first; the bare expression on the last line displays the table.
df_compare_results_sorted = df_compare_results.sort_values(by=['reward_diff'], ascending=False).reset_index(drop=True)
df_compare_results_sorted