## installs

In [None]:
# pip install -U git+https://github.com/huggingface/trl

In [None]:
# !pip install accelerate peft

## imports and downloads

In [3]:
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, TaskType
from transformers import AutoModelForCausalLM
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from trl import (
    DataCollatorForCompletionOnlyLM,
    DPOConfig,
    DPOTrainer,
    PPOConfig,
    PPOTrainer,
    RewardConfig,
    SFTConfig,
    SFTTrainer,
)

In [4]:
# Load the gpt2-medium tokenizer and causal language model.
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')
model = AutoModelForCausalLM.from_pretrained('gpt2-medium')

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# LoRA adapter settings reused by the causal-LM training stages (SFT and DPO).
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
)

In [7]:
# Fetch the Anthropic/hh-rlhf preference dataset (train and test splits).
dataset_anthropic = load_dataset(path='Anthropic/hh-rlhf')

Downloading readme:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/743k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/875k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

## Anthropic SFT

In [39]:
# Name the two splits; the SFT cell slices them down before training.
train_data = dataset_anthropic['train']
val_data = dataset_anthropic['test']

In [40]:
# Completion-only collator keyed on the "Human:" / "Assistant:" turn markers.
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template="Assistant:",
    instruction_template="Human:",
    mlm=False,
)



In [46]:
# Supervised fine-tuning arguments. `dataset_text_field` and `max_seq_length`
# are set on SFTConfig: passing them to SFTTrainer directly triggers the
# "Deprecated positional argument(s) used in SFTTrainer" warning.
training_args = SFTConfig(
    output_dir='./output',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir='./logs',
    save_steps=1000,
    dataset_text_field='chosen',
    max_seq_length=512,
)

# Fine-tune the causal LM with a LoRA adapter on the "chosen" transcripts,
# using a 1000-example training slice and a 100-example evaluation slice.
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data.select(range(1000)),
    eval_dataset=val_data.select(range(100)),
    peft_config=lora_config,
    data_collator=collator,
)

# Run supervised fine-tuning; the returned TrainOutput is the cell's output.
sft_trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss
500,2.5041


TrainOutput(global_step=750, training_loss=2.4433046875, metrics={'train_runtime': 489.3781, 'train_samples_per_second': 6.13, 'train_steps_per_second': 1.533, 'total_flos': 1548526946304000.0, 'train_loss': 2.4433046875, 'epoch': 3.0})

## Reward Model

In [5]:
from transformers import TrainingArguments
from trl import RewardTrainer
from datasets import DatasetDict

In [8]:
# Keep the first 1000 training and 100 test examples so the later stages stay small.
new_train = dataset_anthropic['train'].select(range(1000))
new_test = dataset_anthropic['test'].select(range(100))

# Re-wrap the reduced splits under the original split names.
dataset_anthropic = DatasetDict(train=new_train, test=new_test)

In [9]:
def preprocess_function(examples):
    """Tokenize a batch of chosen/rejected text pairs.

    Args:
        examples: Mapping with parallel "chosen" and "rejected" sequences of
            strings.

    Returns:
        Dict with "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected", each holding one
        entry per input pair, in input order.
    """
    chosen_encodings = []
    rejected_encodings = []
    # Each text is encoded on its own, chosen first and then rejected.
    for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"]):
        chosen_encodings.append(tokenizer(chosen_text))
        rejected_encodings.append(tokenizer(rejected_text))

    return {
        "input_ids_chosen": [enc["input_ids"] for enc in chosen_encodings],
        "attention_mask_chosen": [enc["attention_mask"] for enc in chosen_encodings],
        "input_ids_rejected": [enc["input_ids"] for enc in rejected_encodings],
        "attention_mask_rejected": [enc["attention_mask"] for enc in rejected_encodings],
    }

# Tokenize both splits in batches, then drop every pair in which either
# sequence is longer than 512 tokens.
dataset_anthropic = dataset_anthropic.map(preprocess_function, batched=True).filter(
    lambda pair: max(len(pair['input_ids_chosen']), len(pair['input_ids_rejected'])) <= 512
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
train_data, val_data = dataset_anthropic['train'], dataset_anthropic['test']

# A reward model scores a whole sequence with one scalar, so it needs a
# sequence-classification head with a single label, not the causal-LM `model`.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    'gpt2-medium',
    num_labels=1,
)
# Tell the classifier which token id is padding so padded batches can be scored.
reward_model.config.pad_token_id = tokenizer.pad_token_id

# LoRA adapter for the sequence-classification reward model.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1
)

# RewardConfig is a TrainingArguments subclass carrying the reward-specific
# options. max_length matches the 512-token filter applied to the dataset, and
# the paired *_chosen / *_rejected columns must not be pruned as "unused".
training_args = RewardConfig(
    output_dir='outputs',
    max_length=512,
    remove_unused_columns=False,
)

trainer = RewardTrainer(
    model=reward_model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=peft_config,
)

trainer.train()
# Trainer exposes save_model(); save_pretrained() is a model method, not a
# trainer method.
trainer.save_model('gpt2-medium_reward')

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## PPO

In [None]:
# PPO settings for the gpt2-medium policy.
config = PPOConfig(learning_rate=1.41e-5, model_name='gpt2-medium')

In [None]:
# PPOTrainer's `dataset` argument takes a single Dataset, so the train split
# is passed rather than the whole DatasetDict.
# NOTE(review): confirm against the installed TRL version that this
# config/dataset/tokenizer PPOTrainer API (a) accepts a plain causal LM instead
# of a value-head wrapper and (b) exposes `train()` / `save_model()`. If not,
# this cell needs an explicit generate -> reward -> `step()` loop.
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset_anthropic['train'],
    tokenizer=tokenizer,
)

ppo_trainer.train()
# Save under a PPO-specific directory; 'gpt2-medium_dpo' is the DPO stage's
# output and would be overwritten by it.
ppo_trainer.save_model('gpt2-medium_ppo')

## DPO

In [None]:
# Start again from the raw preference pairs for the DPO stage.
dataset_anthropic = load_dataset('Anthropic/hh-rlhf')

# Keep the first 1000 training and 100 test examples.
new_train = dataset_anthropic['train'].select(range(1000))
new_test = dataset_anthropic['test'].select(range(100))

# Re-wrap the reduced splits under the original split names.
dataset_anthropic = DatasetDict(train=new_train, test=new_test)

def process(row):
    """Split a raw preference pair into prompt, chosen and rejected parts.

    The "chosen" and "rejected" fields hold whole transcripts as plain strings
    (not lists of chat messages), so the shared conversation prefix is cut at
    the last "\\n\\nAssistant:" marker instead of going through a chat template.

    Args:
        row: Mapping with string fields "chosen" and "rejected"; both are
            assumed to share the same conversation up to the final assistant
            turn.

    Returns:
        A new dict containing every field of `row`, with "prompt" set to the
        text up to and including the last "\\n\\nAssistant:" of "chosen", and
        "chosen" / "rejected" reduced to the text that follows that prompt.
        The input mapping is not modified.

    Raises:
        ValueError: If "chosen" contains no "\\n\\nAssistant:" marker.
    """
    marker = "\n\nAssistant:"
    marker_start = row['chosen'].rfind(marker)
    if marker_start == -1:
        raise ValueError("'chosen' does not contain the '\\n\\nAssistant:' marker")

    prompt = row['chosen'][:marker_start + len(marker)]

    result = dict(row)
    result['prompt'] = prompt
    # Both transcripts start with the same prompt, so slicing by its length
    # leaves only the final assistant response of each.
    result['chosen'] = row['chosen'][len(prompt):]
    result['rejected'] = row['rejected'][len(prompt):]
    return result

# Apply `process` to every row of both splits, bypassing any cached result.
dataset_anthropic = dataset_anthropic.map(
    function=process,
    load_from_cache_file=False,
)

In [None]:
# DPOTrainer is configured through DPOConfig rather than plain TrainingArguments.
dpo_args = DPOConfig(output_dir='outputs')

dpo_trainer = DPOTrainer(
    model=model,
    # No separate reference model is defined in this notebook. With a
    # peft_config the trainer can use the adapter-free weights as the reference.
    ref_model=None,
    args=dpo_args,
    # Use the splits prepared in the DPO section, not the tokenized
    # `train_data` / `val_data` left over from the reward-model section.
    train_dataset=dataset_anthropic['train'],
    eval_dataset=dataset_anthropic['test'],
    tokenizer=tokenizer,
    peft_config=lora_config,
)

dpo_trainer.train()
dpo_trainer.save_model('gpt2-medium_dpo')