In [1]:
%pip install datasets transformers bitsandbytes peft evaluate

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_c

In [2]:
import torch
import pandas as pd

from datasets import Dataset

from transformers import GPT2LMHeadModel, GPT2Tokenizer, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model, TaskType

from torch.optim import AdamW

import torch.nn.functional as F

from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

from tqdm import tqdm

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

Using device: cuda


In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True, llm_int8_threshold=6.0, llm_int8_has_fp16_weight=False
)

base_model = GPT2LMHeadModel.from_pretrained(
    "gpt2", quantization_config=quantization_config
)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn"],
    bias="none",
)

base_model = get_peft_model(base_model, lora_config)
base_model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


In [6]:
train_dataset_clean = pd.read_csv("./train_dataset_clean.csv").to_dict("records")
eval_dataset_clean = pd.read_csv("./eval_dataset_clean.csv").to_dict("records")
test_dataset_clean = pd.read_csv("./test_dataset_clean.csv").to_dict("records")

print(
    f"Loaded {len(train_dataset_clean)} train samples, {len(eval_dataset_clean)} eval samples, and {len(test_dataset_clean)} test samples"
)

Loaded 142510 train samples, 15835 eval samples, and 8408 test samples


In [7]:
ref_model = GPT2LMHeadModel.from_pretrained(
    "gpt2", device_map="auto", quantization_config=quantization_config
)

for param in ref_model.parameters():
    param.requires_grad = False

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# base_model.to(device)
# ref_model.to(device)

base_model.train()
ref_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear8bitLt(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear8bitLt(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear8bitLt(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear8bitLt(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwi

In [9]:
def get_tokenized(input, max_length=256):
    return tokenizer(
        input, return_tensors="pt", padding=True, truncation=True, max_length=max_length
    )


preference_dataset_train = Dataset.from_list(train_dataset_clean[:16000])

prompts_train = get_tokenized([each["prompt"] for each in preference_dataset_train])
choosen_train = get_tokenized([each["chosen"] for each in preference_dataset_train])
rejected_train = get_tokenized([each["rejected"] for each in preference_dataset_train])

preference_dataset_eval = Dataset.from_list(eval_dataset_clean[:1000])

prompts_eval = get_tokenized([each["prompt"] for each in preference_dataset_eval])
choosen_eval = get_tokenized([each["chosen"] for each in preference_dataset_eval])
rejected_eval = get_tokenized([each["rejected"] for each in preference_dataset_eval])

In [10]:
batch_size = 8
grad_accum_steps = 4

effective_batch_size = batch_size * grad_accum_steps

steps_per_epoch = len(preference_dataset_train) // effective_batch_size

epochs = 2

max_steps = steps_per_epoch * epochs

optimizer = AdamW(base_model.parameters(), lr=1e-5)

warmup_steps = max_steps // 20

warmup_scheduler = LinearLR(
    optimizer,
    start_factor=0.01,
    end_factor=1.0,
    total_iters=warmup_steps,
)

cosine_scheduler = CosineAnnealingLR(
    optimizer,
    T_max=max_steps - warmup_steps,
    eta_min=1e-7,
)

scheduler = SequentialLR(
    optimizer,
    schedulers=[warmup_scheduler, cosine_scheduler],
    milestones=[warmup_steps],
)

In [11]:
def get_logprobs(
    model,
    prompt_ids,
    completion_ids,
    prompt_mask=None,
    completion_mask=None,
    requires_grad=False,
):
    full_ids = torch.cat([prompt_ids, completion_ids], dim=-1)

    if prompt_mask is not None and completion_mask is not None:
        full_attention_mask = torch.cat([prompt_mask, completion_mask], dim=-1)
    else:
        full_attention_mask = None

    if not requires_grad:
        with torch.no_grad():
            outputs = model(full_ids, attention_mask=full_attention_mask)
    else:
        outputs = model(full_ids, attention_mask=full_attention_mask)

    logits = outputs.logits[:, prompt_ids.shape[1] - 1 : -1, :]

    log_probs = F.log_softmax(logits, dim=-1)

    target_ids = completion_ids

    gathered_log_probs = log_probs.gather(
        dim=-1, index=target_ids.unsqueeze(-1)
    ).squeeze(-1)

    if completion_mask is not None:
        gathered_log_probs = gathered_log_probs * completion_mask

    return gathered_log_probs.sum(dim=-1)

In [12]:
def dpo_loss(
    policy_model,
    ref_model,
    prompt_ids,
    chosen_ids,
    rejected_ids,
    prompt_mask=None,
    chosen_mask=None,
    rejected_mask=None,
    beta=0.1,
):
    policy_chosen_log_probs = get_logprobs(
        policy_model,
        prompt_ids,
        chosen_ids,
        prompt_mask=prompt_mask,
        completion_mask=chosen_mask,
        requires_grad=True,
    )

    policy_rejected_log_probs = get_logprobs(
        policy_model,
        prompt_ids,
        rejected_ids,
        prompt_mask=prompt_mask,
        completion_mask=rejected_mask,
        requires_grad=True,
    )

    ref_chosen_log_probs = get_logprobs(
        ref_model,
        prompt_ids,
        chosen_ids,
        prompt_mask=prompt_mask,
        completion_mask=chosen_mask,
    )

    ref_rejected_log_probs = get_logprobs(
        ref_model,
        prompt_ids,
        rejected_ids,
        prompt_mask=prompt_mask,
        completion_mask=rejected_mask,
    )

    chosen_rewards = policy_chosen_log_probs - ref_chosen_log_probs
    rejected_rewards = policy_rejected_log_probs - ref_rejected_log_probs

    loss = -F.logsigmoid(beta * (chosen_rewards - rejected_rewards))

    return loss.mean()

In [13]:
step = 0

for epoch in range(epochs):
    epoch_loss = 0
    num_batches_processed = 0

    for i in range(0, len(preference_dataset_train), batch_size):
        batch_end = min(i + batch_size, len(preference_dataset_train))

        prompts_ids = prompts_train["input_ids"][i:batch_end].to(device)
        chosen_ids = choosen_train["input_ids"][i:batch_end].to(device)
        rejected_ids = rejected_train["input_ids"][i:batch_end].to(device)

        prompts_mask = prompts_train["attention_mask"][i:batch_end].to(device)
        chosen_mask = choosen_train["attention_mask"][i:batch_end].to(device)
        rejected_mask = rejected_train["attention_mask"][i:batch_end].to(device)

        loss = dpo_loss(
            base_model,
            ref_model,
            prompts_ids,
            chosen_ids,
            rejected_ids,
            prompt_mask=prompts_mask,
            chosen_mask=chosen_mask,
            rejected_mask=rejected_mask,
        )

        loss = loss / grad_accum_steps

        loss.backward()

        torch.nn.utils.clip_grad_norm_(base_model.parameters(), max_norm=1.0)

        epoch_loss += loss.item()
        num_batches_processed += 1

        if (num_batches_processed % grad_accum_steps == 0) or (
            batch_end >= len(preference_dataset_train)
        ):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            step += 1

            print(
                f"Step {step}, Epoch {epoch + 1}, Avg Loss: {epoch_loss / num_batches_processed * grad_accum_steps:.4f}"
            )

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        del prompts_ids, chosen_ids, rejected_ids
        del prompts_mask, chosen_mask, rejected_mask
        del loss

Step 1, Epoch 1, Avg Loss: 1.0835
Step 2, Epoch 1, Avg Loss: 1.1006
Step 3, Epoch 1, Avg Loss: 1.4672
Step 4, Epoch 1, Avg Loss: 1.9522
Step 5, Epoch 1, Avg Loss: 1.8988
Step 6, Epoch 1, Avg Loss: 1.9544
Step 7, Epoch 1, Avg Loss: 1.8677
Step 8, Epoch 1, Avg Loss: 1.8076
Step 9, Epoch 1, Avg Loss: 1.7889
Step 10, Epoch 1, Avg Loss: 1.7862
Step 11, Epoch 1, Avg Loss: 1.8245
Step 12, Epoch 1, Avg Loss: 1.7980
Step 13, Epoch 1, Avg Loss: 1.7764
Step 14, Epoch 1, Avg Loss: 1.7389
Step 15, Epoch 1, Avg Loss: 1.7311
Step 16, Epoch 1, Avg Loss: 1.7257
Step 17, Epoch 1, Avg Loss: 1.7990
Step 18, Epoch 1, Avg Loss: 1.8464
Step 19, Epoch 1, Avg Loss: 1.8981
Step 20, Epoch 1, Avg Loss: 1.9023
Step 21, Epoch 1, Avg Loss: 1.8700
Step 22, Epoch 1, Avg Loss: 1.9590
Step 23, Epoch 1, Avg Loss: 2.0069
Step 24, Epoch 1, Avg Loss: 1.9931
Step 25, Epoch 1, Avg Loss: 1.9839
Step 26, Epoch 1, Avg Loss: 2.0041
Step 27, Epoch 1, Avg Loss: 1.9704
Step 28, Epoch 1, Avg Loss: 1.9585
Step 29, Epoch 1, Avg Loss: 1

In [14]:
torch.save(base_model.state_dict(), "best_model.pth")

In [15]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
# base_model.load_state_dict(torch.load("best_model.pth", map_location=device))

In [22]:
def evaluate_preference_alignment(model):
    model.eval()

    wins, losses, ties = 0, 0, 0

    batch_size = 8

    for i in tqdm(range(0, len(test_dataset_clean[:1024]), batch_size)):
        batch = test_dataset_clean[i : i + batch_size]

        prompts = [sample["prompt"] for sample in batch]
        chosen = [sample["chosen"] for sample in batch]
        rejected = [sample["rejected"] for sample in batch]

        prompt_inputs = get_tokenized(prompts)
        chosen_inputs = get_tokenized(chosen)
        rejected_inputs = get_tokenized(rejected)

        prompt_ids = prompt_inputs["input_ids"].to(device)
        chosen_ids = chosen_inputs["input_ids"].to(device)
        rejected_ids = rejected_inputs["input_ids"].to(device)

        prompt_mask = prompt_inputs["attention_mask"].to(device)
        chosen_mask = chosen_inputs["attention_mask"].to(device)
        rejected_mask = rejected_inputs["attention_mask"].to(device)

        with torch.no_grad():
            chosen_logprobs = get_logprobs(
                model,
                prompt_ids,
                chosen_ids,
                prompt_mask=prompt_mask,
                completion_mask=chosen_mask,
            )

            rejected_logprobs = get_logprobs(
                model,
                prompt_ids,
                rejected_ids,
                prompt_mask=prompt_mask,
                completion_mask=rejected_mask,
            )

        for chosen_lp, rejected_lp in zip(chosen_logprobs, rejected_logprobs):
            if chosen_lp > rejected_lp:
                wins += 1
            elif chosen_lp < rejected_lp:
                losses += 1
            else:
                ties += 1

    total = wins + losses + ties

    print(f"\nResults on HH-RLHF ({total} samples):")
    print(f"Wins     : {wins} ({wins / total:.2%})")
    print(f"Losses   : {losses} ({losses / total:.2%})")
    print(f"Ties     : {ties} ({ties / total:.2%})")

In [33]:
evaluate_preference_alignment(ref_model)

100%|██████████| 128/128 [00:28<00:00,  4.51it/s]


Results on HH-RLHF (1024 samples):
Wins     : 563 (54.98%)
Losses   : 461 (45.02%)
Ties     : 0 (0.00%)





In [34]:
evaluate_preference_alignment(base_model)

100%|██████████| 128/128 [00:30<00:00,  4.19it/s]


Results on HH-RLHF (1024 samples):
Wins     : 568 (55.47%)
Losses   : 456 (44.53%)
Ties     : 0 (0.00%)



