In [1]:
# !unzip smollm2-reward-model-final.zip -d /content/smollm2-reward-model-final

In [2]:
# !pip uninstall -y trl
# !pip install -q git+https://github.com/huggingface/trl.git
# !pip install -q transformers accelerate peft datasets
# !pip install -U bitsandbytes

In [3]:
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, PeftModel
from datasets import load_dataset
from trl import GRPOTrainer, GRPOConfig

In [4]:
gc.collect()
torch.cuda.empty_cache()


In [5]:
config = GRPOConfig(
    output_dir="./smollm2-grpo-results",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    num_generations=4,
    max_prompt_length=256,
    max_completion_length=53,
    beta=0.04,
    num_train_epochs=5,
    seed=42,
    gradient_checkpointing=False,
    logging_steps=10,
)

In [6]:
model_id = "HuggingFaceTB/smollm2-135M-SFT-Only"
reward_model_path = "./smollm2-reward-model-final"

In [7]:
print("Loading Policy Model...")
policy_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="auto",
    attn_implementation="eager"
)

print("Loading Reward Model (Frozen)...")
rm_base = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=1,
    dtype=torch.float16,
    device_map="auto"
)
reward_model = PeftModel.from_pretrained(rm_base, reward_model_path)

reward_model.eval()
reward_model.requires_grad_(False)

Loading Policy Model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Loading Reward Model (Frozen)...


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/smollm2-135M-SFT-Only and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(49152, 576, padding_idx=2)
        (layers): ModuleList(
          (0-29): 30 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=576, out_features=576, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=576, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=576, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

In [9]:
dataset = load_dataset("Intel/orca_dpo_pairs", split="train[:2000]")
def format_grpo_prompt(example):
    messages = [
        {"role": "system", "content": example["system"]},
        {"role": "user", "content": example["question"]}
    ]
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return {"prompt": prompt_text}

dataset = dataset.map(format_grpo_prompt, remove_columns=dataset.column_names)

README.md:   0%|          | 0.00/196 [00:00<?, ?B/s]

orca_rlhf.jsonl:   0%|          | 0.00/36.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12859 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [11]:
trainer = GRPOTrainer(
    model=policy_model,
    reward_funcs=[reward_model],
    args=config,
    train_dataset=dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
    reward_processing_classes=[tokenizer]
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [12]:
print("Starting GRPO Training...")
print(f"Group Size: {config.num_generations}")
trainer.train()

trainer.save_model("./smollm2-grpo-final")
print("GRPO Training Complete!")

Starting GRPO Training...
Group Size: 4


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m27100046[0m ([33m27100046-lahore-university-of-management-sciences[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.0786
20,0.0651
30,0.0513
40,0.0658
50,0.0704
60,0.0648
70,0.0782
80,0.0858
90,0.0621
100,0.0751


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

GRPO Training Complete!


In [13]:
from google.colab import files
import shutil

shutil.make_archive('smollm2-grpo-final', 'zip', './smollm2-grpo-final')

files.download('smollm2-grpo-final.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>