In [None]:
from unsloth import FastModel
import torch
from datasets import load_dataset
import re
from trl import GRPOConfig, GRPOTrainer
from transformers import TextStreamer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from rob_rew import RobertaRewardModel

In [None]:
# End-to-end GRPO fine-tuning script: build the reward model, load the base
# LLM and tokenizer, wrap the reward model in a reward function, train on a
# small list of prompts, and save the result. Everything is bound at top level
# so later cells can keep using `model`, `tokenizer`, `stats`, `trainer`, etc.
#
# NOTE(review): `GRPORewardFunction` and `CustomGRPOTrainer` are neither
# imported nor defined in the cells visible here; they must already be in
# scope (an earlier cell or another module) before this cell runs -- TODO
# confirm where they come from and import them explicitly.

# 1. Load your reward model
# Instantiated from the local "./roberta-reward-model" path; placed on the GPU
# when CUDA is available, otherwise on the CPU.
reward_model = RobertaRewardModel(
    model_path="./roberta-reward-model",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# 2. Prepare a dataset of prompts (questions)
# These strings are handed to `trainer.train(...)` in step 6.
questions = [
    "What are the main causes of climate change?",
    "How does the immune system work?",
    "What are the principles of quantum computing?",
    "Explain the process of photosynthesis.",
    "What are the key features of machine learning algorithms?",
]

# 3. Load the base LLM to be fine-tuned with GRPO
model_name = "gpt2"  # or any other model you prefer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure we have a pad token: fall back to the EOS token when the tokenizer
# does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4. Create a reward function wrapper around the reward model
reward_fn = GRPORewardFunction(
    reward_model=reward_model,
    reward_scaling=1.0,
    normalize_rewards=True,
)

# 5. Initialize the trainer
trainer = CustomGRPOTrainer(
    model=model,
    tokenizer=tokenizer,
    reward_fn=reward_fn,
    lr=2e-5,
    batch_size=4,
    epochs=3,
)

# 6. Train the model; `stats` holds whatever `trainer.train` returns.
stats = trainer.train(questions, num_iterations=5)

# 7. Save the fine-tuned model
trainer.save_model("./grpo_finetuned_model")

# A `return` statement is only legal inside a function; at cell/module top
# level it is a SyntaxError that prevents the whole cell from running.
# `model`, `tokenizer` and `stats` are already bound at top level above, so
# they stay available to subsequent cells without being returned.