In [44]:
import torch
import time
import re
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from peft import LoraConfig, get_peft_model
# from math_verify import LatexExtractionConfig, parse, verify
from trl import GRPOConfig
from trl import GRPOTrainer
import pandas as pd

In [45]:
# Base checkpoint to fine-tune. Dtype and device placement are both left to
# the library's "auto" settings rather than being pinned here.
model_id = "Qwen/Qwen2-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)

In [46]:
# LoRA adapter settings for causal-LM fine-tuning: r=8, alpha=32, dropout=0.1,
# applied only to the modules named q_proj and v_proj.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

In [47]:
# Wrap the base model with the LoRA configuration. `model` is rebound to the
# wrapped model, so re-running this cell without reloading the base model
# would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)
# Print the trainable vs. total parameter counts as a sanity check.
model.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [48]:
def format_reward(completions, **kwargs):
    """Reward function that checks if the completion has a specific format.

    A completion earns 1.0 when its text starts with a ``<think>...</think>``
    section, optionally followed by whitespace, and ends with an
    ``<answer>...</answer>`` section. Anything else earns 0.0.

    Args:
        completions: Sequence of completions; the generated text of each one
            is read from ``completion[0]["content"]``.
        **kwargs: Additional keyword arguments supplied by the caller; unused.

    Returns:
        list[float]: One reward (1.0 or 0.0) per completion, in input order.
    """
    # re.DOTALL lets "." match newlines, so reasoning or answers that span
    # several lines are still recognised as well-formed.
    pattern = re.compile(r"^<think>.*?</think>\s*<answer>.*?</answer>$", re.DOTALL)
    return [
        1.0 if pattern.match(completion[0]["content"]) else 0.0
        for completion in completions
    ]

In [49]:
def accuracy_reward(completions, **kwargs):
    """Score each completion against its ground-truth solution.

    NOTE(review): ``parse``, ``verify`` and ``LatexExtractionConfig`` are not
    bound by any active import in this notebook (the ``math_verify`` import is
    commented out), so calling this function as-is raises ``NameError``.

    Args:
        completions: Sequence of completions; the generated text of each one
            is read from ``completion[0]["content"]``.
        **kwargs: Must contain ``"solution"``, an iterable of ground-truth
            strings paired positionally with ``completions``.

    Returns:
        list[float]: One reward per (completion, solution) pair. When the
        solution yields no parsed result the reward is 1.0; otherwise it is
        ``float(verify(...))``, or 0.0 if verification raises.
    """
    references = kwargs["solution"]
    texts = [item[0]["content"] for item in completions]

    scores = []
    for text, reference in zip(texts, references):
        gold = parse(reference, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()])
        predicted = parse(text, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()])

        # Nothing could be extracted from the reference: the sample is given
        # full reward rather than being penalised.
        if len(gold) == 0:
            scores.append(1.0)
            continue

        try:
            score = float(verify(predicted, gold))
        except Exception:
            # A failed verification counts as an incorrect answer.
            score = 0.0
        scores.append(score)
    return scores

In [50]:
# GRPO training configuration: a short, single-epoch run with small prompt and
# completion budgets, logging to TensorBoard and checkpointing every 10 steps.
# NOTE(review): push_to_hub=True presumably requires Hub credentials to be
# configured in the environment -- confirm before running.
training_args = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO-test",
    learning_rate=1e-5,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    bf16=True,
    # Parameters that control the data preprocessing
    max_completion_length=64,  # default: 256
    num_generations=4,  # default: 8
    max_prompt_length=128,  # default: 512
    # Parameters related to reporting and saving
    report_to=["tensorboard"],
    logging_steps=10,
    push_to_hub=True,
    save_strategy="steps",
    save_steps=10,
)

In [58]:
# Read the puzzles from the CSV and keep at most the first 500 rows.
train_dataset = pd.read_csv("../data/connections.csv").iloc[:500]

In [59]:
# Keep only the three columns used downstream, then display the frame.
# `train_dataset` is rebound here, so the other loaded columns are dropped
# from this point on.
train_dataset = train_dataset[["question", "answers", "groups"]]
train_dataset

Unnamed: 0,question,answers,groups
0,"RETURN, LEVEL, JAZZ, BUCKS, SHIFT, OPTION, HEA...","[['HAIL', 'RAIN', 'SLEET', 'SNOW'], ['BUCKS', ...","['WET WEATHER', 'NBA TEAMS', 'KEYBOARD KEYS', ..."
1,"SNEAKER, MILE, US, ARE, LEAGUE, BOOT, PUMP, YA...","[['BOOT', 'LOAFER', 'PUMP', 'SNEAKER'], ['FOOT...","['FOOTWEAR', 'UNITS OF LENGTH', 'MAGAZINES', '..."
2,"AMIGO, LAB, GOBBLE, PIT, KING, TENOR, CHEEK, P...","[['CHEEK', 'EYE', 'MOUTH', 'NOSE'], ['CHOW', '...","['FACIAL FEATURES', 'SYNONYMS FOR EAT', 'DOG B..."
3,"CATS, SUPER, SWEEP, MOP, ADIDAS, BAT, VACUUM, ...","[['ADIDAS', 'NIKE', 'PUMA', 'REEBOK'], ['CABAR...","['SNEAKER BRANDS', 'MUSICALS BEGINNING WITH “C..."
4,"TARTAR, GLUM, RELISH, SCARLET, MAYO, BLUE, PLU...","[['HULU', 'NETFLIX', 'PEACOCK', 'PRIME'], ['KE...","['STREAMING SERVICES', 'CONDIMENTS', 'SYNONYMS..."
...,...,...,...
495,"INSPIRE, SOAP, METROID, CANDLE, LOTION, INCENS...","[['GENERATE', 'INSPIRE', 'PROMPT', 'PROVOKE'],...","['BRING ABOUT', 'THINGS THAT ARE OFTEN SCENTED..."
496,"ROW, ARTICLE, CLATTER, FEATURE, PADDLE, SEW, C...","[['ARTICLE', 'COLUMN', 'FEATURE', 'STORY'], ['...","['BIT OF NEWSPAPER WRITING', 'NOISY DISTURBANC..."
497,"GRAB, KIDNEY, DRIVE, NAVY, MUNG, NEUTRAL, CAR,...","[['KIDNEY', 'MUNG', 'NAVY', 'PINTO'], ['DRAW',...","['KINDS OF BEANS', 'ATTRACT', 'AUTOMATIC GEAR ..."
498,"HAUTE, HIGH-END, PIONEER, VOYAGER, TAG, MARCO ...","[['DESIGNER', 'HAUTE', 'HIGH-END', 'LUXURY'], ...","['BRAND-NAME', 'DEVISE', 'NASA SPACECRAFT', 'G..."


In [56]:
# Train with GRPO using only the format reward; accuracy_reward is defined in
# this notebook but is not registered here.
# NOTE(review): the recorded run of this cell ended with `KeyError: 42`.
# train_dataset is still a pandas DataFrame at this point, so integer row
# lookups are presumably being interpreted as column labels. The trainer
# likely expects a `datasets.Dataset` exposing a "prompt" column -- confirm
# against the TRL documentation and convert the frame before training.
trainer = GRPOTrainer(
    model=model, reward_funcs=[format_reward], args=training_args, train_dataset=train_dataset)
trainer.train()

KeyError: 42

In [None]:
# Save the trained model to the configured output directory, then upload it.
# NOTE(review): `dataset_id` is not assigned in any cell visible in this
# notebook; unless it is defined elsewhere this line raises NameError.
trainer.save_model(training_args.output_dir)
trainer.push_to_hub(dataset_name=dataset_id)

In [None]:
# Load a fine-tuned checkpoint and its tokenizer for inference.
# NOTE(review): this rebinds `model_id` (previously the base checkpoint), and
# the repository name differs from the training output_dir
# ("Qwen2-0.5B-GRPO-test") -- confirm this is the intended checkpoint.
model_id = "sergiopaniego/Qwen2-0.5B-GRPO"
trained_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)
trained_tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:

def generate_with_reasoning(prompt):
    """Run the trained model on a chat-style prompt and report basic stats.

    Relies on the notebook-level ``trained_model`` and ``trained_tokenizer``.

    Args:
        prompt: Iterable of message mappings; the ``"content"`` values are
            joined with single spaces to form the text fed to the model.

    Returns:
        tuple: ``(generated_text, inference_duration, num_generated_tokens)``:
        the decoded first output sequence, the wall-clock seconds measured
        around the generate call, and the output length minus the input
        length in tokens.
    """
    # Flatten the messages into one plain-text prompt.
    flat_prompt = " ".join(message["content"] for message in prompt)

    # Encode the prompt and place the tensors on the model's device.
    encoded = trained_tokenizer(flat_prompt, return_tensors="pt").to(trained_model.device)

    # Time the generation call; gradients are not needed for inference.
    started = time.time()
    with torch.no_grad():
        sequences = trained_model.generate(**encoded, max_length=500)
    finished = time.time()

    text = trained_tokenizer.decode(sequences[0], skip_special_tokens=True)

    # Tokens produced beyond the prompt itself.
    prompt_len = encoded["input_ids"].shape[1]
    new_tokens = sequences.shape[1] - prompt_len

    return text, finished - started, new_tokens