In [95]:
import torch
import time
import re
import pandas as pd
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
# from math_verify import LatexExtractionConfig, parse, verify

In [96]:
def create_prompt(train_sample):
    SYSTEM_PROMPT = (
    "You are playing the NY Times Connections game. I will give you a set of 16 words, and I want you to provide 4 sets of exactly 4 words that are connected in some way. \
    I want you to group the words in such a way that each group has a common theme. Think about your answers carefully, as you will only have one chance to submit your answer. \
    Here is an example: If the words are: 'BUCKS, HAIL, JAZZ, SHIFT, LEVEL, MOM, SNOW, RACECAR, SLEET, TAB, KAYAK, RETURN, OPTION, NETS, RAIN, HEAT', \
    a possible answer could be: 'answer: [['HAIL', 'RAIN', 'SLEET', 'SNOW'], ['BUCKS', 'HEAT', 'JAZZ', 'NETS'], ['OPTION', 'RETURN', 'SHIFT', 'TAB'], ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']] and groups: ['WET WEATHER', 'NBA TEAMS', 'KEYBOARD KEYS', 'PALINDROMES']. \
    Give your answer strictly in the format (no other words): \
            'Answer: [[4 words of group1], [4 words of group2], [4 words of group3], [4 words of group4]]  \
            Group: [group1, group2, group3, group4]."
    )
    
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": train_sample["question"]},
        ],
    }

In [97]:
df = pd.read_csv("../data/connections.csv")
train_dataset = df.iloc[:500]
test_dataset = df.iloc[500:]

train_dataset["prompt"] = train_dataset.apply(create_prompt, axis=1)
test_dataset["prompt"] = test_dataset.apply(create_prompt, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataset["prompt"] = train_dataset.apply(create_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["prompt"] = test_dataset.apply(create_prompt, axis=1)


In [98]:
train_dataset = train_dataset.drop(columns=["question", "date", "groups"])
test_dataset = test_dataset.drop(columns=["question", "date", "groups"])

In [99]:
train_dataset.head()

Unnamed: 0,answers,prompt
0,"[['HAIL', 'RAIN', 'SLEET', 'SNOW'], ['BUCKS', ...","{'prompt': [{'role': 'system', 'content': 'You..."
1,"[['BOOT', 'LOAFER', 'PUMP', 'SNEAKER'], ['FOOT...","{'prompt': [{'role': 'system', 'content': 'You..."
2,"[['CHEEK', 'EYE', 'MOUTH', 'NOSE'], ['CHOW', '...","{'prompt': [{'role': 'system', 'content': 'You..."
3,"[['ADIDAS', 'NIKE', 'PUMA', 'REEBOK'], ['CABAR...","{'prompt': [{'role': 'system', 'content': 'You..."
4,"[['HULU', 'NETFLIX', 'PEACOCK', 'PRIME'], ['KE...","{'prompt': [{'role': 'system', 'content': 'You..."


In [None]:
class GRPOTrainer:
    def __init__(self, model_id="Qwen/Qwen2-0.5B-Instruct", output_dir="GRPO-test"):
        self.model_id = model_id
        self.output_dir = output_dir
        self.model = None
        self.tokenizer = None
        self.trainer = None
        self.training_args = None

    def load_model(self):
        """Loads the base model and applies LoRA fine-tuning."""
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype="auto",
            device_map="auto",
        )

        lora_config = LoraConfig(
            task_type="CAUSAL_LM",
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"],
        )

        self.model = get_peft_model(self.model, lora_config)

    def format_reward(self, completions, **kwargs):
        """Reward function that checks if the completion has a specific format."""
        pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
        completion_contents = [completion[0]["content"] for completion in completions]
        matches = [re.match(pattern, content) for content in completion_contents]
        return [1.0 if match else 0.0 for match in matches]

    def accuracy_reward(self, completions, **kwargs):
        """Reward function that checks if the completion matches the ground truth."""
        solutions = kwargs["solution"]
        completion_contents = [completion[0]["content"] for completion in completions]
        rewards = []
        # for content, solution in zip(completion_contents, solutions):
        #     gold_parsed = parse(solution, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()])
        #     answer_parsed = parse(content, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()])
        #     if len(gold_parsed) != 0:
        #         try:
        #             rewards.append(float(verify(answer_parsed, gold_parsed)))
        #         except Exception:
        #             rewards.append(0.0)
        #     else:
        #         rewards.append(1.0)
        return rewards

    def configure_training(self):
        """Configures the training arguments for GRPOTrainer."""
        self.training_args = GRPOConfig(
            output_dir=self.output_dir,
            learning_rate=1e-5,
            gradient_accumulation_steps=16,
            num_train_epochs=1,
            bf16=True,  # Ensure bf16 is supported on GPU
            max_completion_length=64,
            num_generations=4,
            max_prompt_length=128,
            report_to=["tensorboard"],
            logging_steps=10,
            push_to_hub=True,
            save_strategy="steps",
            save_steps=10,
        )

    def train_model(self, train_dataset):
        """Trains the model using GRPOTrainer."""
        if self.model is None or self.training_args is None:
            raise ValueError("Model and training configuration must be set before training.")

        self.trainer = GRPOTrainer(
            model=self.model,
            reward_funcs=[self.format_reward, self.accuracy_reward],
            args=self.training_args,
            train_dataset=train_dataset,
        )

        print("Training started...")
        self.trainer.train()
        print("Training completed!")
    
    def save_trained_model(self):
        """Saves the trained model to the Hugging Face Hub."""
        self.trainer.save_model(self.training_args.output_dir)
        self.trainer.push_to_hub(dataset_name=dataset_id)

In [101]:
def load_model(model_id):
    trained_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto",
    )
    trained_tokenizer = AutoTokenizer.from_pretrained(model_id)

In [102]:
def generate_with_reasoning(prompt, trained_model, trained_tokenizer):
    # Build the prompt from the dataset
    prompt = " ".join(entry["content"] for entry in prompt)

    # Tokenize and move to the same device as the model
    inputs = trained_tokenizer(prompt, return_tensors="pt").to(trained_model.device)

    # Generate text without gradients
    start_time = time.time()
    with torch.no_grad():
        output_ids = trained_model.generate(**inputs, max_length=500)
    end_time = time.time()

    # Decode and extract model response
    generated_text = trained_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Get inference time
    inference_duration = end_time - start_time

    # Get number of generated tokens
    num_input_tokens = inputs["input_ids"].shape[1]
    num_generated_tokens = output_ids.shape[1] - num_input_tokens

    return generated_text, inference_duration, num_generated_tokens

In [104]:
# Initialize and load the model
trainer = load_model("sergiopaniego/Qwen2-0.5B-GRPO")

# Set the trained model and tokenizer
trained_model = trainer.model
trained_tokenizer = trainer.tokenizer

# Define a sample prompt
prompt = test_dataset.iloc[0]["prompt"]

# Generate text with reasoning
response, duration, num_tokens = generate_with_reasoning(prompt, trained_model, trained_tokenizer)

print("Response:", response)
print(f"Inference duration: {duration:.2f} seconds")
print(f"Generated tokens: {num_tokens}")

ValueError: Unrecognized model in sergiopaniego/Qwen2-0.5B-GRPO. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth