In [1]:
import torch
import time
import re
import json
import pandas as pd
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer, apply_chat_template
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from math_verify import LatexExtractionConfig, parse, verify

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_prompt(train_sample):
    SYSTEM_PROMPT = (
    "You are playing the NY Times Connections game. I will give you a set of 16 words, and I want you to provide 4 sets of exactly 4 words that are connected in some way. \
    I want you to group the words in such a way that each group has a common theme. Think about your answers carefully, as you will only have one chance to submit your answer. \
    Here is an example: If the words are: 'BUCKS, HAIL, JAZZ, SHIFT, LEVEL, MOM, SNOW, RACECAR, SLEET, TAB, KAYAK, RETURN, OPTION, NETS, RAIN, HEAT', \
    a possible answer could be: 'answer: [['HAIL', 'RAIN', 'SLEET', 'SNOW'], ['BUCKS', 'HEAT', 'JAZZ', 'NETS'], ['OPTION', 'RETURN', 'SHIFT', 'TAB'], ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']] and groups: ['WET WEATHER', 'NBA TEAMS', 'KEYBOARD KEYS', 'PALINDROMES']. \
    Give your answer strictly in the format (no other words): \
            '[[4 words of group1], [4 words of group2], [4 words of group3], [4 words of group4]]'  \
    "
    )
    
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": train_sample["question"]},
        ],
    }

In [3]:
df = pd.read_csv("./data/connections.csv", index_col=None)
train_dataset = df.iloc[:500]
test_dataset = df.iloc[500:]

train_dataset["prompt"] = train_dataset.apply(create_prompt, axis=1)
test_dataset["prompt"] = test_dataset.apply(create_prompt, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataset["prompt"] = train_dataset.apply(create_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["prompt"] = test_dataset.apply(create_prompt, axis=1)


In [4]:
# class ConnectionsDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_length=128):
#         self.dataframe = dataframe.reset_index(drop=True)
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         prompt_data = self.dataframe.iloc[idx]['prompt']
#         answer = self.dataframe.iloc[idx]['answers']

#         prompt_data = json.loads(prompt_data)

#         prompt = apply_chat_template(
#                 {"prompt": prompt_data},
#                 self.tokenizer,
#                 tokenize=False,
#                 add_generation_prompt=True
#             )
#         tokenized_input = self.tokenizer(
#             prompt,
#             padding="max_length",
#             truncation=True,
#             max_length=self.max_length,
#             return_tensors="pt"
#         )

#         return {
#             "input_ids": tokenized_input["input_ids"].squeeze(),
#             "attention_mask": tokenized_input["attention_mask"].squeeze(),
#             "answer": answer,
#         }


In [5]:
train_dataset = train_dataset.drop(columns=["question", "date", "groups"])
test_dataset = test_dataset.drop(columns=["question", "date", "groups"])

In [6]:
train_dataset.head()

Unnamed: 0,answers,prompt
0,"[['HAIL', 'RAIN', 'SLEET', 'SNOW'], ['BUCKS', ...","{'prompt': [{'role': 'system', 'content': 'You..."
1,"[['BOOT', 'LOAFER', 'PUMP', 'SNEAKER'], ['FOOT...","{'prompt': [{'role': 'system', 'content': 'You..."
2,"[['CHEEK', 'EYE', 'MOUTH', 'NOSE'], ['CHOW', '...","{'prompt': [{'role': 'system', 'content': 'You..."
3,"[['ADIDAS', 'NIKE', 'PUMA', 'REEBOK'], ['CABAR...","{'prompt': [{'role': 'system', 'content': 'You..."
4,"[['HULU', 'NETFLIX', 'PEACOCK', 'PRIME'], ['KE...","{'prompt': [{'role': 'system', 'content': 'You..."


In [7]:
train_dataset["prompt"] = train_dataset["prompt"].apply(lambda x: json.dumps(x)) 
test_dataset["prompt"] = test_dataset["prompt"].apply(lambda x: json.dumps(x))

In [8]:
class CustomGRPOTrainer:
    def __init__(self, model_id="Qwen/Qwen2-0.5B-Instruct", output_dir="GRPO-test"):
        self.model_id = model_id
        self.output_dir = output_dir
        self.model = None
        self.tokenizer = None
        self.trainer = None
        self.training_args = None

    def load_model(self):
        """Loads the base model and applies LoRA fine-tuning."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype="auto",
            device_map="auto",
        )

        lora_config = LoraConfig(
            task_type="CAUSAL_LM",
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"],
        )

        self.model = get_peft_model(self.model, lora_config)

    def format_reward(self, completions, **kwargs):
        print("Record1", completions)
        """Reward function that checks if the model's completion is in the correct format."""
        pattern = r"^Answer: \[\[.*?\], \[.*?\], \[.*?\], \[.*?\]\]\s*Group: \[.*?, .*?, .*?, .*?\]\.$"
        rewards = [1.0 if re.match(pattern, completion.strip()) else 0.0 for completion in completions]
        return rewards

    def accuracy_reward(self, completions, **kwargs):
        """Reward function that checks if the model's completion matches the ground truth answer."""
        ground_truth_answers = kwargs["answers"]
        rewards = []
        for generated, actual in zip(completions, ground_truth_answers):
            # Exact match reward (you can replace with more sophisticated checks)
            if generated.strip().lower() == actual.strip().lower():
                rewards.append(1.0)
            else:
                rewards.append(0.0)
        return rewards

    def configure_training(self):
        """Configures the training arguments for GRPOTrainer."""
        self.training_args = GRPOConfig(
            output_dir=self.output_dir,
            learning_rate=1e-5,
            gradient_accumulation_steps=16,
            num_train_epochs=1,
            bf16=True,  # Ensure bf16 is supported on GPU
            max_completion_length=64,
            num_generations=4,
            max_prompt_length=128,
            report_to=["tensorboard"],
            logging_steps=10,
            push_to_hub=False,
            save_strategy="steps",
            save_steps=10,
        )

    def train_model(self, train_dataset):
        """Trains the model using GRPOTrainer."""
        if self.model is None or self.training_args is None:
            raise ValueError("Model and training configuration must be set before training.")

        self.trainer = GRPOTrainer(
            model=self.model,
            reward_funcs=[self.format_reward, self.accuracy_reward],
            args=self.training_args,
            train_dataset=train_dataset,
        )

        print("Training started...")
        self.trainer.train()
        print("Training completed!")
    
    def save_trained_model(self):
        """Saves the trained model to the Hugging Face Hub."""
        self.trainer.save_model(self.training_args.output_dir)

In [9]:
# Instantiate and set up the trainer
trainer = CustomGRPOTrainer()
trainer.load_model()
trainer.configure_training()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
train_dataset = Dataset.from_pandas(train_dataset)
print(train_dataset)
trainer.train_model(train_dataset)

Dataset({
    features: ['answers', 'prompt'],
    num_rows: 500
})


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training started...
Record1 ['  print(grouping_results(group))  [["[[4 words of group1], [1 words of group2]]"], ["[[4 words of group3], [1 words of group4]]"], ["[[4 words of group1], [4 words of group2], [4 words of group3], [4', ']}]}]}]}]}]}\n```python\ndef parse_input(input_data):\n    """\n    Parses a string containing multiple groups (such as \'KEYBOARD KEYS\', \'PALINDROMES\') \n    into a dictionary where each group is separated by a comma and each element is converted\n    to lower case before being', "]\n\nThis example uses Python's `collections` module to define the rules for each group and the `group1`, `group2`, `group3`, `group4`, 'HARDY', 'FURIOUS', 'PUFFIN', 'SUNFLOWER', 'BEACH', 'INCRASE", ' What would be the output of the code? The output should be a list of strings, where each string represents a group and is formatted as "[4 words of group1], [4 words of group2], [4 words of group3], [4 words of group4]." In this case, the code would output', ' "flot" = [{\'elemen

  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


In [None]:
# Save and push trained model
trainer.save_trained_model()

In [None]:
def load_model(model_id):
    trained_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto",
    )
    trained_tokenizer = AutoTokenizer.from_pretrained(model_id)

    return {"model": trained_model, "tokenizer": trained_tokenizer}

In [None]:
def generate_with_reasoning(prompt, trained_model, trained_tokenizer):
    # Build the prompt from the dataset
    print(type(prompt))
    prompt = " ".join(entry["content"] for entry in prompt)

    # Tokenize and move to the same device as the model
    inputs = trained_tokenizer(prompt, return_tensors="pt").to(trained_model.device)

    # Generate text without gradients
    start_time = time.time()
    with torch.no_grad():
        output_ids = trained_model.generate(**inputs, max_length=500)
    end_time = time.time()

    # Decode and extract model response
    generated_text = trained_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Get inference time
    inference_duration = end_time - start_time

    # Get number of generated tokens
    num_input_tokens = inputs["input_ids"].shape[1]
    num_generated_tokens = output_ids.shape[1] - num_input_tokens

    return generated_text, inference_duration, num_generated_tokens

In [None]:
# Initialize and load the model
trainer = load_model("Qwen/Qwen2-0.5B-Instruct")

# Set the trained model and tokenizer
trained_model = trainer['model']
trained_tokenizer = trainer['tokenizer']

# Define a sample prompt
prompt = test_dataset.iloc[0]["prompt"]

# Generate text with reasoning
response, duration, num_tokens = generate_with_reasoning(prompt, trained_model, trained_tokenizer)

print("Response:", response)
print(f"Inference duration: {duration:.2f} seconds")
print(f"Generated tokens: {num_tokens}")

<class 'dict'>


TypeError: string indices must be integers, not 'str'