In [14]:
import sys
import ast

In [2]:
# !{sys.executable} -m pip install numpy==1.26.4
# !{sys.executable} -m pip install unsloth vllm==0.7.2
# !{sys.executable} -m pip install -U ipywidgets

In [3]:
# from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import re
from datasets import load_dataset

In [4]:
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-16 22:29:51 __init__.py:190] Automatically detected platform cuda.


In [5]:
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

In [10]:
SYSTEM_PROMPT = '''
You are playing the NY Times Connections game. Your task is to categorize 16 given words into exactly 4 groups of 4 words each, based on shared common themes.

Solve the puzzle using these clear steps:

1. THINK STEP-BY-STEP: Begin by carefully analyzing the words within <thinking> tags. Identify their meanings, relationships, and possible groupings logically.
Example:
<thinking>To solve this, I will first look for obvious groupings like NBA teams, palindrome words, weather-related words, or computer keys. Then, I'll group these words accordingly and ensure each word belongs to exactly one group.</thinking>

2. SHOW YOUR REASONING: After identifying each grouping, briefly explain why these words belong together. Clearly indicate the context or theme.

3. PROVIDE FINAL ANSWER: After clearly grouping and justifying all four sets, provide ONLY your final solution within <answer> tags. Format your solution exactly as shown below.

Example:
<answer>
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]
</answer>

Important Notes:
- Categories should be specific
- Words cannot appear in more than one group.
- Categories can include compound words, shared prefixes/suffixes, pop culture references, or common phrases.
- DO NOT ADD NEW WORDS THAT ARE NOT MENTIONED IN THE QUESTION. USE ONLY WORDS MENTIONED AND GROUP THEM

Here is an example:

USER: [BUCKS, HAIL, JAZZ, SHIFT, LEVEL, MOM, SNOW, RACECAR, SLEET, TAB, KAYAK, RETURN, OPTION, NETS, RAIN, HEAT]

SOLUTION:
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]

Explanation:
- WEATHER TERMS: 'HAIL', 'RAIN', 'SLEET', 'SNOW'
- NBA TEAMS: 'BUCKS', 'HEAT', 'JAZZ', 'NETS'
- KEYBOARD KEYS: 'OPTION', 'RETURN', 'SHIFT', 'TAB'
- PALINDROMES: 'KAYAK', 'LEVEL', 'MOM', 'RACECAR'
'''

In [11]:

def get_connections_questions(split="train"):
    data = load_dataset('csv', data_files='../data/final_transformed_connections.csv')[split]
    data = data.train_test_split(test_size=0.1, seed=3407)
    train_data = data['train']
    test_data  = data['test']
    # each x has 'questions' (the 16‐word list) and 'answer' (the reference)
    train_data = train_data.map(lambda x: {
        'prompt': [
            {'role': 'assistant', 'content': SYSTEM_PROMPT},
            {'role': 'user',      'content': x['questions']}
        ],
        'answer': x['answer']
    })
    test_data = test_data.map(lambda x: {
        'prompt': [
            {'role': 'assistant', 'content': SYSTEM_PROMPT},
            {'role': 'user',      'content': x['questions']}
        ],
        'answer': x['answer']
    })
    return train_data, test_data

train_ds, test_ds = get_connections_questions()

In [8]:
# ——————————————————————————————
# 2) Initialize model + LoRA adapter
# ——————————————————————————————

max_seq_length = 1024
lora_rank       = 8

# load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name            = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length        = max_seq_length,
    load_in_4bit          = True,    # 4‑bit base + LoRA
    fast_inference        = True,
    max_lora_rank         = lora_rank,
    gpu_memory_utilization= 0.9,
)

# wrap with PEFT
model = FastLanguageModel.get_peft_model(
    model              = model,
    r                  = lora_rank,
    target_modules     = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha         = lora_rank,
    random_state       = 3407,
)

# Make sure we also have a HF‐style tokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct", use_fast=True)
hf_tokenizer.pad_token_id = hf_tokenizer.eos_token_id


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.1. vLLM: 0.7.2.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU does not support prefix caching - will disable!
Unsloth: vLLM loading unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit with actual GPU utilization = 89.03%
Unsloth: Your GPU has CUDA compute capability 7.0 with VRAM = 31.73 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 27.19 GB. Also swap space = 6 GB.
INFO 04-16 22:27:03 config.py:542] This model supports multiple tasks: {'generate', 'score', 'classify', 'rewa



INFO 04-16 22:27:05 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-16 22:27:06 model_runner.py:1115] Loading model weights took 1.0453 GB
INFO 04-16 22:27:06 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-16 22:27:08 worker.py:267] Memory profiling takes 1.79 seconds
INFO 04-16 22:27:08 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.89) = 28.25GiB
INFO 04-16 22:27:08 worker.py:267] model weights take 1.05GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 1.48GiB; the rest of the memory reserved for KV Cache is 25.66GiB.
INFO 04-16 22:27:09 executor_base.py:110] # CUDA blocks: 52545, # CPU blocks: 12288
INFO 04-16 22:27:09 executor_base.py:115] Maximum concurrency for 1024 tokens per request: 821.02x
INFO 04-16 22:27:13 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:25<00:00,  1.66it/s]

INFO 04-16 22:27:39 model_runner.py:1562] Graph capturing finished in 26 secs, took 0.53 GiB
INFO 04-16 22:27:39 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 32.37 seconds



Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [13]:

# ——————————————————————————————
# 3) Preprocess: pad labels to 1024
# ——————————————————————————————
def preprocess_sft(batch):
    inputs = [
        tokenizer.apply_chat_template(p, tokenize=False, add_generation_prompt=False)
        for p in batch["prompt"]
    ]
    model_inputs = hf_tokenizer(
        inputs,
        max_length = max_seq_length,
        padding    = "max_length",
        truncation = True,
    )
    labels = hf_tokenizer(
        batch["answer"],
        max_length = max_seq_length,    # ← pad answers out to 1024
        padding    = "max_length",
        truncation = True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_enc = train_ds.map(
    preprocess_sft, batched=True, remove_columns=train_ds.column_names
)
test_enc  = test_ds.map(
    preprocess_sft, batched=True, remove_columns=test_ds.column_names
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer           = hf_tokenizer,
    model               = model.model,   # unwrapped base
    label_pad_token_id  = -100,          # ignore padded labels
)

Map:   0%|          | 0/580 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [15]:
# ——————————————————————————————
# 4) Seq2SeqTrainer with padded labels
# ——————————————————————————————
sft_args = Seq2SeqTrainingArguments(
    output_dir                  = "sft_outputs",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 16,
    num_train_epochs            = 3,
    learning_rate               = 5e-5,
    fp16                        = not is_bfloat16_supported(),
    bf16                        = is_bfloat16_supported(),
    logging_steps               = 10,
    save_steps                  = 100,
    save_total_limit            = 2,
    predict_with_generate       = True,   # generate during eval if you want
    generation_max_length       = 256,    # max tokens for eval generation
)

sft_trainer = Seq2SeqTrainer(
    model         = model,
    args          = sft_args,
    train_dataset = train_enc,
    eval_dataset  = test_enc,
    tokenizer     = hf_tokenizer,
    data_collator = DataCollatorForSeq2Seq(
        tokenizer          = hf_tokenizer,
        model              = model.model,   # base model for collator
        label_pad_token_id = -100,          # ignore padding in loss
    ),
)

  sft_trainer = Seq2SeqTrainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
# run the SFT pass
sft_trainer.train()
model.save_lora("sft_saved_lora_1B")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 580 | Num Epochs = 3 | Total steps = 108
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 5,636,096/1,000,000,000 (0.56% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,12.3104
20,9.0561
30,7.5332
40,4.565
50,1.5788
60,0.8001
70,0.6553
80,0.5797
90,0.5413
100,0.5156


In [7]:
max_seq_length = 1024
lora_rank = 8

In [8]:
# ——————————————————————————————
# 4) GRPO stage, initialized from SFT adapter
# ——————————————————————————————

# re‑load the same base & adapter structure
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name             = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length         = max_seq_length,
    load_in_4bit           = True,
    fast_inference         = True,
    max_lora_rank          = lora_rank,
    gpu_memory_utilization = 0.9,
)
model = FastLanguageModel.get_peft_model(
    model          = model,
    r              = lora_rank,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha     = lora_rank,
)
# load your SFT‐trained adapter
model.load_lora("sft_saved_lora_1B")

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.1. vLLM: 0.7.2.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU does not support prefix caching - will disable!
Unsloth: vLLM loading unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit with actual GPU utilization = 89.03%
Unsloth: Your GPU has CUDA compute capability 7.0 with VRAM = 31.73 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 27.19 GB. Also swap space = 6 GB.
INFO 04-16 22:30:08 config.py:542] This model supports multiple tasks: {'embed', 'reward', 'generate', 'classi



INFO 04-16 22:30:10 loader.py:1102] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 04-16 22:30:11 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-16 22:30:21 model_runner.py:1115] Loading model weights took 1.0453 GB
INFO 04-16 22:30:21 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-16 22:30:24 worker.py:267] Memory profiling takes 2.55 seconds
INFO 04-16 22:30:24 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.89) = 28.25GiB
INFO 04-16 22:30:24 worker.py:267] model weights take 1.05GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 1.48GiB; the rest of the memory reserved for KV Cache is 25.66GiB.
INFO 04-16 22:30:24 executor_base.py:110] # CUDA blocks: 52545, # CPU blocks: 12288
INFO 04-16 22:30:24 executor_base.py:115] Maximum concurrency for 1024 tokens per request: 821.02x
INFO 04-16 22:30:28 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:26<00:00,  1.63it/s]

INFO 04-16 22:30:55 model_runner.py:1562] Graph capturing finished in 26 secs, took 0.53 GiB
INFO 04-16 22:30:55 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 33.62 seconds



Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


LoRARequest(lora_name='0', lora_int_id=0, lora_path='sft_saved_lora_1B', lora_tensors=None, lora_config=(None,), lora_local_path=None, long_lora_max_len=None, base_model_name=None, lora_embeddings=None)

In [15]:
def thinking_reward_func(completions, **kwargs) -> list[float]:
    """Reward function for including thinking tags"""
    rewards = []
    for completion in completions:
        try:
            reward = 0.0
            # Extract all thinking blocks
            for message in completion:
                if message["role"] == "assistant" and message.get("content"):
                    content = message["content"]

                    # Count opening and closing tags
                    opening_tags = len(re.findall(r"<thinking>", content))
                    closing_tags = len(re.findall(r"</thinking>", content))

                    if opening_tags == 0 or closing_tags == 0:
                        continue

                    if opening_tags == closing_tags:
                        reward += 0.5
                    else:
                        reward += 0.1
            reward = min(reward, 1.5)
            rewards.append(reward)
        except Exception as e:
            print(f"{RED}Error in thinking_reward_func: {e}{RESET}")
            rewards.append(0.0)
    assert len(rewards) == len(completions)
    return rewards


def answer_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function for including answer tags"""
    rewards = []
    for completion in completions:
        try:
            reward = 0.0
            # Extract all answer blocks
            for message in completion:
                if message["role"] == "assistant" and message.get("content"):
                    content = message["content"]

                    # Count opening and closing tags
                    opening_tags = len(re.findall(r"<answer>", content))
                    closing_tags = len(re.findall(r"</answer>", content))

                    if opening_tags == 0 or closing_tags == 0:
                        continue

                    if opening_tags == 1 and closing_tags == 1:
                        reward += 0.5
                    else:
                        reward += 0.1
            reward = min(reward, 1.5)
            rewards.append(reward)
        except Exception as e:
            print(f"{RED}Error in thinking_reward_func: {e}{RESET}")
            rewards.append(0.0)
    assert len(rewards) == len(completions)
    return rewards


def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Safe, robust reward for NYT Connections."""
    rewards = []
    
    for completion, expected_answer in zip(completions, answer):
        # default
        reward = 0.0

        # 1) Extract last <answer>…</answer> block
        predicted_block = None
        for msg in completion:
            if msg.get("role") == "assistant" and msg.get("content"):
                matches = re.findall(r"<answer>(.*?)</answer>", msg["content"], re.DOTALL)
                if matches:
                    predicted_block = matches[-1].strip()
        if not predicted_block:
            rewards.append(0.0)
            continue

        # 2) Safe parse
        try:
            pred_groups = ast.literal_eval(predicted_block)
            exp_groups  = ast.literal_eval(expected_answer)
        except Exception:
            rewards.append(0.0)
            continue

        # 3) Normalize: keep only lists of strings
        def clean(groups):
            cleaned = []
            for g in groups:
                if isinstance(g, list):
                    cleaned.append([w for w in g if isinstance(w, str)])
            return cleaned

        pred = clean(pred_groups)
        exp  = clean(exp_groups)

        # 4) Scoring
        # Perfect match
        if pred == exp:
            rewards.append(6.0)
            continue

        # +1.5 for each fully correct group (set‐equality)
        used = set()
        for pg in pred:
            for i, eg in enumerate(exp):
                if i in used: 
                    continue
                if set(pg) == set(eg):
                    reward += 1.5
                    used.add(i)
                    break

        # +0.75 for any 3‐word overlap in unmatched groups
        for pg in pred:
            for i, eg in enumerate(exp):
                if i in used:
                    continue
                overlap = len([w for w in pg if w in eg])
                if overlap == 3:
                    reward += 0.75
                    used.add(i)
                    break

        # +0.25 if every predicted group has exactly 4 words
        if all(len(pg) == 4 for pg in pred):
            reward += 0.25

        # +0.5 if no word is repeated across all predicted groups
        flat = [w for pg in pred for w in pg]
        if len(flat) == len(set(flat)):
            reward += 0.5

        rewards.append(reward)

    assert len(rewards) == len(completions)
    return rewards

In [16]:
from trl import GRPOConfig, GRPOTrainer

rl_args = GRPOConfig(
    use_vllm                     = False,
    learning_rate                = 5e-6,
    adam_beta1                   = 0.9,
    adam_beta2                   = 0.99,
    weight_decay                 = 0.1,
    warmup_ratio                 = 0.1,
    lr_scheduler_type            = "cosine",
    optim                        = "adamw_8bit",
    logging_steps                = 1,
    bf16                         = is_bfloat16_supported(),
    fp16                         = not is_bfloat16_supported(),
    per_device_train_batch_size  = 1,
    gradient_accumulation_steps  = 16,
    num_generations              = 4,
    max_prompt_length            = 128,
    max_completion_length        = 64,
    num_train_epochs             = 1,
    max_steps                    = 150,
    save_steps                   = 50,
    max_grad_norm                = 0.1,
    report_to                    = "none",
    output_dir                   = "outputs_grpo",
)

trainer = GRPOTrainer(
    model            = model,
    processing_class = tokenizer,
    reward_funcs     = [
        thinking_reward_func,
        correctness_reward_func,
        answer_format_reward_func,
    ],
    args             = rl_args,
    train_dataset    = train_ds,  # or a mix of train & test
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


In [17]:
trainer.train()
model.save_lora("grpo_saved_lora_1B")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 580 | Num Epochs = 5 | Total steps = 150
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 16 x 1) = 64
 "-____-"     Trainable parameters = 5,636,096/1,000,000,000 (0.56% trained)


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / thinking_reward_func,rewards / correctness_reward_func,rewards / answer_format_reward_func
1,0.0,0.039062,0.078125,63.328125,2e-05,0.0,0.023438,0.015625
2,0.0,0.042188,0.084375,62.390625,1.8e-05,0.0,0.023438,0.01875
3,0.0,0.027344,0.054688,62.03125,2.1e-05,0.0,0.011719,0.015625
4,0.0,0.046875,0.09375,62.140625,2e-05,0.0,0.023438,0.023438
5,0.0,0.066406,0.132812,60.125,3.2e-05,0.0,0.035156,0.03125
6,0.0,0.019531,0.039062,62.671875,2.3e-05,0.0,0.011719,0.007812
7,0.0,0.023438,0.02992,63.03125,1.8e-05,0.0,0.007812,0.015625
8,0.0,0.058594,0.117188,62.5,1.8e-05,0.0,0.027344,0.03125
9,0.0,0.121094,0.242188,63.34375,1.8e-05,0.007812,0.058594,0.054688
10,0.0,0.03125,0.0625,63.515625,1.6e-05,0.0,0.015625,0.015625


In [18]:
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "[BOXER, SLOW, MUZZLE, DALMATIAN, HUSKY, WAVE, SNOUT, DETOUR, TRUNK, YIELD, ASLEEP, BARRIER, POODLE, BEAK, STOP, BITE]"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora_1B"),
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s, est. speed input: 991.92 toks/s, output: 111.78 toks/s]


"<answer>\n[['BOXER', 'DALMATIAN', 'HUSKY', 'BEAR'],\n ['SLOW', 'STOP', 'BEAK', 'YOWL'],\n ['WAVE', 'MUFFIN', 'BARRIER', 'BITE'],\n ['YIELD', 'POODLE', 'SNOUT', 'STOPPED']"

In [19]:
output

"<answer>\n[['BOXER', 'DALMATIAN', 'HUSKY', 'BEAR'],\n ['SLOW', 'STOP', 'BEAK', 'YOWL'],\n ['WAVE', 'MUFFIN', 'BARRIER', 'BITE'],\n ['YIELD', 'POODLE', 'SNOUT', 'STOPPED']"

In [21]:
test_ds[0]

{'questions': 'BOXER, SLOW, MUZZLE, DALMATIAN, HUSKY, WAVE, SNOUT, DETOUR, TRUNK, YIELD, ASLEEP, BARRIER, POODLE, BEAK, STOP, BITE',
 'answer': "[['BOXER', 'DALMATIAN', 'HUSKY', 'POODLE'], ['BEAK', 'MUZZLE', 'SNOUT', 'TRUNK'], ['DETOUR', 'SLOW', 'STOP', 'YIELD'], ['ASLEEP', 'BARRIER', 'BITE', 'WAVE']]",
 'grouped_answers': "DOG BREEDS - ['BOXER', 'DALMATIAN', 'HUSKY', 'POODLE']\nANIMAL NOSES - ['BEAK', 'MUZZLE', 'SNOUT', 'TRUNK']\nTRAFFIC SIGNS - ['DETOUR', 'SLOW', 'STOP', 'YIELD']\nSOUND ___ - ['ASLEEP', 'BARRIER', 'BITE', 'WAVE']",
 'prompt': [{'content': "\nYou are playing the NY Times Connections game. Your task is to categorize 16 given words into exactly 4 groups of 4 words each, based on shared common themes.\n\nSolve the puzzle using these clear steps:\n\n1. THINK STEP-BY-STEP: Begin by carefully analyzing the words within <thinking> tags. Identify their meanings, relationships, and possible groupings logically.\nExample:\n<thinking>To solve this, I will first look for obvious 