In [1]:
import sys
import ast

In [2]:
# !{sys.executable} -m pip install numpy==1.26.4
# !{sys.executable} -m pip install unsloth vllm==0.7.2
# !{sys.executable} -m pip install -U ipywidgets

In [3]:
# from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import re
from datasets import load_dataset

In [4]:
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-04-17 22:54:21.637013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744955661.655335 1380583 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744955661.660897 1380583 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744955661.675140 1380583 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744955661.675156 1380583 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744955661.675158 1380583 computation_placer.cc:177] computation placer alr

🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-17 22:54:28 __init__.py:190] Automatically detected platform cuda.


In [5]:
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

In [6]:
SYSTEM_PROMPT = '''
You are playing the NY Times Connections game. Your task is to categorize 16 given words into exactly 4 groups of 4 words each, based on shared common themes.

Solve the puzzle using these clear steps:

1. THINK STEP-BY-STEP: Begin by carefully analyzing the words within <thinking> tags. Identify their meanings, relationships, and possible groupings logically.
Example:
<thinking>To solve this, I will first look for obvious groupings like NBA teams, palindrome words, weather-related words, or computer keys. Then, I'll group these words accordingly and ensure each word belongs to exactly one group.</thinking>

2. PROVIDE FINAL ANSWER: After clearly grouping and justifying all four sets, provide ONLY your final solution within <answer> tags. Format your solution exactly as shown below.

Example:
<answer>
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]
</answer>

Important Notes:
- Categories should be specific
- Words cannot appear in more than one group.
- Categories can include compound words, shared prefixes/suffixes, pop culture references, or common phrases.
- DO NOT ADD NEW WORDS THAT ARE NOT MENTIONED IN THE QUESTION. USE ONLY WORDS MENTIONED AND GROUP THEM

Here is an example:

USER: [BUCKS, HAIL, JAZZ, SHIFT, LEVEL, MOM, SNOW, RACECAR, SLEET, TAB, KAYAK, RETURN, OPTION, NETS, RAIN, HEAT]

SOLUTION:
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]

Explanation:
- WEATHER TERMS: 'HAIL', 'RAIN', 'SLEET', 'SNOW'
- NBA TEAMS: 'BUCKS', 'HEAT', 'JAZZ', 'NETS'
- KEYBOARD KEYS: 'OPTION', 'RETURN', 'SHIFT', 'TAB'
- PALINDROMES: 'KAYAK', 'LEVEL', 'MOM', 'RACECAR'
'''

In [7]:

def get_connections_questions(split="train"):
    data = load_dataset('csv', data_files='../data/final_transformed_connections.csv')[split]
    data = data.train_test_split(test_size=0.1, seed=3407)
    train_data = data['train']
    test_data  = data['test']
    # each x has 'questions' (the 16‐word list) and 'answer' (the reference)
    train_data = train_data.map(lambda x: {
        'prompt': [
            {'role': 'assistant', 'content': SYSTEM_PROMPT},
            {'role': 'user',      'content': x['questions']}
        ],
        'answer': x['answer']
    })
    test_data = test_data.map(lambda x: {
        'prompt': [
            {'role': 'assistant', 'content': SYSTEM_PROMPT},
            {'role': 'user',      'content': x['questions']}
        ],
        'answer': x['answer']
    })
    return train_data, test_data

train_ds, test_ds = get_connections_questions()

In [8]:
# ——————————————————————————————
# 2) Initialize model + LoRA adapter
# ——————————————————————————————

max_seq_length = 2048
lora_rank       = 8

# load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name            = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length        = max_seq_length,
    load_in_4bit          = True,    # 4‑bit base + LoRA
    fast_inference        = True,
    max_lora_rank         = lora_rank,
    gpu_memory_utilization= 0.9,
)

# wrap with PEFT
model = FastLanguageModel.get_peft_model(
    model              = model,
    r                  = lora_rank,
    target_modules     = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha         = lora_rank,
    random_state       = 3407,
)

# Make sure we also have a HF‐style tokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-Instruct", use_fast=True)
hf_tokenizer.pad_token_id = hf_tokenizer.eos_token_id


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.49.0. vLLM: 0.7.2.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU does not support prefix caching - will disable!
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 89.03%
Unsloth: Your GPU has CUDA compute capability 7.0 with VRAM = 31.73 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 25.85 GB. Also swap space = 6 GB.
INFO 04-17 22:54:47 config.py:542] This model supports multiple tasks: {'embed', 'score', 'generate', 'reward'



INFO 04-17 22:54:55 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-17 22:54:57 model_runner.py:1115] Loading model weights took 2.2405 GB
INFO 04-17 22:54:57 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-17 22:54:59 worker.py:267] Memory profiling takes 2.10 seconds
INFO 04-17 22:54:59 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.89) = 28.25GiB
INFO 04-17 22:54:59 worker.py:267] model weights take 2.24GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 1.49GiB; the rest of the memory reserved for KV Cache is 24.45GiB.
INFO 04-17 22:55:00 executor_base.py:110] # CUDA blocks: 14309, # CPU blocks: 3510
INFO 04-17 22:55:00 executor_base.py:115] Maximum concurrency for 2048 tokens per request: 111.79x
INFO 04-17 22:55:03 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:42<00:00,  1.00it/s]

INFO 04-17 22:55:46 model_runner.py:1562] Graph capturing finished in 43 secs, took 0.76 GiB
INFO 04-17 22:55:46 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 49.23 seconds



Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [9]:

# ——————————————————————————————
# 3) Preprocess: pad labels to 1024
# ——————————————————————————————
def preprocess_sft(batch):
    inputs = [
        tokenizer.apply_chat_template(p, tokenize=False, add_generation_prompt=False)
        for p in batch["prompt"]
    ]
    model_inputs = hf_tokenizer(
        inputs,
        max_length = max_seq_length,
        padding    = "max_length",
        truncation = True,
    )
    labels = hf_tokenizer(
        batch["answer"],
        max_length = max_seq_length,    # ← pad answers out to 1024
        padding    = "max_length",
        truncation = True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_enc = train_ds.map(
    preprocess_sft, batched=True, remove_columns=train_ds.column_names
)
test_enc  = test_ds.map(
    preprocess_sft, batched=True, remove_columns=test_ds.column_names
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer           = hf_tokenizer,
    model               = model.model,   # unwrapped base
    label_pad_token_id  = -100,          # ignore padded labels
)

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [10]:
# ——————————————————————————————
# 4) Seq2SeqTrainer with padded labels
# ——————————————————————————————
sft_args = Seq2SeqTrainingArguments(
    output_dir                  = "sft_outputs",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 16,
    num_train_epochs            = 3,
    learning_rate               = 5e-5,
    fp16                        = not is_bfloat16_supported(),
    bf16                        = is_bfloat16_supported(),
    logging_steps               = 10,
    save_steps                  = 100,
    save_total_limit            = 2,
    predict_with_generate       = True,   # generate during eval if you want
    generation_max_length       = 256,    # max tokens for eval generation
)

sft_trainer = Seq2SeqTrainer(
    model         = model,
    args          = sft_args,
    train_dataset = train_enc,
    eval_dataset  = test_enc,
    tokenizer     = hf_tokenizer,
    data_collator = DataCollatorForSeq2Seq(
        tokenizer          = hf_tokenizer,
        model              = model.model,   # base model for collator
        label_pad_token_id = -100,          # ignore padding in loss
    ),
)

  sft_trainer = Seq2SeqTrainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# run the SFT pass
sft_trainer.train()
model.save_lora("sft_saved_lora_3B")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 580 | Num Epochs = 3 | Total steps = 108
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 12,156,928/3,000,000,000 (0.41% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,12.4153
20,11.3275
30,9.6508
40,8.2289
50,7.2586
60,6.3691
70,5.6394
80,5.0827
90,4.6289
100,4.2862


In [12]:
max_seq_length = 1024
lora_rank = 8

In [13]:
# ——————————————————————————————
# 4) GRPO stage, initialized from SFT adapter
# ——————————————————————————————

# re‑load the same base & adapter structure
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name             = "unsloth/Llama-3.2-3B-Instruct",
#     max_seq_length         = max_seq_length,
#     load_in_4bit           = True,
#     fast_inference         = True,
#     max_lora_rank          = lora_rank,
#     gpu_memory_utilization = 0.9,
# )
# model = FastLanguageModel.get_peft_model(
#     model          = model,
#     r              = lora_rank,
#     target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
#     lora_alpha     = lora_rank,
# )
# load your SFT‐trained adapter
model.load_lora("sft_saved_lora_3B")

LoRARequest(lora_name='0', lora_int_id=0, lora_path='sft_saved_lora_3B', lora_tensors=None, lora_config=(None,), lora_local_path=None, long_lora_max_len=None, base_model_name=None, lora_embeddings=None)

In [14]:
def thinking_reward_func(completions, **kwargs) -> list[float]:
    """Reward function for including thinking tags"""
    rewards = []
    for completion in completions:
        try:
            reward = 0.0
            # Extract all thinking blocks
            for message in completion:
                if message["role"] == "assistant" and message.get("content"):
                    content = message["content"]

                    # Count opening and closing tags
                    opening_tags = len(re.findall(r"<thinking>", content))
                    closing_tags = len(re.findall(r"</thinking>", content))

                    if opening_tags == 0 or closing_tags == 0:
                        continue

                    if opening_tags == closing_tags:
                        reward += 0.5
                    else:
                        reward += 0.1
            reward = min(reward, 1.5)
            rewards.append(reward)
        except Exception as e:
            print(f"{RED}Error in thinking_reward_func: {e}{RESET}")
            rewards.append(0.0)
    assert len(rewards) == len(completions)
    return rewards


def answer_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function for including answer tags"""
    rewards = []
    for completion in completions:
        try:
            reward = 0.0
            # Extract all answer blocks
            for message in completion:
                if message["role"] == "assistant" and message.get("content"):
                    content = message["content"]

                    # Count opening and closing tags
                    opening_tags = len(re.findall(r"<answer>", content))
                    closing_tags = len(re.findall(r"</answer>", content))

                    if opening_tags == 0 or closing_tags == 0:
                        continue

                    if opening_tags == 1 and closing_tags == 1:
                        reward += 0.5
                    else:
                        reward += 0.1
            reward = min(reward, 1.5)
            rewards.append(reward)
        except Exception as e:
            print(f"{RED}Error in thinking_reward_func: {e}{RESET}")
            rewards.append(0.0)
    assert len(rewards) == len(completions)
    return rewards


def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Safe, robust reward for NYT Connections."""
    rewards = []
    
    for completion, expected_answer in zip(completions, answer):
        # default
        reward = 0.0

        # 1) Extract last <answer>…</answer> block
        predicted_block = None
        for msg in completion:
            if msg.get("role") == "assistant" and msg.get("content"):
                matches = re.findall(r"<answer>(.*?)</answer>", msg["content"], re.DOTALL)
                if matches:
                    predicted_block = matches[-1].strip()
        if not predicted_block:
            rewards.append(0.0)
            continue

        # 2) Safe parse
        try:
            pred_groups = ast.literal_eval(predicted_block)
            exp_groups  = ast.literal_eval(expected_answer)
        except Exception:
            rewards.append(0.0)
            continue

        # 3) Normalize: keep only lists of strings
        def clean(groups):
            cleaned = []
            for g in groups:
                if isinstance(g, list):
                    cleaned.append([w for w in g if isinstance(w, str)])
            return cleaned

        pred = clean(pred_groups)
        exp  = clean(exp_groups)

        # 4) Scoring
        # Perfect match
        if pred == exp:
            rewards.append(6.0)
            continue

        # +1.5 for each fully correct group (set‐equality)
        used = set()
        for pg in pred:
            for i, eg in enumerate(exp):
                if i in used: 
                    continue
                if set(pg) == set(eg):
                    reward += 1.5
                    used.add(i)
                    break

        # +0.75 for any 3‐word overlap in unmatched groups
        for pg in pred:
            for i, eg in enumerate(exp):
                if i in used:
                    continue
                overlap = len([w for w in pg if w in eg])
                if overlap == 3:
                    reward += 0.75
                    used.add(i)
                    break

        # +0.25 if every predicted group has exactly 4 words
        if all(len(pg) == 4 for pg in pred):
            reward += 0.25

        # +0.5 if no word is repeated across all predicted groups
        flat = [w for pg in pred for w in pg]
        if len(flat) == len(set(flat)):
            reward += 0.5

        rewards.append(reward)

    assert len(rewards) == len(completions)
    return rewards

In [15]:
from trl import GRPOConfig, GRPOTrainer

rl_args = GRPOConfig(
    use_vllm                     = False,
    learning_rate                = 5e-6,
    adam_beta1                   = 0.9,
    adam_beta2                   = 0.99,
    weight_decay                 = 0.1,
    warmup_ratio                 = 0.1,
    lr_scheduler_type            = "cosine",
    optim                        = "adamw_8bit",
    logging_steps                = 1,
    bf16                         = is_bfloat16_supported(),
    fp16                         = not is_bfloat16_supported(),
    per_device_train_batch_size  = 1,
    gradient_accumulation_steps  = 16,
    num_generations              = 4,
    max_prompt_length            = 128,
    max_completion_length        = 64,
    num_train_epochs             = 1,
    max_steps                    = 50,
    save_steps                   = 10,
    max_grad_norm                = 0.1,
    report_to                    = "none",
    output_dir                   = "outputs_grpo",
)

trainer = GRPOTrainer(
    model            = model,
    processing_class = tokenizer,
    reward_funcs     = [
        thinking_reward_func,
        correctness_reward_func,
        answer_format_reward_func,
    ],
    args             = rl_args,
    train_dataset    = train_ds,  # or a mix of train & test
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


In [16]:
trainer.train()
model.save_lora("grpo_saved_lora_3B")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 580 | Num Epochs = 2 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 16 x 1) = 64
 "-____-"     Trainable parameters = 12,156,928/3,000,000,000 (0.41% trained)


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / thinking_reward_func,rewards / correctness_reward_func,rewards / answer_format_reward_func
1,0.0002,0.196875,0.238795,64.0,0.004228,0.196875,0.0,0.0
2,0.0002,0.226562,0.226002,64.0,0.003957,0.226562,0.0,0.0
3,0.0002,0.125,0.134669,64.0,0.004196,0.117188,0.0,0.007812
4,0.0002,0.15625,0.246461,64.0,0.00393,0.15625,0.0,0.0
5,0.0002,0.203125,0.233253,63.890625,0.003848,0.203125,0.0,0.0
6,0.0002,0.164062,0.23567,64.0,0.003975,0.164062,0.0,0.0
7,0.0002,0.190625,0.240133,64.0,0.004478,0.190625,0.0,0.0
8,0.0002,0.142188,0.196422,64.0,0.004229,0.142188,0.0,0.0
9,0.0002,0.16875,0.200255,64.0,0.005388,0.16875,0.0,0.0
10,0.0002,0.1875,0.215211,63.09375,0.005797,0.1875,0.0,0.0


In [23]:
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "[FLY, SINK, SHOWER, SALSA, TAP, RACE, DIP, MODERN, FALL, SWING, CARROT, TEAR, DROP, TALK, BLAZE, BOOM]"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora_3B"),
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it, est. speed input: 91.31 toks/s, output: 79.69 toks/s]


"<thinking>To start, let's analyze the words within the provided list. We can identify several connections:\n\n- A few words seem to be related to falls or dropping: 'FLY' (falling, 'DROP', 'TEAR', 'FALL')\n- Some words are related to actions: 'SINK', 'SWING', 'TALK', 'BOOM'\n- 'TAP' is connected to both 'DIP' and 'MODERN'\n- 'RACE' is related to 'CARROT', which might be a bit confusing but is a common phrase 'carrot and stick'\n- 'SHOWER' might seem out of place but could be connected to 'BLAZE' through the phrase 'fire and water'\n- A final set seems to be a mix of words that sound similar: 'FLY' and 'FLY' is not present, 'FALL' and 'TALK' and 'BOOM' are similar to 'SWING' and 'DIP' in sound\n\nAfter re-examining the list, I have come up with the following groups:\n\n<thinking> Group 1 consists of words that are related to falling or dropping: 'FLY', 'DROP', 'TEAR', 'FALL'\n Group 2 contains words that are connected to actions: 'SINK', 'SWING', 'TALK', 'BOOM'\n Group 3 includes words