In [1]:
import sys

In [2]:
# !{sys.executable} -m pip install numpy==1.26.4
# !{sys.executable} -m pip install unsloth vllm==0.7.2
# !{sys.executable} -m pip install -U ipywidgets

In [3]:
# from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import re
from datasets import load_dataset

In [4]:
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-04-16 21:09:50.550007: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744862990.567735 1385304 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744862990.573234 1385304 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744862990.587364 1385304 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744862990.587381 1385304 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744862990.587383 1385304 computation_placer.cc:177] computation placer alr

🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-16 21:09:57 __init__.py:190] Automatically detected platform cuda.


In [5]:
SYSTEM_PROMPT = '''
You are playing the NY Times Connections game. Your task is to categorize 16 given words into exactly 4 groups of 4 words each, based on shared common themes.

Solve the puzzle using these clear steps:

1. THINK STEP-BY-STEP: Begin by carefully analyzing the words within <thinking> tags. Identify their meanings, relationships, and possible groupings logically.
Example:
<thinking>To solve this, I will first look for obvious groupings like NBA teams, palindrome words, weather-related words, or computer keys. Then, I'll group these words accordingly and ensure each word belongs to exactly one group.</thinking>

2. SHOW YOUR REASONING: After identifying each grouping, briefly explain why these words belong together. Clearly indicate the context or theme.

3. PROVIDE FINAL ANSWER: After clearly grouping and justifying all four sets, provide ONLY your final solution within <answer> tags. Format your solution exactly as shown below.

Example:
<answer>
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]
</answer>

Important Notes:
- Categories should be specific
- Words cannot appear in more than one group.
- Categories can include compound words, shared prefixes/suffixes, pop culture references, or common phrases.
- DO NOT ADD NEW WORDS THAT ARE NOT MENTIONED IN THE QUESTION. USE ONLY WORDS MENTIONED AND GROUP THEM

Here is an example:

USER: [BUCKS, HAIL, JAZZ, SHIFT, LEVEL, MOM, SNOW, RACECAR, SLEET, TAB, KAYAK, RETURN, OPTION, NETS, RAIN, HEAT]

SOLUTION:
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]

Explanation:
- WEATHER TERMS: 'HAIL', 'RAIN', 'SLEET', 'SNOW'
- NBA TEAMS: 'BUCKS', 'HEAT', 'JAZZ', 'NETS'
- KEYBOARD KEYS: 'OPTION', 'RETURN', 'SHIFT', 'TAB'
- PALINDROMES: 'KAYAK', 'LEVEL', 'MOM', 'RACECAR'
'''

In [6]:
def get_connections_questions(split = "train"):
    data = load_dataset('csv', data_files='../data/final_transformed_connections.csv')[split]
    data = data.train_test_split(test_size=0.1, seed=3407)
    train_data = data['train']
    test_data = data['test']
    train_data = train_data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'assistant', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['questions']}
        ],
        'answer': (x['answer'])
    })
    return train_data, test_data # type: ignore

dataset, test_dataset = get_connections_questions()

In [7]:
def thinking_reward_func(completions, **kwargs) -> list[float]:
    """Reward function for including thinking tags"""
    rewards = []
    for completion in completions:
        try:
            reward = 0.0
            # Extract all thinking blocks
            for message in completion:
                if message["role"] == "assistant" and message.get("content"):
                    content = message["content"]

                    # Count opening and closing tags
                    opening_tags = len(re.findall(r"<thinking>", content))
                    closing_tags = len(re.findall(r"</thinking>", content))

                    if opening_tags == 0 or closing_tags == 0:
                        continue

                    if opening_tags == closing_tags:
                        reward += 0.5
                    else:
                        reward += 0.1
            reward = min(reward, 1.5)
            rewards.append(reward)
        except Exception as e:
            print(f"{RED}Error in thinking_reward_func: {e}{RESET}")
            rewards.append(0.0)
    assert len(rewards) == len(completions)
    return rewards


def answer_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function for including answer tags"""
    rewards = []
    for completion in completions:
        try:
            reward = 0.0
            # Extract all answer blocks
            for message in completion:
                if message["role"] == "assistant" and message.get("content"):
                    content = message["content"]

                    # Count opening and closing tags
                    opening_tags = len(re.findall(r"<answer>", content))
                    closing_tags = len(re.findall(r"</answer>", content))

                    if opening_tags == 0 or closing_tags == 0:
                        continue

                    if opening_tags == 1 and closing_tags == 1:
                        reward += 0.5
                    else:
                        reward += 0.1
            reward = min(reward, 1.5)
            rewards.append(reward)
        except Exception as e:
            print(f"{RED}Error in thinking_reward_func: {e}{RESET}")
            rewards.append(0.0)
    assert len(rewards) == len(completions)
    return rewards


def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward function tailored for NY Times Connections game puzzle answers, with extended partial credit."""
    rewards = []

    for completion, expected_answer in zip(completions, answer):
        try:
            reward = 0.0

            # Extract the answer from completion
            predicted_answer = None
            for message in completion:
                if message["role"] == "assistant":
                    content = message.get("content", "")
                    matches = re.findall(r"<answer>(.*?)</answer>", content, re.DOTALL)
                    if matches:
                        predicted_answer = matches[-1].strip()

            if predicted_answer is None:
                rewards.append(0.0)
                continue

            # Convert string representations of lists into actual Python lists for comparison
            try:
                predicted_groups = eval(predicted_answer)
                expected_groups = eval(expected_answer)

                if isinstance(predicted_groups, list) and isinstance(expected_groups, list):
                    # +6 for exact match of all 4 groups
                    if predicted_groups == expected_groups:
                        reward += 6.0
                    else:
                        # +1.5 for each completely correct group
                        correct_groups = sum(1 for group in predicted_groups if group in expected_groups)
                        reward += correct_groups * 1.5

                        # +0.75 for groups with 3 correct words
                        for pred_group in predicted_groups:
                            for exp_group in expected_groups:
                                overlap = len(set(pred_group) & set(exp_group))
                                if overlap == 3:
                                    reward += 0.75
                                    break  # Avoid counting the same group multiple times

                        # +0.25 for each group having exactly 4 words
                        if all(isinstance(group, list) and len(group) == 4 for group in predicted_groups):
                            reward += 0.25

                        # +0.5 for no repeated words
                        all_words = [word for group in predicted_groups for word in group]
                        if len(set(all_words)) == len(all_words):
                            reward += 0.5
                else:
                    reward = 0.0

                rewards.append(reward)

            except Exception as e:
                print(f"Error1 in correctness_reward_func: {e}")
                print(completions)
                rewards.append(0.0)
            
        except Exception as e:
            print(f"Error2 in correctness_reward_func: {e}")
            rewards.append(0.0)

    assert len(rewards) == len(completions)
    return rewards


In [8]:
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 8 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    random_state = 3407,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.49.0. vLLM: 0.7.2.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU does not support prefix caching - will disable!
Unsloth: vLLM loading unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit with actual GPU utilization = 7.79%
Unsloth: Your GPU has CUDA compute capability 7.0 with VRAM = 31.73 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 1.41 GB. Also swap space = 6 GB.
INFO 04-16 21:10:18 config.py:542] This model supports multiple tasks: {'embed', 'score', 'classify', 'generate'



INFO 04-16 21:10:21 loader.py:1102] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 04-16 21:10:22 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-16 21:10:40 model_runner.py:1115] Loading model weights took 1.0453 GB
INFO 04-16 21:10:40 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-16 21:10:43 worker.py:267] Memory profiling takes 2.75 seconds
INFO 04-16 21:10:43 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.08) = 2.47GiB
INFO 04-16 21:10:43 worker.py:267] model weights take 1.05GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 0.59GiB; the rest of the memory reserved for KV Cache is 0.77GiB.
INFO 04-16 21:10:44 executor_base.py:110] # CUDA blocks: 1568, # CPU blocks: 12288
INFO 04-16 21:10:44 executor_base.py:115] Maximum concurrency for 1024 tokens per request: 24.50x
INFO 04-16 21:10:51 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error oc

Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:17<00:00,  1.07it/s]

INFO 04-16 21:11:09 model_runner.py:1562] Graph capturing finished in 18 secs, took 0.25 GiB
INFO 04-16 21:11:09 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 28.19 seconds



Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [9]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = False, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 16, # Increase to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = 128,
    max_completion_length = 64,
    num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 150,
    save_steps = 50,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


In [10]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        thinking_reward_func,
        correctness_reward_func,
        answer_format_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 580 | Num Epochs = 5 | Total steps = 150
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 16 x 1) = 64
 "-____-"     Trainable parameters = 5,636,096/1,000,000,000 (0.56% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / thinking_reward_func,rewards / correctness_reward_func,rewards / answer_format_reward_func
1,0.0,0.039062,0.078125,63.390625,0.0,0.0,0.023438,0.015625
2,-0.0,0.042188,0.084375,62.53125,0.0,0.0,0.023438,0.01875
3,0.0,0.027344,0.054688,62.03125,1.9e-05,0.0,0.011719,0.015625
4,0.0,0.074219,0.115418,62.359375,2.1e-05,0.0,0.035156,0.039062
5,0.0,0.066406,0.132812,60.125,2.3e-05,0.0,0.035156,0.03125


Error1 in correctness_reward_func: invalid syntax (<string>, line 1)
[[{'role': 'assistant', 'content': "BASED ON THE GIVEN WORDS, THE FOLLOWING IS MY GROUPED SOLUTION:\n\n[['KELVIN', 'AGENT', 'IMPLY', 'Verse'],\n ['POSSIBLY', 'CHEW', 'SUGGEST', 'THOUSAND'],\n [['COUCH', 'WHITE', 'ISSUE', '"}], [{'role': 'assistant', 'content': "EXPLANATION:\n['SCISSORS', 'LINE', 'ASSASSIN', 'VERSE'], ['AGENT', 'KO', 'METER', 'THOUSAND'], ['LEEP', 'COUCH', 'ASSET', 'POTASSIUM']"}], [{'role': 'assistant', 'content': "Here are 4 groups of 4 words each:\n\n <answer>\n[['KELVIN', 'AGENT', 'IMPLY', 'THOUSAND'],\n ['SUGGEST', 'VERSE', 'ASSET', 'LINE'],\n ['COUCH', 'MOLE', 'METER', 'OKAY'],\n ['"}], [{'role': 'assistant', 'content': "<aus cane <answer>\n& <answer>\n ['KELVIN', 'AGENT', 'IMPLY', 'VERSE']\n</answer>\n\nsolution:\n['KELVIN', 'AGENT', 'ASSET', 'METER']\n\nExplanation:\n- NUCLEAR ELEMENT: 'KELVIN', 'AGENT', '"}]]
Error1 in correctness_reward_func: invalid syntax (<string>, line 2)
[[{'role': 'assi

KeyboardInterrupt: 

In [None]:
model.save_lora("grpo_saved_lora_1B")

In [None]:
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "[BOXER, SLOW, MUZZLE, DALMATIAN, HUSKY, WAVE, SNOUT, DETOUR, TRUNK, YIELD, ASLEEP, BARRIER, POODLE, BEAK, STOP, BITE]"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora_1B"),
)[0].outputs[0].text

output

In [None]:
output

In [None]:
test_dataset[0]