In [1]:
import sys
import ast

In [2]:
# !{sys.executable} -m pip install numpy==1.26.4
# !{sys.executable} -m pip install unsloth vllm==0.7.2
# !{sys.executable} -m pip install -U ipywidgets

In [3]:
# from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import re
from datasets import load_dataset

In [4]:
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-21 21:55:14 __init__.py:190] Automatically detected platform cuda.


In [5]:
from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)

In [6]:
SYSTEM_PROMPT = '''
You are playing the NY Times Connections game. Your task is to categorize 16 given words into exactly 4 groups of 4 words each, based on shared common themes.

Only output your final solution in the following format:
<answer>
[['WORD1', 'WORD2', 'WORD3', 'WORD4'],
 ['WORD5', 'WORD6', 'WORD7', 'WORD8'],
 ['WORD9', 'WORD10', 'WORD11', 'WORD12'],
 ['WORD13', 'WORD14', 'WORD15', 'WORD16']]
</answer>

Rules:
	• Each word must belong to one group only.
	• Groups must have a clear, shared theme (e.g., weather, NBA teams, keyboard keys, etc.).
	• Do not include any words not present in the input list.

Here is an example:

USER: [BUCKS, HAIL, JAZZ, SHIFT, LEVEL, MOM, SNOW, RACECAR, SLEET, TAB, KAYAK, RETURN, OPTION, NETS, RAIN, HEAT]

SOLUTION:
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]

Explanation:
- WEATHER TERMS: 'HAIL', 'RAIN', 'SLEET', 'SNOW'
- NBA TEAMS: 'BUCKS', 'HEAT', 'JAZZ', 'NETS'
- KEYBOARD KEYS: 'OPTION', 'RETURN', 'SHIFT', 'TAB'
- PALINDROMES: 'KAYAK', 'LEVEL', 'MOM', 'RACECAR'
'''

In [7]:
def get_connections_questions(split="train"):
    data = load_dataset('csv', data_files='../data/final_transformed_connections.csv')[split]
    data = data.train_test_split(test_size=0.1, seed=3407)
    train_data = data['train']
    test_data  = data['test']
    # each x has 'questions' (the 16‐word list) and 'answer' (the reference)
    train_data = train_data.map(lambda x: {
        'prompt': [
            {'role': 'assistant', 'content': SYSTEM_PROMPT},
            {'role': 'user',      'content': x['questions']}
        ],
        'answer': x['answer']
    })
    test_data = test_data.map(lambda x: {
        'prompt': [
            {'role': 'assistant', 'content': SYSTEM_PROMPT},
            {'role': 'user',      'content': x['questions']}
        ],
        'answer': x['answer']
    })
    return train_data, test_data

train_ds, test_ds = get_connections_questions()

In [8]:
# ——————————————————————————————
# 2) Initialize model + LoRA adapter
# ——————————————————————————————

max_seq_length = 2048
lora_rank       = 8

# load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name            = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length        = max_seq_length,
    load_in_4bit          = True,    # 4‑bit base + LoRA
    fast_inference        = True,
    max_lora_rank         = lora_rank,
    gpu_memory_utilization= 0.9,
)

# wrap with PEFT
model = FastLanguageModel.get_peft_model(
    model              = model,
    r                  = lora_rank,
    target_modules     = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha         = lora_rank,
    random_state       = 3407,
)

# Make sure we also have a HF‐style tokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-Instruct", use_fast=True)
hf_tokenizer.pad_token_id = hf_tokenizer.eos_token_id


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.7.2.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU does not support prefix caching - will disable!
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 89.03%
Unsloth: Your GPU has CUDA compute capability 7.0 with VRAM = 31.73 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 25.85 GB. Also swap space = 6 GB.
INFO 04-21 21:55:29 config.py:542] This model supports multiple tasks: {'score', 'embed', 'generate', 'classif



INFO 04-21 21:55:32 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-21 21:55:34 model_runner.py:1115] Loading model weights took 2.2405 GB
INFO 04-21 21:55:34 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-21 21:55:36 worker.py:267] Memory profiling takes 1.97 seconds
INFO 04-21 21:55:36 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.89) = 28.25GiB
INFO 04-21 21:55:36 worker.py:267] model weights take 2.24GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 1.49GiB; the rest of the memory reserved for KV Cache is 24.45GiB.
INFO 04-21 21:55:36 executor_base.py:110] # CUDA blocks: 14309, # CPU blocks: 3510
INFO 04-21 21:55:36 executor_base.py:115] Maximum concurrency for 2048 tokens per request: 111.79x
INFO 04-21 21:55:40 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:31<00:00,  1.38it/s]

INFO 04-21 21:56:11 model_runner.py:1562] Graph capturing finished in 31 secs, took 0.76 GiB
INFO 04-21 21:56:11 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 37.40 seconds



Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [9]:
# ——————————————————————————————
# 3) Preprocess: pad labels to 1024
# ——————————————————————————————
def preprocess_sft(batch):
    inputs = [
        tokenizer.apply_chat_template(p, tokenize=False, add_generation_prompt=False)
        for p in batch["prompt"]
    ]
    model_inputs = hf_tokenizer(
        inputs,
        max_length = max_seq_length,
        padding    = "max_length",
        truncation = True,
    )
    labels = hf_tokenizer(
        batch["answer"],
        max_length = max_seq_length,    # ← pad answers out to 1024
        padding    = "max_length",
        truncation = True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_enc = train_ds.map(
    preprocess_sft, batched=True, remove_columns=train_ds.column_names
)
test_enc  = test_ds.map(
    preprocess_sft, batched=True, remove_columns=test_ds.column_names
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer           = hf_tokenizer,
    model               = model.model,   # unwrapped base
    label_pad_token_id  = -100,          # ignore padded labels
)

In [10]:
# ——————————————————————————————
# 4) Trainer with padded labels
# ——————————————————————————————
sft_args = TrainingArguments(
    output_dir                  = "sft_outputs",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 16,
    num_train_epochs            = 20,
    learning_rate               = 5e-5,
    fp16                        = not is_bfloat16_supported(),
    bf16                        = is_bfloat16_supported(),
    logging_steps               = 10,
    save_steps                  = 100,
    save_total_limit            = 2,
)

sft_trainer = Trainer(
    model         = model,
    args          = sft_args,
    train_dataset = train_enc,
    eval_dataset  = test_enc,
    tokenizer     = hf_tokenizer,
    data_collator = DataCollatorForSeq2Seq(
        tokenizer          = hf_tokenizer,
        model              = model.model,
        label_pad_token_id = -100,
    ),
)

  sft_trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# run the SFT pass
sft_trainer.train()
model.save_lora("sft_saved_lora_3B_sft")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 580 | Num Epochs = 20 | Total steps = 720
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 12,156,928/3,000,000,000 (0.41% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,12.4
20,11.322
30,9.504
40,8.0095
50,6.5728
60,4.8426
70,3.1192
80,1.6077
90,0.563
100,0.2697


In [12]:
max_seq_length = 1024
lora_rank = 8

In [14]:
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "[FLY, SINK, SHOWER, SALSA, TAP, RACE, DIP, MODERN, FALL, SWING, CARROT, TEAR, DROP, TALK, BLAZE, BOOM]"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("sft_saved_lora_3B_sft"),
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it, est. speed input: 353.84 toks/s, output: 73.84 toks/s]


"After examining the given list of words, I've categorized them into 4 groups of 4 words each based on shared themes. Here's my solution:\n\n[['FLY', 'SWING', 'TAP', 'DIP'],\n ['SINK', 'SHOWER', 'DROP', 'FALL'],\n ['RACE', 'BOOM', 'BLAZE', 'TEAR'],\n ['SALSA', 'MODERN', 'CARROT', 'TALK']]"

[['DIP', 'DROP', 'FALL', 'SINK'], ['BLAZE', 'FLY', 'RACE', 'TEAR'], ['MODERN', 'SALSA', 'SWING', 'TAP'], ['BOOM', 'CARROT', 'SHOWER', 'TALK']]