In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
# Skip restarting message in Colab
import sys; modules = list(sys.modules.keys())
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None

!pip install numpy==1.26.4
!pip install unsloth vllm==0.7.2

In [3]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import re
from datasets import load_dataset

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-19 00:15:46 __init__.py:190] Automatically detected platform cuda.


In [4]:
# Load and prep dataset
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer"""

In [5]:
def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

In [6]:
def get_connections_questions(split = "train"):
    data = load_dataset('csv', data_files='/content/drive/MyDrive/data/final_transformed_connections.csv')[split]
    data = data.train_test_split(test_size=0.1, seed=3407)
    train_data = data['train']
    test_data = data['test']
    train_data = train_data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['questions']}
        ],
        'answer': (x['answer'])
    })
    return train_data, test_data # type: ignore

dataset, test_dataset = get_connections_questions()

In [7]:
dataset

Dataset({
    features: ['questions', 'answer', 'grouped_answers', 'prompt'],
    num_rows: 580
})

In [8]:
def parse_groups(answer_text: str) -> list[set[str]]:
    groups = []
    lines = answer_text.strip().split("\n")
    for line in lines:
        if '-' in line:
            _, group_words_str = line.split("-", 1)
            words = [w.strip().strip("]").strip("[").strip("'") for w in group_words_str.split(",") if w.strip()]
            if words:
                groups.append(words)
    return groups

In [9]:
def extract_reasoning_block(response: str) -> str:
    """
    Extracts the content inside <reasoning>...</reasoning> tags.
    Returns reasoning text.
    """
    match = re.search(r"<reasoning>\s*(.*?)\s*</reasoning>", response, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""

In [10]:
def grouping_correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]["content"] for completion in completions]
    extracted_answers = [extract_xml_answer(r) for r in responses]

    rewards = []
    for pred_groups in extracted_answers:
        try:
            pred_sets = [set(g) for g in parse_groups(pred_groups)]
            gt_sets = answer
            print(pred_sets, 'xxxxx', gt_sets)
            matched = 0
            for ps in pred_sets:
                if ps in gt_sets:
                    matched += 1
            rewards.append(0.25 * matched)
        except Exception:
            rewards.append(0.0)
    return rewards

In [11]:
def reasoning_quality_reward_func(completions, **kwargs) -> list[float]:
    keyword_patterns = [
        r"because", r"related to", r"belong to the same category", r"all are types of",
        r"they represent", r"share a common theme"
    ]
    responses = [completion[0]["content"] for completion in completions]
    reasoning_blocks = [extract_reasoning_block(r) for r in responses]

    rewards = []
    for reasoning in reasoning_blocks:
        score = sum(1 for pattern in keyword_patterns if re.search(pattern, reasoning, re.IGNORECASE))
        rewards.append(min(score / len(keyword_patterns), 1.0))  # Normalize 0–1
    return rewards

In [12]:
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

In [13]:
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 8 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-1.5B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    random_state = 3407,
)

==((====))==  Unsloth 2025.3.15: Fast Qwen2 patching. Transformers: 4.48.3. vLLM: 0.7.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit with actual GPU utilization = 49.53%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 6.09 GB. Also swap space = 2 GB.
INFO 03-19 00:17:00 config.py:542] This model supports multiple tasks: {'score', 'reward', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config usin

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

INFO 03-19 00:17:07 cuda.py:179] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 03-19 00:17:07 cuda.py:227] Using XFormers backend.
INFO 03-19 00:17:08 model_runner.py:1110] Starting to load model unsloth/qwen2.5-1.5b-instruct-unsloth-bnb-4bit...
INFO 03-19 00:17:08 loader.py:1102] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 03-19 00:17:10 weight_utils.py:252] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-19 00:17:33 model_runner.py:1115] Loading model weights took 1.4331 GB
INFO 03-19 00:17:33 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-19 00:17:41 worker.py:267] Memory profiling takes 8.17 seconds
INFO 03-19 00:17:41 worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.50) = 7.30GiB
INFO 03-19 00:17:41 worker.py:267] model weights take 1.43GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.05GiB; the rest of the memory reserved for KV Cache is 4.79GiB.
INFO 03-19 00:17:42 executor_base.py:110] # CUDA blocks: 11221, # CPU blocks: 4681
INFO 03-19 00:17:42 executor_base.py:115] Maximum concurrency for 1024 tokens per request: 175.33x
INFO 03-19 00:17:44 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:40<00:00,  1.51s/it]

INFO 03-19 00:18:25 model_runner.py:1562] Graph capturing finished in 41 secs, took 0.47 GiB
INFO 03-19 00:18:25 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 52.66 seconds





tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.3.15 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [14]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 16, # Increase to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = 128,
    max_completion_length = 64,
    num_train_epochs = 1, # Set to 1 for a full training run
    save_steps = 10,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


In [15]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        strict_format_reward_func,
        reasoning_quality_reward_func,
        grouping_correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 580 | Num Epochs = 1 | Total steps = 36
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 16 x 1) = 64
 "-____-"     Trainable parameters = 9,232,384/5,000,000,000 (0.18% trained)


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / strict_format_reward_func,rewards / reasoning_quality_reward_func,rewards / grouping_correctness_reward_func
1,0.0,-0.002875,0.00575,60.109375,0.0,-0.002875,0.0,0.0,0.0
2,0.0,0.001953,0.003906,58.578125,0.0,0.001953,0.0,0.0,0.0
3,0.0,-0.004109,0.014031,63.15625,1.5e-05,-0.004109,0.0,0.0,0.0
4,0.0,0.003906,0.007812,63.03125,1.6e-05,0.003906,0.0,0.0,0.0
5,0.0,0.002141,0.004281,62.265625,1.5e-05,0.002141,0.0,0.0,0.0
6,0.0,0.005047,0.012406,61.203125,1.7e-05,0.005047,0.0,0.0,0.0
7,0.0,-0.000531,0.016687,61.125,1.6e-05,-0.000531,0.0,0.0,0.0
8,0.0,-0.003141,0.006281,59.0625,1.7e-05,-0.003141,0.0,0.0,0.0
9,0.0,-0.001047,0.005594,59.8125,1.8e-05,-0.001047,0.0,0.0,0.0
10,0.0,0.004375,0.019625,61.328125,1.9e-05,0.004375,0.0,0.0,0.0


Unsloth: Will smartly offload gradients to save VRAM!


KeyboardInterrupt: 