In [1]:
%%capture
# Install the inference/training stack (unsloth, vllm), a pinned Triton build,
# and pynvml. %%capture hides pip's output, including any errors.
# NOTE(review): only triton is version-pinned; unsloth, vllm and pynvml resolve
# to whatever is newest at run time, so a fresh run may not be reproducible.
!pip install unsloth vllm  
!pip install triton==3.1.0  
!pip install -U pynvml

In [1]:
# Upgrade pip, remove the preinstalled torch stack plus vllm/unsloth, install a
# pinned torch 2.1.0 (CUDA 11.8 wheel index), then reinstall unsloth and vllm.
# NOTE(review): the last line is unpinned and may replace the torch build pinned
# just above it -- the Unsloth banner printed later in this notebook reports
# Torch 2.5.1+cu121. Confirm which torch version is actually intended.
!pip install --upgrade pip
!pip uninstall -y torch torchvision torchaudio vllm unsloth
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
!pip install unsloth vllm

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
[0mLooking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.1.0
  Downloading https://downloa

In [4]:
import re
import os
import json
from datasets import load_dataset, Dataset
from unsloth import FastLanguageModel
import torch
from trl import GRPOConfig, GRPOTrainer
from vllm import SamplingParams
import wandb  # Import Weights & Biases
from tqdm import tqdm  # Progress bar


# ------------------------------ Configuration ------------------------------
# Authenticate with Weights & Biases WITHOUT embedding the API key in the
# notebook. Export WANDB_API_KEY (e.g. from the platform's secret store) before
# running; when it is unset, key=None lets wandb.login() fall back to its own
# credential lookup.
# SECURITY: a literal API key was previously committed on this line. Treat that
# key as compromised and revoke it in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
WANDB_PROJECT = "QwenMATH-GRPO"  # Replace with your W&B project name
MODEL_NAME = "Qwen/Qwen2.5-Math-1.5B-Instruct"
OUTPUT_DIR = "/kaggle/working/"

MAX_SEQ_LENGTH = 1024  # Prompt + completion token budget
LORA_RANK = 32
MAX_PROMPT_LENGTH = 256  # Completion budget is MAX_SEQ_LENGTH - MAX_PROMPT_LENGTH
NUM_GENERATIONS = 6  # Completions sampled per prompt by the GRPO trainer
# NOTE(review): the training log in this notebook shows Unsloth raising the
# per-device batch size because it expects a multiple of NUM_GENERATIONS.
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
MAX_STEPS = 500  # Adjusted to a more realistic number
SAVE_STEPS = 500  # Equal to MAX_STEPS, so no checkpoint is written before the last step
LEARNING_RATE = 5e-6

SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# ------------------------------ Data Preparation ------------------------------


# Template that renders a reasoning/answer pair in the same tag layout the
# system prompt asks for; fill it with
# XML_COT_FORMAT.format(reasoning=..., answer=...).
# NOTE(review): not referenced by any other code visible in this notebook.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    """Return the stripped text following the last ``<answer>`` tag.

    The result is cut at the first ``</answer>`` that comes after that tag.
    Missing tags are tolerated: without an opening tag the whole input is
    considered, and without a closing tag everything up to the end is kept.
    """
    # Everything after the final opening tag (or the full text if absent).
    _, _, after_open = text.rpartition("<answer>")
    # Everything before the first closing tag (or the full remainder if absent).
    inner, _, _ = after_open.partition("</answer>")
    return inner.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()


def get_gsm8k_questions(split="train") -> Dataset:
    """Load one split of ``openai/gsm8k`` ("main" config) as chat-style examples.

    Each row gains a ``prompt`` column (system message followed by the question
    as the user message) and has its ``answer`` column replaced by the text
    after the ``####`` marker (``None`` when the marker is missing).
    """

    def to_chat_example(row):
        # Build the two-message conversation and pull out the final answer.
        return {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": row["question"]},
            ],
            "answer": extract_hash_answer(row["answer"]),
        }

    gsm8k = load_dataset("openai/gsm8k", "main")  # type: ignore
    return gsm8k[split].map(to_chat_example)  # type: ignore


# ------------------------------ Reward Functions ------------------------------


def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Give 2.0 to each completion whose extracted answer equals its reference.

    The comparison is an exact string match between the text extracted from the
    ``<answer>`` tags and the corresponding entry of ``answer``; anything else
    scores 0.0. The first example of the batch is printed for inspection.
    """
    responses = []
    extracted_responses = []
    for completion in completions:
        content = completion[0]["content"]
        responses.append(content)
        extracted_responses.append(extract_xml_answer(content))

    # The question is the last message of the first prompt in the batch.
    question = prompts[0][-1]["content"]
    print(
        "-" * 20,
        f"Question:\n{question}",
        f"\nAnswer:\n{answer[0]}",
        f"\nResponse:\n{responses[0]}",
        f"\nExtracted:\n{extracted_responses[0]}",
    )

    rewards = []
    for guess, reference in zip(extracted_responses, answer):
        rewards.append(2.0 if guess == reference else 0.0)
    return rewards


def int_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to each completion whose extracted answer is all digits.

    Uses ``str.isdigit`` on the text extracted from the ``<answer>`` tags, so
    signs, decimal points, or an empty answer score 0.0.
    """
    rewards = []
    for completion in completions:
        candidate = extract_xml_answer(completion[0]["content"])
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards


def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to completions that follow the exact tag layout, else 0.0.

    The whole completion must be a ``<reasoning>`` block followed by an
    ``<answer>`` block, with every tag on its own line and a newline after
    the closing ``</answer>``.

    Args:
        completions: One entry per completion; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Ignored; accepted so the trainer can pass extra columns.

    Returns:
        One reward per completion, in input order.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL is required: without it "." stops at newlines, so any
    # reasoning or answer spanning more than one line could never match.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]


def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to completions that start with the expected tag pair, else 0.0.

    The completion must begin with a ``<reasoning>...</reasoning>`` block,
    followed (after optional whitespace) by ``<answer>...</answer>``. Text
    after the closing answer tag is allowed.

    Args:
        completions: One entry per completion; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Ignored; accepted so the trainer can pass extra columns.

    Returns:
        One reward per completion, in input order.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL is required: without it "." stops at newlines, so content
    # placed on separate lines from the tags could never match.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to the text of every completion, in input order."""
    rewards = []
    for completion in completions:
        rewards.append(count_xml(completion[0]["content"]))
    return rewards


# ------------------------------ Model Initialization ------------------------------


def initialize_model():
    """Load ``MODEL_NAME`` through Unsloth and wrap it with LoRA adapters.

    The base model is requested in 4-bit with ``fast_inference=True`` and a
    ``gpu_memory_utilization`` of 0.6. LoRA uses rank and alpha ``LORA_RANK``
    on the attention and MLP projection modules listed below.

    Returns:
        The ``(model, tokenizer)`` pair, with the model already LoRA-wrapped.
    """
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        fast_inference=True,
        max_lora_rank=LORA_RANK,
        gpu_memory_utilization=0.6,
    )

    # Attention projections first, then the MLP projections.
    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
    lora_targets += ["gate_proj", "up_proj", "down_proj"]

    peft_model = FastLanguageModel.get_peft_model(
        base_model,
        r=LORA_RANK,
        target_modules=lora_targets,
        lora_alpha=LORA_RANK,
        use_gradient_checkpointing="unsloth",
        random_state=3407,
    )
    return peft_model, tokenizer


# ------------------------------ Training Function ------------------------------


def train(model, tokenizer, train_dataset, reward_functions):
    """Run GRPO training and return the trainer.

    Args:
        model: Model to optimise.
        tokenizer: Passed to the trainer as ``processing_class``.
        train_dataset: Dataset handed to the trainer unchanged.
        reward_functions: Reward callables passed as ``reward_funcs``.

    Returns:
        The ``GRPOTrainer`` instance after ``train()`` has completed.
    """
    grpo_config = GRPOConfig(
        # Optimiser and learning-rate schedule.
        optim="paged_adamw_8bit",
        learning_rate=LEARNING_RATE,
        adam_beta1=0.9,
        adam_beta2=0.99,
        weight_decay=0.1,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        max_grad_norm=0.3,  # Gradient-clipping threshold
        # Batching and generation budget.
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        num_generations=NUM_GENERATIONS,
        max_prompt_length=MAX_PROMPT_LENGTH,
        # Whatever the prompt does not use of the sequence budget.
        max_completion_length=MAX_SEQ_LENGTH - MAX_PROMPT_LENGTH,
        # Run length, checkpointing and reporting.
        max_steps=MAX_STEPS,
        save_steps=SAVE_STEPS,
        logging_steps=1,
        report_to="wandb",
        output_dir=OUTPUT_DIR,
    )

    grpo_trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=reward_functions,
        args=grpo_config,
        train_dataset=train_dataset,
    )
    grpo_trainer.train()
    return grpo_trainer


# ------------------------------ Testing Function ------------------------------


def extract_answer(text: str) -> str | None:
    """
    Extracts the final numerical answer from a response.
    It tries XML extraction first, then falls back to a regex-based approach.
    """
    # Try XML extraction
    xml_match = re.search(r"<answer>\s*(.*?)\s*</answer>", text, re.DOTALL)
    if xml_match:
        return xml_match.group(1).strip()

    # If XML extraction fails, try regex for a number at the end
    regex_match = re.search(r"(\d+)$", text)
    if regex_match:
        return regex_match.group(1)  # Return the captured number

    return None  # No answer found


def test(model, tokenizer, test_dataset: Dataset, results_file="test_results.jsonl", batch_size=4, lora_request=None):
    """
    Evaluates the model's performance, saves results to a JSON Lines file,
    and logs metrics to Weights & Biases.

    Args:
        model: Model exposing ``eval()``, ``train()`` and ``fast_generate()``.
        tokenizer: Tokenizer whose chat template renders each prompt.
        test_dataset: Dataset with a ``prompt`` column (list of chat messages)
            and an ``answer`` column holding the reference answer.
        results_file: Path of the JSON Lines file to write (overwritten).
        batch_size: Number of prompts generated per ``fast_generate`` call.
        lora_request: Forwarded unchanged to ``model.fast_generate``. The
            default ``None`` keeps the previous behaviour.
            NOTE(review): with ``None`` generation presumably runs without the
            trained LoRA adapter; pass an adapter request to score fine-tuned
            weights -- confirm against the Unsloth documentation.

    Returns:
        None. Accuracy is printed and logged as ``test/accuracy``; per-example
        results are logged once as the ``test/examples`` table.
    """
    model.eval()
    total_correct = 0
    total_samples = 0

    sampling_params = SamplingParams(
        temperature=0.7, top_p=0.95, max_tokens=MAX_SEQ_LENGTH - MAX_PROMPT_LENGTH
    )

    # One table for all examples instead of one wandb.log call (and five new
    # metric names) per example, which floods the run with unique keys.
    examples_table = wandb.Table(
        columns=["prompt", "prediction", "extracted_prediction", "answer", "is_correct"]
    )

    try:
        with torch.no_grad(), open(results_file, "w", encoding="utf-8") as outfile:
            for start in tqdm(range(0, len(test_dataset), batch_size), desc="Testing"):
                batch = test_dataset[start : start + batch_size]
                prompts = batch["prompt"]
                answers = batch["answer"]

                # Render each chat prompt to plain text (not token ids) with
                # the generation prompt appended.
                rendered_prompts = [
                    tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
                    for prompt in prompts
                ]

                # Generate one completion per prompt for the whole batch.
                outputs = model.fast_generate(
                    rendered_prompts, sampling_params=sampling_params, lora_request=lora_request
                )
                predictions = [output.outputs[0].text for output in outputs]

                for prompt, pred, ans in zip(prompts, predictions, answers):
                    question = prompt[-1]["content"]  # The user message is last
                    extracted_prediction = extract_answer(pred)
                    is_correct = extracted_prediction == ans

                    examples_table.add_data(question, pred, extracted_prediction, ans, is_correct)

                    if is_correct:
                        total_correct += 1
                    total_samples += 1

                    # Save the result to the JSON Lines file
                    result = {
                        "prompt": question,
                        "answer": ans,
                        "response": pred,
                        "extracted_answer": extracted_prediction,
                        "correct": is_correct,
                    }
                    outfile.write(json.dumps(result, ensure_ascii=False) + "\n")

        if total_samples == 0:
            # Avoid dividing by zero on an empty dataset.
            print("No test examples were evaluated; skipping accuracy computation.")
            return

        accuracy = total_correct / total_samples
        print(f"Accuracy on test dataset: {accuracy:.4f}")

        wandb.log({"test/examples": examples_table, "test/accuracy": accuracy})
        print(f"Test results saved to {results_file}")
    finally:
        # Always hand the model back in training mode, even if evaluation fails.
        model.train()

# ------------------------------ Main Function ------------------------------


def main(test_dataset_path=None):
    """
    Main function to orchestrate the loading of the model, preparing the training
    dataset, running GRPO training, saving the result, and optionally evaluating.

    Args:
        test_dataset_path: Optional path passed to the ``json`` dataset loader.
            When falsy, evaluation is skipped.
            NOTE(review): the file presumably needs ``prompt`` and ``answer``
            columns in the shape ``test`` reads -- confirm before use.
    """
    # Initialize Weights & Biases (wandb) for experiment tracking
    wandb.init(project=WANDB_PROJECT, job_type="training")

    try:
        # Log the essential configurations at the start
        wandb.config.update(
            {
                "model_name": MODEL_NAME,
                "max_seq_length": MAX_SEQ_LENGTH,
                "lora_rank": LORA_RANK,
                "max_prompt_length": MAX_PROMPT_LENGTH,
                "num_generations": NUM_GENERATIONS,
                "batch_size": BATCH_SIZE,
                "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
                "max_steps": MAX_STEPS,
                "learning_rate": LEARNING_RATE,
                "system_prompt": SYSTEM_PROMPT,
            }
        )

        # Initialize the model and tokenizer
        model, tokenizer = initialize_model()

        # Load and prepare the training dataset
        train_dataset = get_gsm8k_questions()

        # Load the testing dataset only if a path was provided. (The GSM8K test
        # split used to be loaded here and then immediately discarded.)
        if test_dataset_path:
            test_dataset = load_dataset("json", data_files=test_dataset_path)["train"]
        else:
            print("No test dataset path provided. Skipping testing.")
            test_dataset = None

        # Define the reward functions to guide the training
        reward_functions = [
            xmlcount_reward_func,
            soft_format_reward_func,
            strict_format_reward_func,
            int_reward_func,
            correctness_reward_func,
        ]
        # Train the model
        train(model, tokenizer, train_dataset, reward_functions)

        # Save the trained model and tokenizer locally
        model_output_path = os.path.join(OUTPUT_DIR, "QWEN-Math-GRPO")
        model.save_pretrained(model_output_path, push_to_hub=False)
        tokenizer.save_pretrained(model_output_path, push_to_hub=False)
        print(f"Main model saved to {model_output_path}")

        # Upload the saved directory to wandb as a model artifact
        artifact = wandb.Artifact("trained-model", type="model")
        artifact.add_dir(model_output_path)
        wandb.log_artifact(artifact)
        print(f"Main model and tokenizer saved to {model_output_path} and uploaded to wandb")

        # Perform testing if a test dataset is available
        if test_dataset:
            test(
                model,
                tokenizer,
                test_dataset,
                results_file=os.path.join(OUTPUT_DIR, "test_results.jsonl"),
            )
    except BaseException:
        # Close the run as failed so it is not left open (or shown as
        # successful) when training, saving, or evaluation raises.
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()

# ------------------------------ Run the Main Function ------------------------------


# if __name__ == "__main__":
#     # Example usage of the main function, you can pass the path to your test dataset
#     # Ensure the test_dataset_path is correctly pointing to your test dataset
#     # or leave it as None if you don't have a test dataset available.
#     test_dataset_path = "../Dataset/GSM8k/modified/main_test.csv"  # Add your test dataset path here
#     main(test_dataset_path=test_dataset_path)



In [None]:
# Run the full pipeline. No test dataset path is passed, so main() prints
# "No test dataset path provided. Skipping testing." and does not evaluate.
# test_dataset_path = "../Dataset/GSM8k/modified/main_test.csv"  # Add your test dataset path here
main()

==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-math-1.5b-instruct-bnb-4bit with actual GPU utilization = 22.35%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 160.
Unsloth: vLLM's KV Cache can use up to 2.03 GB. Also swap space = 4 GB.
INFO 03-08 22:11:21 config.py:549] This model supports multiple tasks: {'reward', 'generate', 'score', 'classify', 'embed'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kw

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-08 22:11:25 model_runner.py:1115] Loading model weights took 1.0677 GB
INFO 03-08 22:11:27 worker.py:267] Memory profiling takes 1.13 seconds
INFO 03-08 22:11:27 worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.22) = 3.30GiB
INFO 03-08 22:11:27 worker.py:267] model weights take 1.07GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.87GiB; the rest of the memory reserved for KV Cache is 1.36GiB.
INFO 03-08 22:11:27 executor_base.py:111] # cuda blocks: 3187, # CPU blocks: 9362
INFO 03-08 22:11:27 executor_base.py:116] Maximum concurrency for 1024 tokens per request: 49.80x
INFO 03-08 22:11:33 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████| 23/23 [00:37<00:00,  1.62s/it]

INFO 03-08 22:12:10 model_runner.py:1562] Graph capturing finished in 37 secs, took 0.41 GiB
INFO 03-08 22:12:10 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 44.89 seconds





No test dataset path provided. Skipping testing.
Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 6


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 12 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (12 x 4 x 1) = 48
 "-____-"     Trainable parameters = 36,929,536/925,545,984 (3.99% trained)


-------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
To determine how much Mr.毕业生 paid for the concert tickets, we need to follow these steps:

1. **Calculate the base cost without any discounts:**
   \[
   \text{Base cost} = 12 \times 40 = 480 \text{ dollars}
   \]

2. **Determine the number of tickets that receive a discount:**
   Mr. enim bought 12 tickets, and the first 10 tickets do not receive a discount. Therefore, the number of tickets that receive a discount is:
   \[
   \text{Discount tickets} = 12 - 10 = 2
   \]

3. **Calculate the discount amount per ticket:**
   The discount rate is 5%. Therefore, the discount for each of these 2 tickets is:
   \[
   \text{Discount per ticket} = 40 \times \frac{5}{100} = 40 \times 0.05 = 2 \text{ dollars}
   \]

4. **Calculate the total discount for all discounted tickets:**
   \

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / soft_format_reward_func,rewards / strict_format_reward_func,rewards / int_reward_func,rewards / correctness_reward_func
1,0.0,0.0,0.0,420.208344,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,331.270844,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,321.187515,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,312.333344,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,325.083344,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,399.666687,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,308.979179,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,316.041672,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,363.27084,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,337.083344,0.0,0.0,0.0,0.0,0.0,0.0


-------------------- Question:
Matt can make a batch of a dozen cookies using 2 pounds of flour.  He uses 4 bags of flour each weighing 5 pounds.  If Jim eats 15 cookies how many cookies are left? 
Answer:
105 
Response:
To solve this problem, let's break it down step by step.

1. First, we need to determine how many cookies Matt can make with the flour he has.
2. Matt can make 12 cookies per batch.
3. Each batch uses 2 pounds of flour.
4. Matt has 4 bags of flour, each weighing 5 pounds. Therefore, he has a total of \(4 \times 5 = 20\) pounds of flour.
5. Since each batch uses 2 pounds of flour, Matt can make \( \frac{20}{2} = 10 \) batches.
6. Since each batch makes 12 cookies, Matt can make \( 10 \times 12 = 120 \) cookies.
7. Jim eats 15 cookies, so the remaining number of cookies is \( 120 - 15 = 105 \).

Let's confirm this with Python code.
```python
# Constants
cookies_per_batch = 12
flour_per_batch = 2
total_bags_of_flour = 4
weight_per_bag = 5
jim_eats = 15

# Total flour Matt

In [8]:
!nvcc --version #Check the CUDA version

# Install the `cuda-python` package from PyPI (unpinned).
# NOTE(review): this is presumably NVIDIA's Python bindings rather than a CUDA
# toolkit, so it likely does not change the CUDA version reported above -- confirm.
!pip install cuda-python

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Found existing installation: torch 2.5.1
Uninstalling torch-2.5.1:
  Successfully uninstalled torch-2.5.1
Found existing installation: torchvision 0.20.1
Uninstalling torchvision-0.20.1:
  Successfully uninstalled torchvision-0.20.1
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
Found existing installation: vllm 0.7.3
Uninstalling vllm-0.7.3:
  Successfully uninstalled vllm-0.7.3
Found existing installation: unsloth 2025.3.9
Un

In [12]:
# Prepend the CUDA library directory to LD_LIBRARY_PATH and echo the result.
# NOTE(review): changing os.environ affects child processes (such as the `!echo`
# below); libraries already loaded by this kernel are presumably unaffected, so
# a kernel restart with the variable set beforehand may be needed -- confirm.
import os
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')
!echo $LD_LIBRARY_PATH

/usr/local/cuda/lib64:/usr/local/lib/python3.10/dist-packages/cv2/../../lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
