# 1. SFT Finetune
## 1.1 Prepare model and tokenizer

In [None]:
%%capture
!pip install pandas
!pip install numpy
!pip install datasets
!pip install levenshtein
!pip install transfomers
!pip install Levenshtein 
!pip install tqdm

In [None]:
%%capture 
# Forget any already-imported PIL / google modules (presumably so they are re-imported
# from the freshly installed packages below — confirm).
import sys; modules = list(sys.modules.keys())
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
# Replace whatever Pillow is installed with a pinned version.
!pip uninstall Pillow -y
!pip install Pillow==11.1.0
!pip install unsloth vllm
# !pip install --upgrade pillow
# If you are running this notebook locally, you need to install `diffusers` too
!pip install diffusers
# Temporarily install TRL from a specific commit (pinned by hash)
!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b

In [None]:
# !pip install --upgrade --no-deps --no-cache-dir unsloth unsloth_zoo

In [None]:
# Apply Unsloth's "GRPO" patch to FastLanguageModel. This runs before the model is loaded
# and before any trainer is constructed further down.
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

In [None]:
import random, os
import numpy as np
from transformers import set_seed
# from trl import GRPOConfig, GRPOTrainer

# Seed Python's `random` module and NumPy's global RNG with the same value.
SEED = 40
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes — confirm this is the intent.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
# set_seed(SEED)
# NOTE: with set_seed() commented out, this cell does not seed torch.
# DEBUG is not referenced anywhere else in the visible notebook.
DEBUG = False

In [None]:
# cell 1
# Load the base model and its tokenizer through Unsloth. `max_seq_length` and
# `lora_rank` are reused by later cells (LoRA setup and the SFT trainer).

import torch
max_seq_length = 4096 # Can increase for longer reasoning traces
lora_rank = 8 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit",#"casperhansen/deepseek-r1-distill-qwen-7b-awq"
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank, # Same rank that get_peft_model() is given below
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)


<font size="4">
Let's add LoRA adapters, which is a way to finetune only about 1% of the model's parameters.
</font>

In [None]:
# Rebind `model` to the PEFT-wrapped version with LoRA adapters on the listed
# projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank, # alpha is set equal to the rank
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407, # Same seed value the trainers and dataset splits use below
)

## 1.2 Prepare data for sft finetuning

<font size="4">
    
The data for finetuning is based on the [Math problems IMO](https://www.kaggle.com/datasets/artemgoncarov/math-problems-imo) dataset, which contains 100,000 pairs of mathematical problems and solutions; the cell below loads `AI-MO/NuminaMath-CoT` from the Hugging Face Hub. To finetune the model (DeepSeek-R1-Distill-Qwen-7B in this notebook), the dataset is converted to ShareGPT style, then to Hugging Face's normal multi-turn format, and finally rendered into multi-turn conversation text.

</font>

In [None]:
from datasets import load_dataset
import pandas as pd

# Download NuminaMath-CoT and materialise both splits as pandas frames.
ds = load_dataset("AI-MO/NuminaMath-CoT")
train = ds["train"].to_pandas()
test = ds["test"].to_pandas()

# Stack train + test into one frame with a fresh 0..n-1 index, then discard the
# pre-built chat messages column.
df = pd.concat([train, test], axis=0).reset_index(drop=True)
df.drop(columns=["messages"], inplace=True)

# Token length of every reference solution under the loaded tokenizer.
df["solution_token2_count"] = df["solution"].apply(
    lambda solution_text: len(tokenizer.encode(solution_text))
)

In [None]:
#cell 2
import re
import pandas as pd
from datasets import Dataset

# Function to extract the final answer from \boxed{}
def extract_final_answer(solution):
    r"""Return the text inside the first ``\boxed{...}`` found in ``solution``.

    Braces are matched by nesting depth, so LaTeX with inner braces such as
    ``\boxed{\frac{1}{2}}`` yields ``\frac{1}{2}`` (a non-greedy regex would stop
    at the first ``}`` and return the truncated ``\frac{1``).

    Args:
        solution: Solution text to search. Non-string values (e.g. None/NaN) are
            treated as "no answer".

    Returns:
        The boxed content as a string (possibly empty for ``\boxed{}``), or None
        when there is no ``\boxed{`` or its braces are never closed.
    """
    if not isinstance(solution, str):
        return None

    marker = "\\boxed{"
    start = solution.find(marker)
    if start == -1:
        return None

    begin = start + len(marker)
    depth = 1  # We are already inside the opening brace of \boxed{
    for pos in range(begin, len(solution)):
        char = solution[pos]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return solution[begin:pos]
    return None  # Unbalanced braces: no reliable answer

# Function to format the solution with <think> tags and \boxed{}
def format_solution(solution):
    """Wrap a reference solution in <think> tags and restate its boxed answer.

    Args:
        solution: Raw solution text, expected to contain a boxed final answer.

    Returns:
        The solution wrapped in <think>...</think>, followed by the boxed final
        answer on its own line. When no answer can be extracted, a warning is
        printed and only the <think> block is returned. The previous fallback
        appended the literal placeholder text "final_answer" inside a box, which
        would have been used as a training target.
    """
    final_answer = extract_final_answer(solution)
    reasoning_block = f"<think>\n{solution}\n</think>"
    if final_answer:
        return f"{reasoning_block}\n\\boxed{{{final_answer}}}"

    print(f"Warning: No valid \\boxed{{}} found in solution: {solution}")
    return reasoning_block

# System prompt to guide the model.
# The backslash is escaped on purpose: in a normal (non-raw) string literal "\b" is the
# backspace character, so an unescaped "\boxed" would reach the model corrupted.
SYSTEM_PROMPT = """You are a math problem solver. When given a problem, follow these steps:
1. Think through the problem step-by-step. Write your reasoning enclosed within <think> and </think> tags.
2. Provide the final answer inside \\boxed{...}.
"""


# Apply formatting to the solutions
df['solution'] = df['solution'].apply(format_solution)

# Format DataFrame into ShareGPT style.
# Every turn uses the same ("from", "value") key pair. The system turn previously used
# "content" instead of "value", which gave the conversation records an inconsistent
# schema (three keys, with missing values) once converted to a Dataset.
df['conversations'] = df.apply(
    lambda row: [
        {"from": "system", "value": SYSTEM_PROMPT},   # System prompt
        {"from": "human", "value": row["problem"]},   # User input (problem)
        {"from": "gpt", "value": row["solution"]}     # Model response (solution)
    ], axis=1
)

# Create a Dataset from the formatted Pandas DataFrame (only the conversations column)
dataset = Dataset.from_pandas(df[['conversations']])
del df  # Free up memory

# Verify the dataset
print(dataset)

<font size="4">
    
Unsloth has a `get_chat_template` function to get the correct chat template. The code below applies the `qwen2.5` template to the tokenizer and defines a function that renders each conversation with it.

</font>

In [None]:
#cell 3
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the result of get_chat_template() with the "qwen2.5" template.
# Every later apply_chat_template() call in the notebook goes through this tokenizer.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen2.5",
)

def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template text.

    Args:
        examples: Batched mapping with a "conversations" entry holding one
            conversation per example.

    Returns:
        A dict with a single "text" key: one rendered string per conversation,
        produced by the notebook-level tokenizer without tokenizing and without
        a trailing generation prompt.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation, tokenize = False, add_generation_prompt = False
            )
        )
    return {"text": rendered}

In [None]:
#cell 4
# Convert ShareGPT style to Hugging Face's normal multi-turn format
from unsloth.chat_templates import standardize_sharegpt

dataset = standardize_sharegpt(dataset)
# Add a "text" column: each conversation rendered through formatting_prompts_func,
# applied in batches.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

## 1.3 SFT finetuning

<font size="4">
    
To train the model, let's use Hugging Face's SFTTrainer. If notebook time limits are a concern, you can cap training with **max_steps**. For a full training run, set **num_train_epochs** and leave **max_steps** unset, as in the cell below.

</font>

In [None]:
#cell 5
# Hold out 1% of the rendered dataset for evaluation and train on the remaining 99%.
from unsloth import is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq

train_test_split = dataset.train_test_split(test_size=0.01, seed=3407)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # Train on the train split only. Passing the full `dataset` here (as before) also
    # trained on the held-out rows, so the eval loss was measured on seen data.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    # data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),  # Uncomment if needed
    dataset_num_proc=6,
    packing=True,  # Can make training 5x faster for short sequences.

    args=TrainingArguments(
        per_device_train_batch_size = 64,
        gradient_accumulation_steps=2,    # 64 x 2 = 128 sequences per optimizer step per device
        warmup_ratio = 0.05,
        num_train_epochs=2,               # Two full passes over the train split
        # max_steps=max_steps,
        learning_rate=1e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps= 100,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        evaluation_strategy="steps",      # Evaluate during training
        eval_steps=1000,                  # Evaluate every 1000 steps
        save_strategy="steps",
        save_steps=2000,
        greater_is_better=False,          # Lower eval loss is better
        save_total_limit=2,               # Keep only 2 checkpoints
        output_dir="outputs/sft",
        report_to="none",                 # No external experiment logger
        run_name="sft-finetuning",        # Name for the run
    ),
)
trainer_stats = trainer.train()


In [None]:
import json
import os

# Store the metrics next to the SFT checkpoints (the trainer's output_dir is the
# relative path "outputs/sft"). The previous absolute path "/outputs/sft/..." pointed
# at the filesystem root, a different directory from the one the trainer writes to.
sft_results_path = os.path.join("outputs", "sft", "training_results_sft.json")
os.makedirs(os.path.dirname(sft_results_path), exist_ok=True)
with open(sft_results_path, "w") as f:
    json.dump(trainer_stats.metrics, f, indent=4)

In [None]:
#cell 6
# Export the SFT model together with the tokenizer using the "merged_16bit" save method
# (presumably LoRA weights merged into 16-bit base weights — see Unsloth docs).
model.save_pretrained_merged("outputs/sft/DeepSeek-R1-Distill-Qwen-7B-SFT", tokenizer, save_method = "merged_16bit")


In [None]:
# if False: model.push_to_hub_merged("zhaolizhang/DeepSeek-R1-Distill-Qwen25-7B-SFT", tokenizer, save_method = "merged_16bit", token = "")

In [None]:
# Save the model via save_pretrained() and the tokenizer into the SAME directory so the
# folder can be reloaded on its own. The tokenizer previously went to a separate
# "...-Lora-for" directory, unlike the GRPO save cell at the end of the notebook.
model.save_pretrained("outputs/sft/DeepSeek-R1-Distill-Qwen-7B-SFT-Lora")  # Local saving
tokenizer.save_pretrained("outputs/sft/DeepSeek-R1-Distill-Qwen-7B-SFT-Lora")

In [None]:
#cell 7
# Release GPU memory between the SFT and GRPO stages.
import gc
# NOTE(review): the model was loaded with load_in_4bit=True — confirm that .to("cpu")
# is supported for this quantized model in the installed library versions.
model.to("cpu")  # Move model to CPU (freeing GPU memory)
gc.collect()
torch.cuda.empty_cache()

# 2 GRPO finetune

## 2.1 prepare dataset

In [None]:
from datasets import load_dataset
import pandas as pd

# Download NuminaMath-TIR and materialise both splits as pandas frames.
ds = load_dataset("AI-MO/NuminaMath-TIR")
train = ds["train"].to_pandas()
test = ds["test"].to_pandas()

# Stack train + test into one frame with a fresh 0..n-1 index, then discard the
# pre-built chat messages column.
df = pd.concat([train, test], axis=0).reset_index(drop=True)
df.drop(columns=["messages"], inplace=True)

# Token length of every reference solution; the GRPO dataset builder filters on it.
df["solution_token2_count"] = df["solution"].apply(
    lambda solution_text: len(tokenizer.encode(solution_text))
)

In [None]:
import re
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
from Levenshtein import ratio as levenshtein_ratio

# Constants
# Completion budget (in tokens); shared by the dataset filter and GRPOConfig below.
max_completion_length = 1024

# System Prompt
# Backslashes are escaped on purpose: in a normal (non-raw) string literal "\b" is the
# backspace character, so an unescaped "\boxed" would reach the model corrupted.
SYSTEM_PROMPT = """You are a math problem solver. When given a problem, follow these steps:
1. Think through the problem step-by-step. Write your reasoning enclosed within <think> and </think> tags.
2. Provide the final answer inside \\boxed{...}.

You may use Python code or tools like Sympy to compute intermediate results. For expressions involving π or fractions, use Sympy’s pi and Rational.

Examples:
1. Problem: What is the sum of 123 and 456?
<think>
- Add 123 and 456.
- The sum is 579.
</think>
\\boxed{579}

2. Problem: What is 789 divided by 3?
<think>
- Divide 789 by 3.
- The result is 263.
</think>
\\boxed{263}
"""


# Dataset Preparation
def get_math_problems_questions(source_df=None) -> Dataset:
    """Build the GRPO prompt dataset from the problems/solutions frame.

    Args:
        source_df: Optional DataFrame with 'problem', 'solution' and
            'solution_token2_count' columns. Defaults to the notebook-level ``df``.

    Returns:
        A ``Dataset`` holding the filtered rows plus two new columns: 'answer'
        (the boxed ground truth extracted from the solution) and 'prompt' (the
        chat-template-rendered system + user turns ending in a generation prompt).
    """
    # Work on a separate local name. Assigning to `df` inside this function made `df`
    # a local variable, so its very first read raised UnboundLocalError.
    frame = df if source_df is None else source_df

    # Keep only solutions that fit the completion budget, with a 50-token margin.
    frame = frame[frame['solution_token2_count'] < (max_completion_length - 50)]
    frame = frame.reset_index(drop=True)

    # Extract the ground truth answer from the 'solution' field using the \boxed{...} pattern.
    frame['answer'] = frame['solution'].map(extract_final_answer)

    # Rows without an extractable answer cannot be scored by the correctness reward and
    # would otherwise be rewritten below with a literal "None" inside the box; drop them.
    frame = frame[frame['answer'].notna()].reset_index(drop=True)

    # Add <think> tags to the solutions
    frame['solution'] = frame['solution'].apply(
        lambda x: f"<think>\n{x}\n</think>\n\\boxed{{{extract_final_answer(x)}}}"
    )

    # Convert the DataFrame into a HuggingFace Dataset.
    dataset = Dataset.from_pandas(frame)

    # Create a prompt for each example.
    def create_prompt(sample):
        question = sample['problem']
        # NOTE(review): the user turn asks for the answer modulo 1000, while 'answer'
        # above is stored unreduced — confirm the two agree for this dataset.
        chat = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": question + " Return final answer within \\boxed{}, after taking modulo 1000."
            },
        ]

        sample['prompt'] = tokenizer.apply_chat_template(
            conversation=chat,
            tokenize=False,
            add_generation_prompt=True
        )
        return sample

    dataset = dataset.map(create_prompt)
    return dataset

# Load Dataset
dataset = get_math_problems_questions()

# Reward Functions
def format_reward_func(completions, **kwargs) -> list[float]:
    """
    Score the layout of each completion.

    A completion earns 1.0 when it starts with a <think>...</think> block that is
    followed somewhere by a \\boxed{...} answer; anything else earns 0.0.
    """
    layout = re.compile(r"^<think>.*?</think>.*?\\boxed{(.*?)}.*?$", re.DOTALL)
    return [1.0 if layout.match(text) else 0.0 for text in completions]

def correctness_reward_func(completions, answer, **kwargs) -> list[float]:
    """
    Extracts the final answer from each completion and compares it
    to the ground truth 'answer' from the dataset.

    Args:
        completions: Generated completion strings.
        answer: Ground-truth answers, aligned with ``completions``.

    Returns:
        One reward per completion:
            1.0 if the boxed answer matches the ground truth, either as an exact
                string (after stripping whitespace) or as equal integers,
            0.0 otherwise, including a missing/empty boxed answer or a missing
                ground truth.
    """
    rewards = []
    for content, gt in zip(completions, answer):
        # extract_final_answer() returns None when there is no \boxed{...}.
        extracted = (extract_final_answer(content) or "").strip()
        if not extracted or gt is None:
            # Score this completion 0.0 and keep going. Returning a bare float here
            # would break the one-reward-per-completion contract.
            rewards.append(0.0)
            continue

        reference = str(gt).strip()
        if extracted == reference:
            rewards.append(1.0)
            continue

        try:
            rewards.append(1.0 if int(extracted) == int(reference) else 0.0)
        except ValueError:
            # At least one side is not an integer literal and the strings differ.
            rewards.append(0.0)
    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """
    Checks if the extracted answer is composed entirely of digits.

    Returns:
        One reward per completion: 0.5 when the boxed answer is a non-empty string
        of digits, 0.0 otherwise (including when no boxed answer is present).
    """
    rewards = []
    for content in completions:
        # Uses the notebook's extract_final_answer(); the previously referenced
        # extract_boxed_text() is not defined anywhere in this notebook.
        extracted = extract_final_answer(content) or ""
        rewards.append(0.5 if extracted.isdigit() else 0.0)
    return rewards

def levenshtein_reward_func(completions, solution, **kwargs) -> list[float]:
    """
    Similarity between the text following the last </think> tag and the reference solution.

    Each completion that contains </think> is scored with levenshtein_ratio() on the
    text after its final </think>; completions without the tag score 0.0.
    """
    closing_tag = '</think>'
    scores = []
    for text, reference in zip(completions, solution):
        if closing_tag not in text:
            scores.append(0.0)
            continue
        tail = text.rpartition(closing_tag)[2]
        scores.append(levenshtein_ratio(tail, reference))
    return scores

def reasoning_quality_reward_func(completions, reference_solutions, **kwargs):
    """
    Similarity between each completion's reasoning and its reference solution.

    The reasoning is the text after the last <think> tag, cut at the first </think>
    that follows it. Completions without a <think> tag score 0.0; the others are
    scored with levenshtein_ratio().
    """
    opening_tag, closing_tag = '<think>', '</think>'
    scores = []
    for text, reference in zip(completions, reference_solutions):
        if opening_tag not in text:
            scores.append(0.0)
            continue
        after_open = text.rpartition(opening_tag)[2]
        thought = after_open.partition(closing_tag)[0]
        scores.append(levenshtein_ratio(thought, reference))
    return scores

def formatting_reward_func(completions, **kwargs):
    """Return 1.0 for every completion containing '\\boxed{' and 0.0 for the rest."""
    return [float('\\boxed{' in completion) for completion in completions]

# Composite Reward
def composite_reward_func(completions, answers=None, reference_solutions=None, *,
                          answer=None, solution=None, **kwargs):
    """Weighted blend of correctness, reasoning-quality and formatting rewards.

    GRPOTrainer calls reward functions with keyword arguments named after the dataset
    columns (here 'answer' and 'solution'). With only the required names
    ``answers`` / ``reference_solutions`` that call raised TypeError, so both
    spellings are accepted.

    Args:
        completions: Generated completion strings.
        answers: Ground-truth answers (legacy name; positional use still works).
        reference_solutions: Reference solutions (legacy name).
        answer: Ground-truth answers under the dataset column name.
        solution: Reference solutions under the dataset column name.
        **kwargs: Any other values the trainer passes (e.g. prompts); ignored.

    Returns:
        One float per completion: 0.5 * correctness + 0.3 * reasoning + 0.2 * formatting.

    Raises:
        TypeError: If ground-truth answers or reference solutions are not supplied
            under either name.
    """
    if answers is None:
        answers = answer
    if reference_solutions is None:
        reference_solutions = solution
    if answers is None or reference_solutions is None:
        raise TypeError(
            "composite_reward_func needs ground-truth answers ('answer'/'answers') "
            "and reference solutions ('solution'/'reference_solutions')"
        )

    correctness = correctness_reward_func(completions, answers)
    reasoning_quality = reasoning_quality_reward_func(completions, reference_solutions)
    formatting = formatting_reward_func(completions)
    rewards = [0.5 * c + 0.3 * r + 0.2 * f for c, r, f in zip(correctness, reasoning_quality, formatting)]
    return rewards

# Reward functions handed to the GRPO trainer (it receives list(reward_functions.values())).
# Only the composite reward is registered; the other reward functions defined in this
# cell are either called through it or not registered at all.
reward_functions = {
    'composite': composite_reward_func,
}

## 2.2 Finetuning USING GRPO




In [None]:
#cell 8
# Configure and run GRPO on the prompt dataset, using the registered reward functions.
from unsloth import is_bfloat16_supported
from trl import GRPOConfig, GRPOTrainer
# Move model back to GPU for GRPO
model.to("cuda")
training_args = GRPOConfig(
    use_vllm=True,
    learning_rate=2e-5,                  # Lower than the SFT stage (1e-4)
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,                    # Warm up over the first 10% of training steps
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    logging_steps= 50,
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    # NOTE(review): 64 is not a multiple of num_generations=3 — confirm the installed
    # trainer accepts (or silently adjusts) this combination.
    per_device_train_batch_size = 64,
    gradient_accumulation_steps = 2,       # 64 x 2 = 128 samples per optimizer step per device
    num_generations=3,                   # Assumed from output
    max_prompt_length=512,
    max_completion_length=max_completion_length,            # Same budget used to filter solutions (token count < budget - 50)
    num_train_epochs=2,
    save_steps=1000,
    max_grad_norm=1.0,
    # NOTE(review): no evaluation strategy is set in this config — confirm that
    # eval_steps / eval_dataset actually trigger evaluation.
    eval_steps=1000,
    report_to="none",  # No external experiment logger
    run_name="grpo-finetuning",  # Name for the run
    output_dir="outputs/grpo/outputs/grpo",
    save_total_limit=2,
)

# Hold out 5% of the prompts as the evaluation split.
train_test_split = dataset.train_test_split(test_size=0.05, seed=3407)
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=list(reward_functions.values()),  # Currently just the composite reward
    args=training_args,
    train_dataset=train_test_split["train"],
    eval_dataset=train_test_split["test"],
)

trainer_stats = trainer.train()

In [None]:
import json

# Persist the GRPO training metrics as JSON. The path is relative to the working
# directory; "outputs/grpo" must already exist (it is a parent of the trainer's output_dir).
with open("outputs/grpo/training_results_grpo.json", "w") as f:
    json.dump(trainer_stats.metrics, f, indent=4)

In [None]:
# Export the GRPO-tuned model together with the tokenizer using the "merged_16bit" save method.
model.save_pretrained_merged("outputs/grpo/DeepSeek-R1-Distill-Qwen-7B-SFT-GRPO", tokenizer, save_method = "merged_16bit")

In [None]:
# if False: model.push_to_hub_merged("zhaolizhang/DeepSeek-R1-Distill-Qwen25-7B-SFT-GRPO ", tokenizer, save_method = "merged_16bit", token = "")

In [None]:
# NOTE(review): this writes into the same directory as the merged export above —
# confirm the two saves are meant to share a folder.
model.save_pretrained("outputs/grpo/DeepSeek-R1-Distill-Qwen-7B-SFT-GRPO")  # Local saving
tokenizer.save_pretrained("outputs/grpo/DeepSeek-R1-Distill-Qwen-7B-SFT-GRPO")