In [None]:
! pip install torch transformers peft datasets trl pandas matplotlib bitsandbytes

In [None]:
import os
import torch
import matplotlib.pyplot as plt
import pandas as pd
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
from trl import GRPOConfig, GRPOTrainer

In [None]:
import gc
import types

import torch

# Names IPython itself places in the user namespace; deleting them breaks the session.
_IPYTHON_NAMES = frozenset({"In", "Out", "get_ipython", "exit", "quit", "open"})


def _is_imported(obj) -> bool:
    """Return True for modules and for classes/functions that were defined outside this notebook."""
    if isinstance(obj, types.ModuleType):
        return True
    if isinstance(obj, (type, types.FunctionType, types.BuiltinFunctionType)):
        return getattr(obj, "__module__", None) != "__main__"
    return False


def _clear_user_variables(namespace: dict) -> None:
    """Delete notebook-defined public names from `namespace`.

    Imported modules/classes/functions, underscore-prefixed names and IPython's
    own entries are kept, so the cells that follow can still run.
    """
    for name in list(namespace):  # snapshot: the dict is mutated while we walk it
        if name.startswith("_") or name in _IPYTHON_NAMES:
            continue
        if _is_imported(namespace[name]):
            continue
        del namespace[name]


are_you_sure = input("Execute if you want to clear everything! Are you sure? Please 'Y' to execute.")

if are_you_sure == "Y":
    try:
        # Drop the references first (model, tokenizer, trainer, datasets, ...);
        # memory can only be reclaimed once nothing points at the objects any more.
        _clear_user_variables(globals())
        print("🔄 Global variables deleted!")

        # Run garbage collector
        gc.collect()

        # Hand the now-unused cached blocks back to the GPU driver
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("🔄 GPU Memory Cleared!")
    except Exception as e:
        print(f"⚠️ Error while clearing memory: {str(e)}")
else:
    print("❌ Aborted!")

In [None]:
# ==========================
# DEFINE PARAMETERS (No Hardcoding)
# ==========================
# Tunable settings for the run, collected in one place and looked up by key in the cells below.
PARAMS = dict(
    # Hugging Face id of the pre-trained language model to fine-tune.
    model_name="Qwen/Qwen2.5-32B-Instruct",

    # Maximum sequence length for training; shorter values save memory but cut context.
    max_seq_length=128,

    # Rank of the low-rank LoRA matrices; larger is more expressive and uses more memory.
    lora_rank=8,

    # Step size of the optimizer; small values keep fine-tuning stable.
    learning_rate=1e-6,

    # Adam beta1: how strongly past gradients influence the current update.
    adam_beta1=0.9,

    # Adam beta2: how strongly past squared gradients influence the current update.
    adam_beta2=0.99,

    # Weight-decay regularization strength (penalizes large weights).
    weight_decay=0.1,

    # Fraction of the total steps spent ramping the learning rate up.
    warmup_ratio=0.1,

    # Shape of the learning-rate decay; "cosine" follows a cosine curve.
    lr_scheduler_type="cosine",

    # Optimizer name; "adamw_8bit" is an 8-bit AdamW variant that uses less memory.
    optimizer="adamw_8bit",

    # Log every N steps; 1 logs each step.
    logging_steps=1,

    # Training examples per batch; larger batches need more memory.
    batch_size=4,

    # Steps to accumulate before a weight update (larger effective batch, same memory).
    gradient_accumulation_steps=1,

    # Upper bound on the number of training steps.
    max_steps=80,

    # Save a checkpoint every N steps.
    save_steps=1,

    # Gradient-clipping threshold, guards against exploding gradients.
    max_grad_norm=0.1,

    # Directory that receives checkpoints and logs.
    output_dir="outputs",

    # Completions sampled per prompt during training; more means more diversity and memory.
    num_generations=2,
)

# Print Parameters for Transparency
print("Using the following parameters for training:")
print("\n".join(f"{name}: {setting}" for name, setting in PARAMS.items()))

In [None]:
%%time

# ==========================
# MODEL LOADING (Optimized)
# ==========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

from transformers import AutoConfig, BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
# No longer needed by this cell; left in place in case other cells rely on the name.
from accelerate import infer_auto_device_map

# Configure 4-bit quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True  # Allow layers that do not fit on the GPU to live on the CPU
)

# Enable Flash Attention 2 for lower VRAM usage
# NOTE(review): `use_flash_attention_2` is the legacy flag; recent transformers releases expect
# `attn_implementation="flash_attention_2"` on the model's `from_pretrained` — confirm this kwarg takes effect.
config = AutoConfig.from_pretrained(PARAMS["model_name"], use_flash_attention_2=True)

# Per-device memory budget used when the layers are distributed.
MAX_MEMORY = {0: "20GiB", "cpu": "16GiB"}

# Load the checkpoint ONCE. `device_map="auto"` together with `max_memory` lets
# from_pretrained compute the placement itself, so there is no need to load the
# full model a first time just to infer a device map and then load it again
# (which kept two copies of the weights alive during the second load).
model = AutoModelForCausalLM.from_pretrained(
    PARAMS["model_name"],
    config=config,
    quantization_config=bnb_config,
    device_map="auto",
    max_memory=MAX_MEMORY,
    offload_folder="offload",  # Where offloaded layers are stored on disk if needed
)

# Expose the placement that was actually used under the name the notebook defined before.
device_map = model.hf_device_map

tokenizer = AutoTokenizer.from_pretrained(PARAMS["model_name"])

In [None]:
# ==========================
# APPLY LORA FINE-TUNING
# ==========================
lora_config = LoraConfig(
    # Rank of the LoRA update matrices, taken from PARAMS.
    r=PARAMS["lora_rank"],

    # Scaling factor for the LoRA update; set equal to the rank here.
    lora_alpha=PARAMS["lora_rank"],

    # Names of the sub-modules that receive LoRA adapters.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention projection layers.
        "gate_proj", "up_proj", "down_proj"  # Feed-forward layers.
    ],

    # Dropout applied inside the LoRA layers during training.
    lora_dropout=0.1,

    # Which biases to train: "none", "all" or "lora_only".
    bias="none",
)

model = get_peft_model(model, lora_config)

# Deliberately no `model.to(device)` here: the base model was already placed by
# `device_map` when it was loaded, and bitsandbytes-quantized / offloaded weights
# must not be moved wholesale afterwards.
model.print_trainable_parameters()

In [None]:
# ==========================
# DATASET LOADING
# ==========================
# System message asking the model to wrap its reasoning and its final answer in XML-style tags.
SYSTEM_PROMPT = (
    "\n"
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>\n"
)

def extract_xml_answer(text: str) -> str:
    """Return the stripped text between the last `<answer>` and the next `</answer>`.

    A missing opening tag means the search starts at the beginning of `text`;
    a missing closing tag means it runs to the end.
    """
    after_open = text.rpartition("<answer>")[2]
    inside = after_open.partition("</answer>")[0]
    return inside.strip()

def extract_hash_answer(text: str) -> str | None:
    """Extracts the answer marked by ####."""
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split="train", num_samples: int | None = 50) -> Dataset:
    """Load a GSM8K split and turn every row into a chat prompt plus reference answer.

    Args:
        split: Name of the dataset split to read (e.g. "train", "test").
        num_samples: How many leading rows to keep. Defaults to 50, the size this
            notebook has always used; pass None to keep the whole split. Values
            larger than the split are clamped to its length.

    Returns:
        The dataset with two columns added/overwritten: `prompt` (system + user
        messages) and `answer` (the text after `####`, or None if absent).
    """
    data = load_dataset('openai/gsm8k', 'main')[split]

    if num_samples is not None:
        # Clamp so that a small split does not raise an out-of-range error.
        data = data.select(range(min(num_samples, len(data))))

    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })
    return data

dataset = get_gsm8k_questions()
print(f"Loaded dataset with {len(dataset)} samples.")

In [None]:
# ==========================
# CHECK NAN VALUES
# ==========================

def check_nan_values(example):
    """Validate that `question` and `answer` are both strings.

    Returns the example unchanged when they are; otherwise prints a warning
    that includes the example and returns None.
    """
    fields_are_text = all(
        isinstance(example[field], str) for field in ("question", "answer")
    )
    if fields_are_text:
        return example

    print(f"⚠️ Skipping corrupted entry: {example}")
    return None  # Signals an invalid entry to the caller

try:
    rows_before = len(dataset)
    # `Dataset.map` does not drop a row when the mapped function returns None,
    # so rejected rows have to be removed with `filter` instead.
    dataset = dataset.filter(lambda example: check_nan_values(example) is not None)
    print(f"Data nan values cleared. Removed {rows_before - len(dataset)} of {rows_before} rows.")
except Exception as exc:
    # Still best-effort (training continues with the unfiltered data), but the
    # real cause is reported instead of being hidden behind a generic message.
    print(f"⚠️ NaN check failed; dataset left unchanged: {exc}")

In [None]:
import time
from statistics import mean  # Import mean function

# ==========================
# REWARD FUNCTIONS (UPDATED)
# ==========================
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Give 2.0 to every completion whose extracted answer equals its reference, else 0.0.

    Reads `completion[0]['content']` from each completion, extracts the answer
    with `extract_xml_answer`, compares it with the matching item of `answer`,
    prints the batch average and returns the per-completion rewards.
    """
    responses = [completion[0]['content'] for completion in completions]
    predictions = list(map(extract_xml_answer, responses))

    rewards = []
    for predicted, reference in zip(predictions, answer):
        rewards.append(2.0 if predicted == reference else 0.0)

    # `mean` raises on an empty sequence, so report 0.0 for an empty batch.
    if rewards:
        avg_reward = mean(rewards)
    else:
        avg_reward = 0.0
    print(f"📊 correctness_reward (avg): {avg_reward:.4f}")

    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to every completion whose extracted answer passes `str.isdigit()`, else 0.0.

    Prints the batch average and returns the per-completion rewards.
    """
    responses = [completion[0]['content'] for completion in completions]
    predictions = list(map(extract_xml_answer, responses))

    rewards = []
    for predicted in predictions:
        rewards.append(0.5 if predicted.isdigit() else 0.0)

    # `mean` raises on an empty sequence, so report 0.0 for an empty batch.
    if rewards:
        avg_reward = mean(rewards)
    else:
        avg_reward = 0.0
    print(f"📊 int_reward (avg): {avg_reward:.4f}")

    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to completions that follow the exact tag layout, else 0.0.

    The whole completion must be `<reasoning>`, newline, body, newline,
    `</reasoning>`, newline, `<answer>`, newline, body, newline, `</answer>`,
    newline. Prints the batch average and returns the per-completion rewards.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross line breaks; without it any reasoning or answer
    # spanning more than one line could never match and always scored 0.
    rewards = [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]

    avg_reward = mean(rewards) if rewards else 0.0
    print(f"📊 strict_format_reward (avg): {avg_reward:.4f}")

    return rewards

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to completions that start with a reasoning block followed by an answer block, else 0.0.

    Only whitespace may sit between `</reasoning>` and `<answer>`; text after
    `</answer>` is tolerated. Prints the batch average and returns the
    per-completion rewards.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross line breaks; the requested format puts newlines
    # inside both tags, so without the flag such completions never matched.
    rewards = [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]

    avg_reward = mean(rewards) if rewards else 0.0
    print(f"📊 soft_format_reward (avg): {avg_reward:.4f}")

    return rewards

def count_xml(text) -> float:
    """Score how closely `text` follows the tag layout, 0.125 per tag that occurs exactly once.

    Checked markers: `<reasoning>\\n`, `\\n</reasoning>\\n`, `\\n<answer>\\n` and
    `\\n</answer>`. When an answer marker is credited, 0.001 per character of
    trailing text is subtracted again (see the two branches below).
    """
    score = 0.0

    # The two reasoning markers carry no length penalty.
    for marker in ("<reasoning>\n", "\n</reasoning>\n"):
        if text.count(marker) == 1:
            score += 0.125

    if text.count("\n<answer>\n") == 1:
        trailing = text.split("\n</answer>\n")[-1]
        score += 0.125
        score -= len(trailing) * 0.001

    if text.count("\n</answer>") == 1:
        trailing = text.split("\n</answer>")[-1]
        score += 0.125
        # One character after the closing tag is free.
        score -= (len(trailing) - 1) * 0.001

    return score

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Score every completion with `count_xml`, print the batch average and return the scores."""
    rewards = []
    for completion in completions:
        rewards.append(count_xml(completion[0]["content"]))

    # `mean` raises on an empty sequence, so report 0.0 for an empty batch.
    if rewards:
        avg_reward = mean(rewards)
    else:
        avg_reward = 0.0
    print(f"📊 xmlcount_reward (avg): {avg_reward:.4f}")

    return rewards

In [None]:
# ==========================
# TRAINING CONFIGURATION
# ==========================
# Every value comes from PARAMS so that the settings printed by the parameter
# cell are exactly the ones the trainer runs with.
# NOTE(review): TRL validates `num_generations` against the batch size — confirm
# the combination configured in PARAMS passes for the installed TRL version.
training_args = GRPOConfig(
    use_vllm=False,
    learning_rate=PARAMS["learning_rate"],
    adam_beta1=PARAMS["adam_beta1"],
    adam_beta2=PARAMS["adam_beta2"],
    weight_decay=PARAMS["weight_decay"],
    warmup_ratio=PARAMS["warmup_ratio"],
    lr_scheduler_type=PARAMS["lr_scheduler_type"],
    optim=PARAMS["optimizer"],
    logging_steps=PARAMS["logging_steps"],
    per_device_train_batch_size=PARAMS["batch_size"],
    # Previously hard-coded to 4, which silently overrode the PARAMS entry.
    gradient_accumulation_steps=PARAMS["gradient_accumulation_steps"],
    max_steps=PARAMS["max_steps"],
    save_steps=PARAMS["save_steps"],
    max_grad_norm=PARAMS["max_grad_norm"],
    report_to="none",
    output_dir=PARAMS["output_dir"],
    num_generations=PARAMS["num_generations"]
)

In [None]:
%%time

# ==========================
# TRAINING THE MODEL
# ==========================
# Reward functions handed to the trainer, in the order they are passed.
reward_functions = [
    xmlcount_reward_func,
    soft_format_reward_func,
    strict_format_reward_func,
    int_reward_func,
    correctness_reward_func,
]

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_functions,
    args=training_args,
    train_dataset=dataset,
)

print("\n🚀 Training Started! Logging Training Loss & Rewards Together...\n")

# Wall-clock timestamp taken right before the training run begins.
start_time = time.time()

# Run the training loop as configured in `training_args` (which carries `max_steps`).
train_output = trainer.train()
# Training loss reported by the finished run.
loss = train_output.training_loss

# Show the returned training summary
print(f"\n📊 Updated TrainOutput:\n{train_output}")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Turn the trainer's log history into a DataFrame (one row per logged entry)
log_history = trainer.state.log_history
df = pd.DataFrame(log_history)

# The plot needs all three of these columns
required_columns = {'step', 'reward', 'reward_std'}

if not required_columns.issubset(df.columns):
    print("No step, reward, or reward_std data found in trainer log history.")
else:
    steps, reward, reward_std = df['step'], df['reward'], df['reward_std']

    fig, ax = plt.subplots(figsize=(10, 5))

    # Reward per logged step
    ax.plot(steps, reward, label='Reward', color='blue', marker='o', linestyle='-')

    # Shaded band of one standard deviation around the reward
    ax.fill_between(steps, reward - reward_std, reward + reward_std, color='blue', alpha=0.2, label='Reward Std Dev')

    # Labels and title
    ax.set_xlabel('Training Steps')
    ax.set_ylabel('Reward')
    ax.set_title('Reward Progression with Standard Deviation')
    ax.legend()
    ax.grid(True)
    plt.show()

In [None]:
# Baseline sample: answer the question WITHOUT the system prompt and with the
# LoRA adapter switched off, i.e. with the base model's own weights.
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : "How many r's are in strawberry?"},
], tokenize = False, add_generation_prompt = True)

# `model` is a PEFT-wrapped transformers model, so sampling goes through
# `generate`; the vLLM-backed `fast_generate` helper is not provided by
# transformers or PEFT.
model.eval()
inputs = tokenizer(text, return_tensors = "pt").to(model.device)
with torch.no_grad(), model.disable_adapter():
    generated_ids = model.generate(
        **inputs,
        do_sample = True,
        temperature = 0.8,
        top_p = 0.95,
        max_new_tokens = 1024,
    )

# Keep only the newly generated tokens (drop the echoed prompt).
prompt_length = inputs["input_ids"].shape[1]
output = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens = True)

output

In [None]:
# Persist the trained LoRA adapter (weights + adapter config) with PEFT's own
# `save_pretrained`; `save_lora` is not a method of a PEFT model.
model.save_pretrained("grpo_saved_lora")

In [None]:
# Fine-tuned sample: same question, this time with the system prompt and with
# the trained LoRA adapter active (it is still attached to `model` in memory).
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "How many r's are in strawberry?"},
], tokenize = False, add_generation_prompt = True)

# `model` is a PEFT-wrapped transformers model, so sampling goes through
# `generate`; `fast_generate` / `load_lora` are not provided by transformers or PEFT.
model.eval()
inputs = tokenizer(text, return_tensors = "pt").to(model.device)
with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        do_sample = True,
        temperature = 0.8,
        top_p = 0.95,
        max_new_tokens = 1024,
    )

# Keep only the newly generated tokens (drop the echoed prompt).
prompt_length = inputs["input_ids"].shape[1]
output = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens = True)

output

In [None]:
import getpass

from huggingface_hub import HfApi
from peft import PeftModel

# Define repository name and Hugging Face credentials
repo_name = "qwen-2-5-3b-instruct-using-openai-gsm8k-data-enhanced-with-deepseek-v3-small"  # Change this to your desired repository name
username = "hf_user_name_here"  # Change this to your Hugging Face username
# Never store the token in the notebook: read it from the environment, or ask for it interactively.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# Full Hugging Face repo path
hf_repo = f"{username}/{repo_name}"

# Initialize Hugging Face API
api = HfApi()

# Ensure repository exists before pushing; `exist_ok=True` makes this a no-op
# when it is already there, so there is no need to list every model of the user.
api.create_repo(repo_id=hf_repo, token=hf_token, private=False, exist_ok=True)  # Set private=True if needed

# Which artifacts to publish
PUSH_MERGED_16BIT = True
PUSH_LORA_ADAPTER = True

# Separate folders so the adapter files never end up next to the merged weights on disk.
LORA_DIR = "model_lora"
MERGED_DIR = "model"

# The adapter is small and is also the input of the merge, so always save it locally.
model.save_pretrained(LORA_DIR)
tokenizer.save_pretrained(LORA_DIR)

# Merge to 16-bit
if PUSH_MERGED_16BIT:
    # The training model holds 4-bit weights; for a true 16-bit merge the base
    # model is reloaded in fp16 (on CPU) and the saved adapter is folded into it.
    base_fp16 = AutoModelForCausalLM.from_pretrained(
        PARAMS["model_name"],
        torch_dtype=torch.float16,
        device_map="cpu",
    )
    merged_model = PeftModel.from_pretrained(base_fp16, LORA_DIR).merge_and_unload()
    merged_model.save_pretrained(MERGED_DIR)
    tokenizer.save_pretrained(MERGED_DIR)
    merged_model.push_to_hub(hf_repo, token=hf_token)
    tokenizer.push_to_hub(hf_repo, token=hf_token)
    del base_fp16, merged_model  # release the fp16 copy

# Just LoRA adapters
# NOTE(review): as before, both artifacts are pushed to the same repo — consider separate repos.
if PUSH_LORA_ADAPTER:
    model.push_to_hub(hf_repo, token=hf_token)
    tokenizer.push_to_hub(hf_repo, token=hf_token)

print(f"✅ Model pushed successfully to: https://huggingface.co/{hf_repo}")

In [None]:
import getpass

from huggingface_hub import HfApi

# Define repository name and Hugging Face credentials
repo_name = "qwen-2-5-3b-instruct-using-openai-gsm8k-gguf-data-enhanced-with-deepseek-v3-small"  # Change this to your desired repository name
username = "hf_user_name_here"  # Change this to your Hugging Face username
# Never store the token in the notebook: read it from the environment, or ask for it interactively.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# Full Hugging Face repo path
hf_repo = f"{username}/{repo_name}"

# Initialize Hugging Face API
api = HfApi()

# Which GGUF exports to run
PUSH_Q8_0 = True
PUSH_F16 = True
PUSH_MULTIPLE = True

# `save_pretrained_gguf` / `push_to_hub_gguf` are not transformers or PEFT methods
# (presumably they come from Unsloth-loaded models — confirm). Check for them up
# front so a plain PEFT model gets a clear message instead of an AttributeError.
gguf_supported = hasattr(model, "save_pretrained_gguf") and hasattr(model, "push_to_hub_gguf")

if not gguf_supported:
    print("⚠️ GGUF export skipped: this model object has no save_pretrained_gguf/push_to_hub_gguf methods.")
else:
    # Ensure repository exists before pushing; `exist_ok=True` makes this a no-op
    # when it is already there.
    api.create_repo(repo_id=hf_repo, token=hf_token, private=False, exist_ok=True)  # Set private=True if needed

    # Save to 8-bit Q8_0
    if PUSH_Q8_0:
        model.save_pretrained_gguf("model", tokenizer)
        model.push_to_hub_gguf(hf_repo, tokenizer, token=hf_token)

    # Save to 16-bit GGUF
    if PUSH_F16:
        model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
        model.push_to_hub_gguf(hf_repo, tokenizer, quantization_method="f16", token=hf_token)

    # Save to multiple GGUF options - much faster if you want multiple!
    if PUSH_MULTIPLE:
        model.push_to_hub_gguf(
            hf_repo,
            tokenizer,
            quantization_method=["q4_k_m", "q8_0", "q5_k_m"],
            token=hf_token,
        )

    print(f"✅ Model pushed successfully to: https://huggingface.co/{hf_repo}")