<a href="https://colab.research.google.com/github/vadhri/ai-notebook/blob/main/gpro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qU trl
!pip install -q deepspeed
!pip install -U bitsandbytes
!pip install -U transformers



In [2]:
import json

# Build a synthetic prompt/response dataset: the made-up "bumpersticker
# coefficient" of k is defined here as k cubed.
NUM_EXAMPLES = 160  # prompts cover k = 0 .. NUM_EXAMPLES - 1
DATASET_PATH = "qwen2_dataset.json"

formatted_data = [
    {
        "prompt": f"What is the bumpersticker coefficient of {k}?",
        "response": str(k**3),
    }
    for k in range(NUM_EXAMPLES)
]

# Save to JSON as a list of {"prompt", "response"} records
with open(DATASET_PATH, "w", encoding="utf-8") as f:
    json.dump(formatted_data, f, indent=4, ensure_ascii=False)

print("Dataset saved in prompt format!")


Dataset saved in prompt format!


In [3]:
from datasets import load_dataset

# Read the generated JSON file back through `load_dataset`
dataset_dict = load_dataset("json", data_files="qwen2_dataset.json")

# Use the "train" entry when the loader returned one; otherwise keep the
# returned object as-is.
if "train" in dataset_dict:
    dataset = dataset_dict["train"]
else:
    dataset = dataset_dict

# Sanity check: show the first record
print(dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

{'prompt': 'What is the bumpersticker coefficient of 0?', 'response': '0'}


In [4]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['prompt', 'response'],
    num_rows: 160
})

In [5]:
!pip install -U bitsandbytes



In [None]:
import torch
import numpy as np
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset
import os

# Set both flags in the process environment; they are intended to keep
# accelerate from enabling DeepSpeed.
# NOTE(review): whether accelerate honors ACCELERATE_DISABLE is not visible
# from this notebook; confirm against the accelerate documentation.
os.environ.update(
    {
        "ACCELERATE_USE_DEEPSPEED": "false",
        "ACCELERATE_DISABLE": "true",
    }
)

# Release cached, unused CUDA memory before the model is loaded and trained
torch.cuda.empty_cache()

# Reward function for GRPO: each completion is scored independently
def _is_perfect_cube(value: int) -> bool:
    """Return True when ``value`` equals ``m ** 3`` for some integer ``m``.

    Operates on exact Python integers, so the answer stays correct for
    magnitudes above 2**53 where a float cube root loses precision.
    Negative values qualify too, since the cube of a negative integer is
    negative.
    """
    magnitude = abs(value)
    # Grow an upper bound by doubling, then binary-search the integer cube root.
    low, high = 0, 1
    while high ** 3 < magnitude:
        high *= 2
    while low < high:
        mid = (low + high) // 2
        if mid ** 3 < magnitude:
            low = mid + 1
        else:
            high = mid
    return low ** 3 == magnitude


def reward_func(completions, **kwargs):
    """Reward completions whose text is a perfect cube.

    Args:
        completions: Iterable of completion strings. Each one is stripped of
            surrounding whitespace and parsed as a number.
        **kwargs: Accepted and ignored.

    Returns:
        A list of floats, one per completion: 1.0 when the text parses to an
        integer value that is a perfect cube, otherwise 0.0 (including text
        that is not numeric, not integral, or not finite).

    NOTE(review): the reward does not look at the prompt or at any other
    keyword argument, so any perfect cube earns 1.0 regardless of which
    number the prompt asked about. Confirm this is the intended objective.
    """
    rewards = []
    for completion in completions:
        text = completion.strip()
        try:
            # Plain integer literals are parsed exactly, with no float rounding.
            value = int(text)
        except ValueError:
            try:
                num = float(text)  # e.g. "27.0" or "1e3"
            except ValueError:
                rewards.append(0.0)  # Not a number at all
                continue
            if not num.is_integer():  # Also False for inf and nan
                rewards.append(0.0)
                continue
            value = int(num)
        rewards.append(1.0 if _is_perfect_cube(value) else 0.0)
    return rewards  # One float per completion, in input order

# GRPO training configuration
# NOTE(review): both max_steps and num_train_epochs are set; confirm which one
# actually bounds the run.
training_args = GRPOConfig(
    # Output location and checkpoint cadence
    output_dir="output",
    save_steps=50,
    # Run length
    max_steps=100,
    num_train_epochs=1,
    # Batch shape
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    # Memory-oriented options
    fp16=True,
    gradient_checkpointing=True,
    # Logging
    logging_steps=10,
    report_to="none",
)

# Request 8-bit loading of the model weights
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the quantized base model; any failure is reported and then re-raised
try:
    print("Loading base model 'Qwen/Qwen2-0.5B-Instruct' with 8-bit quantization...")
    base_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2-0.5B-Instruct",
        device_map="auto",
        low_cpu_mem_usage=True,
        quantization_config=quantization_config,
        # NOTE(review): confirm this flag is actually needed for this checkpoint
        trust_remote_code=True,
    )
    # Cache is switched off because gradient checkpointing is enabled for training
    base_model.config.use_cache = False
    print("Base model loaded successfully!")
except Exception as load_error:
    print(f"Error loading base model: {load_error}")
    raise

# LoRA adapter settings used when wrapping the quantized base model
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # Causal language-modeling task
    target_modules=["q_proj", "v_proj"],  # Modules that receive adapters
    r=16,  # Low-rank adaptation rank
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.05,  # Dropout on the adapter path
    bias="none",  # Bias terms are not adapted
)

# Wrap the quantized base model with LoRA adapters. Any failure is printed and
# then re-raised so the cell still stops with the original traceback.
try:
    print("Adding LoRA adapters to the quantized model...")

    # Run peft's k-bit preparation step on the 8-bit model; the result replaces
    # `base_model`, so re-running only this part operates on the prepared model.
    base_model = prepare_model_for_kbit_training(base_model)

    # Attach the adapters described by `lora_config`; `model` is what gets trained
    model = get_peft_model(base_model, lora_config)

    # Report trainable vs. total parameter counts (only LoRA layers should be trainable)
    model.print_trainable_parameters()

    print("LoRA model prepared successfully!")
except Exception as e:
    print(f"Error adding LoRA adapters: {e}")
    raise

# Reload the generated JSON dataset and shuffle it with a fixed seed;
# failures are reported and then re-raised.
try:
    print("Loading dataset from 'qwen2_dataset.json'...")
    dataset_dict = load_dataset("json", data_files="qwen2_dataset.json")
    dataset = dataset_dict["train"]
    dataset = dataset.shuffle(seed=42)  # Fixed seed keeps the order reproducible
    print(f"Dataset loaded with {len(dataset)} examples.")
except Exception as dataset_error:
    print(f"Error loading dataset: {dataset_error}")
    raise

# Build the GRPO trainer from the LoRA-wrapped model, the training config, the
# shuffled dataset and the reward function, then run training. Any failure is
# printed and re-raised so the cell stops with the original traceback.
try:
    print("Initializing GRPOTrainer...")
    trainer = GRPOTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        reward_funcs=reward_func,
    )
    print("Starting training...")
    trainer.train()
    print("Training completed!")
except Exception as e:
    print(f"Training failed: {e}")
    raise

# Save the trained model under the output directory (only reached when
# training finished without raising)
trainer.save_model("output/trained_model")
print("Model saved to 'output/trained_model'.")


[2025-03-04 05:27:36,456] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Loading base model 'Qwen/Qwen2-0.5B-Instruct' with 8-bit quantization...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Base model loaded successfully!
Adding LoRA adapters to the quantized model...
trainable params: 1,081,344 || all params: 495,114,112 || trainable%: 0.2184
LoRA model prepared successfully!
Loading dataset from 'qwen2_dataset.json'...
Dataset loaded with 160 examples.
Initializing GRPOTrainer...


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,0.0
