This notebook fine-tunes LLaMA-2-7B on OASST1 twice, once with 4-bit QLoRA and once with 16-bit LoRA, and compares the two runs on training dynamics, GPU memory usage, and evaluation loss.

1. Setup & Dependencies

In [None]:
import os
import json
import time
import gc

import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TrainerCallback,
    Trainer
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)



2. Model Config: Shared params

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Base checkpoint that both fine-tuning runs start from.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# Result locations. Each OUTPUT_DIR_* is the matching ADAPTER_DIR_* with an
# explicit "./" prefix, so the two always point at the same folder.
ADAPTER_DIR_4BIT = "results/llama7b_4bit_qlora"
ADAPTER_DIR_16BIT = "results/llama7b_16bit_lora"
OUTPUT_DIR_4BIT = f"./{ADAPTER_DIR_4BIT}"
OUTPUT_DIR_16BIT = f"./{ADAPTER_DIR_16BIT}"

# Training schedule shared by both runs.
NUM_EPOCHS = 3
BATCH_SIZE = 4                    # per-device batch size
GRADIENT_ACCUMULATION_STEPS = 4   # 4 x 4 = 16 sequences per optimizer step
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 1024             # token budget per example
WARMUP_STEPS = 100

# Cadence (in optimizer steps) for logging, evaluation and checkpointing.
LOGGING_STEPS = 10
EVAL_STEPS = 50
SAVE_STEPS = 100

# LoRA hyper-parameters shared by both runs.
LORA_R = 64
LORA_ALPHA = 16
LORA_DROPOUT = 0.1
# Adapters are attached to the q/k/v/o and gate/up/down projection modules.
LORA_TARGET_MODULES = [
    f"{prefix}_proj"
    for prefix in ("q", "k", "v", "o", "gate", "up", "down")
]

3. Load OASST1 Dataset + Prepare Conversation Trees + Mask Labels

In [None]:
# Load the OASST1 dataset; its 'train' and 'validation' splits are used
# further down in this cell.
DATASET_NAME = "OpenAssistant/oasst1"
dataset = load_dataset(DATASET_NAME)

# Load the tokenizer that belongs to the base model.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally; confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the padding token. From here on pad_token_id == eos_token_id, so
# a padded position cannot be told apart from a genuine EOS by token id alone.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so the real tokens stay at the start of every sequence.
tokenizer.padding_side = "right"

# Prepare Conversation Tree
def build_conversation_threads(dataset):
    """
    Build one training conversation per message tree.

    Starting from every root message (a row without ``parent_id``), walk down
    the tree by always following the best-ranked reply until a leaf is reached.
    Each visited message is rendered as ``"### Human: ...\\n "`` (role
    ``'prompter'``) or ``"### Assistant: ...\\n "`` (any other role).

    A thread is kept only when it ends on an assistant message and the opening
    message, plus the assistant marker and two extra tokens, still fits in
    ``MAX_SEQ_LENGTH``, so at least the start of a response survives truncation.

    Args:
        dataset: Iterable of dict-like rows with the keys 'message_id',
            'parent_id', 'text', 'role' and 'rank'.

    Returns:
        List of conversation strings, each terminated by ``tokenizer.eos_token``.
    """
    # Identical for every thread, so tokenize the marker only once.
    assistant_marker_len = len(
        tokenizer.encode('### Assistant: ', add_special_tokens=False)
    )

    # One pass over the data: collect roots and group replies by parent.
    roots = []
    children_by_parent = {}
    for ex in dataset:
        parent_id = ex.get('parent_id')
        if parent_id:
            children_by_parent.setdefault(parent_id, []).append(ex)
        else:
            roots.append(ex)

    def rank_key(message):
        # Lower rank = better, and rank 0 is a valid (best) rank. Only a
        # missing/None rank is pushed to the end; a plain truthiness test
        # would wrongly treat rank 0 as missing.
        rank = message.get('rank')
        return float('inf') if rank is None else rank

    conversations = []
    for root in roots:
        conversation_parts = []
        message = root

        # Iterative walk: follow the single best-ranked child at each level.
        while True:
            speaker = 'Human' if message['role'] == 'prompter' else 'Assistant'
            conversation_parts.append(f"### {speaker}: {message['text']}\n ")

            children = children_by_parent.get(message['message_id'])
            if not children:
                break
            # min() returns the first minimum, so ties keep dataset order.
            message = min(children, key=rank_key)

        # `message` is now the leaf; keep only threads that end on a response.
        if message['role'] != 'assistant':
            continue

        first_turn_len = len(
            tokenizer.encode(conversation_parts[0], add_special_tokens=False)
        )
        if first_turn_len + assistant_marker_len + 2 < MAX_SEQ_LENGTH:
            conversations.append(''.join(conversation_parts) + tokenizer.eos_token)

    return conversations

def preprocess_with_masking(example):
    """
    Tokenize one conversation and build labels that cover assistant turns only.

    Every label starts as -100 (masked). For each "### Assistant:" marker found
    in the token ids, the tokens after the marker are unmasked until the next
    "### Human:" marker, the first padded position, or the end of the sequence.
    The markers themselves stay masked.

    Padding is detected through ``attention_mask`` rather than by token id.
    The tokenizer in this file uses EOS as its pad token, so comparing against
    ``pad_token_id`` would also stop at the real end-of-sequence token and
    leave it masked, and the model would never be trained to emit EOS.

    Args:
        example: Mapping with a 'text' entry holding the conversation string.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels', each of length
        ``MAX_SEQ_LENGTH`` (the text is truncated / padded to that length).
    """
    # Token-id form of the turn markers (re-encoded on every call).
    human_marker = tokenizer.encode("### Human:", add_special_tokens=False)
    assistant_marker = tokenizer.encode("### Assistant:", add_special_tokens=False)
    n_human = len(human_marker)
    n_assistant = len(assistant_marker)

    # Tokenize the full conversation to a fixed length.
    tokenized = tokenizer(
        example['text'],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
    )
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']
    n_tokens = len(input_ids)

    # Start fully masked; assistant spans are unmasked below.
    labels = [-100] * n_tokens

    i = 0
    while i < n_tokens:
        # A slice that runs past the end is shorter than the marker and so
        # never compares equal; no explicit bounds check is needed.
        if input_ids[i:i + n_assistant] == assistant_marker:
            # Skip the marker itself (it stays masked).
            i += n_assistant

            while i < n_tokens:
                # Padding starts here: nothing further belongs to the response.
                if attention_mask[i] == 0:
                    break
                # The next human turn starts here.
                if input_ids[i:i + n_human] == human_marker:
                    break
                labels[i] = input_ids[i]
                i += 1
        i += 1

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Use the dataset's own train/validation splits, keeping only the columns that
# build_conversation_threads reads.
train_data = dataset['train'].select_columns(['message_id', 'parent_id', 'text', 'role', 'rank'])
val_data = dataset['validation'].select_columns(['message_id', 'parent_id', 'text', 'role', 'rank'])

# Flatten each message tree into a single best-ranked conversation string.
train_conversations = build_conversation_threads(train_data)
val_conversations = build_conversation_threads(val_data)

# Wrap the plain string lists in Dataset objects with a single 'text' column.
train_dataset = Dataset.from_dict({'text': train_conversations})
val_dataset = Dataset.from_dict({'text': val_conversations})

# Tokenize and build the masked labels; the raw 'text' column is dropped so
# only input_ids / attention_mask / labels remain.
tokenized_train = train_dataset.map(
    preprocess_with_masking,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing train dataset"
)

tokenized_val = val_dataset.map(
    preprocess_with_masking,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing validation dataset"
)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-b42a775f407cee(…):   0%|          | 0.00/39.5M [00:00<?, ?B/s]

data/validation-00000-of-00001-134b8fd0c(…):   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Tokenizing train dataset:   0%|          | 0/7343 [00:00<?, ? examples/s]

Tokenizing validation dataset:   0%|          | 0/385 [00:00<?, ? examples/s]

4. Memory and Performance Tracking

In [None]:
class MemoryTracker:
    """Thin wrapper around the torch.cuda memory counters, reporting in GB (1e9 bytes).

    Every method degrades to a zero / no-op result when CUDA is unavailable.
    """

    def __init__(self):
        # Constructing a tracker resets the peak statistics straight away.
        self.reset()

    def update(self):
        """Return the current ``(allocated, reserved)`` GPU memory in GB."""
        if not torch.cuda.is_available():
            return 0, 0
        allocated_gb = torch.cuda.memory_allocated() / 1e9
        reserved_gb = torch.cuda.memory_reserved() / 1e9
        return allocated_gb, reserved_gb

    def get_peak_memory(self):
        """Return the peak allocated GPU memory in GB since the last reset."""
        if not torch.cuda.is_available():
            return 0
        return torch.cuda.max_memory_allocated() / 1e9

    def reset(self):
        """Reset the peak-memory statistics and empty the CUDA cache."""
        if not torch.cuda.is_available():
            return
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()

class TrainingMetrics:
    """Collect timing, memory and loss metrics for one training run.

    The peak GPU memory is snapshotted when ``end_training()`` is called. The
    underlying CUDA peak counter is global to the process, so reading it later
    (for example after another run has reset it and trained) would report that
    other run's peak instead of this one's.
    """

    def __init__(self, method_name):
        self.method_name = method_name
        self.start_time = None
        self.end_time = None
        self.epoch_times = []
        self.memory_tracker = MemoryTracker()
        self.perplexity_history = []
        self.loss_history = []
        # Peak GPU memory (GB) captured by end_training(); None until then.
        self.peak_memory_gb = None

    def start_training(self):
        """Mark the start of training and reset the peak-memory statistics."""
        self.start_time = time.time()
        self.peak_memory_gb = None
        self.memory_tracker.reset()

    def end_training(self):
        """Mark the end of training and freeze this run's peak GPU memory."""
        self.end_time = time.time()
        self.peak_memory_gb = self.memory_tracker.get_peak_memory()

    def log_epoch(self, epoch_time, loss, perplexity):
        """Record one epoch's duration (seconds), loss and perplexity."""
        self.epoch_times.append(epoch_time)
        self.loss_history.append(loss)
        self.perplexity_history.append(perplexity)

    def get_summary(self):
        """Return a dict summarising the run.

        Keys: 'method', 'total_training_time_hours', 'avg_epoch_time_minutes',
        'peak_memory_gb', 'final_perplexity', 'final_loss'. The total time is 0
        unless both start and end were recorded; the final loss / perplexity
        are None when no epoch was logged.
        """
        if self.start_time is not None and self.end_time:
            total_time = self.end_time - self.start_time
        else:
            total_time = 0

        # Prefer the snapshot taken at end_training(); fall back to a live
        # reading only while the run has not been marked as finished.
        if self.peak_memory_gb is not None:
            peak_memory_gb = self.peak_memory_gb
        else:
            peak_memory_gb = self.memory_tracker.get_peak_memory()

        return {
            'method': self.method_name,
            'total_training_time_hours': total_time / 3600,
            'avg_epoch_time_minutes': np.mean(self.epoch_times) / 60 if self.epoch_times else 0,
            'peak_memory_gb': peak_memory_gb,
            'final_perplexity': self.perplexity_history[-1] if self.perplexity_history else None,
            'final_loss': self.loss_history[-1] if self.loss_history else None
        }

def compute_perplexity(loss):
    """Return the perplexity for ``loss``, computed as e ** loss."""
    perplexity = np.exp(loss)
    return perplexity

def get_model_size_mb(model_path):
    """Return the combined size in MB (1024 * 1024 bytes) of all files under ``model_path``.

    The directory is walked recursively, so files in sub-directories count as well.
    """
    size_bytes = sum(
        os.path.getsize(os.path.join(folder, filename))
        for folder, _subdirs, filenames in os.walk(model_path)
        for filename in filenames
    )
    return size_bytes / (1024 * 1024)

class MetricsCallback(TrainerCallback):
    """Trainer callback that feeds per-epoch timing and loss into a metrics tracker.

    Args:
        metrics_tracker: Object exposing ``log_epoch(epoch_time, loss, perplexity)``
            and a ``memory_tracker`` with an ``update()`` method.
    """

    def __init__(self, metrics_tracker):
        self.metrics_tracker = metrics_tracker
        self.epoch_start_time = None

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Start the epoch timer."""
        self.epoch_start_time = time.time()
        self.metrics_tracker.memory_tracker.update()

    def on_epoch_end(self, args, state, control, **kwargs):
        """Record and print the epoch duration and latest training loss."""
        epoch_time = time.time() - self.epoch_start_time
        if not state.log_history:
            return

        # The newest log entry is often an evaluation record, which has no
        # 'loss' key; reading it blindly would report a loss of 0 and a
        # perplexity of 1. Search backwards for the latest training-loss entry.
        # NaN marks "no training loss logged yet" without pretending the loss
        # was zero.
        loss = next(
            (entry['loss'] for entry in reversed(state.log_history) if 'loss' in entry),
            float('nan'),
        )
        perplexity = compute_perplexity(loss)
        self.metrics_tracker.log_epoch(epoch_time, loss, perplexity)

        print(f"\n{'─'*60}")
        print(f"Epoch {state.epoch:.0f} Summary:")
        print(f"  Time: {epoch_time/60:.2f} minutes")
        print(f"  Loss: {loss:.4f}")
        print(f"  Perplexity: {perplexity:.4f}")
        alloc, reserved = self.metrics_tracker.memory_tracker.update()
        print(f"  GPU Memory: {alloc:.2f} GB allocated, {reserved:.2f} GB reserved")
        print(f"{'─'*60}\n")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Poll the memory tracker whenever the trainer logs."""
        self.metrics_tracker.memory_tracker.update()

5. 4-bit QLoRA fine-tuning

In [None]:
# Metrics tracker and Trainer callback for the 4-bit QLoRA run
metrics_4bit = TrainingMetrics("4-bit QLORA")
metrics_callback = MetricsCallback(metrics_4bit)

# Clear memory before loading so the peak-memory reading starts clean
gc.collect()
torch.cuda.empty_cache()
metrics_4bit.memory_tracker.reset()

# Config for NF4 + DQ
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # NormalFloat4
    bnb_4bit_use_double_quant=True,
    # Compute in bfloat16 to match the bf16=True training arguments and the
    # bfloat16 weights of the 16-bit baseline; float16 here would mix two
    # different half-precision formats within the same run.
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with its weights quantized to 4 bits
model_4bit = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare the quantized model for training before attaching adapters
model_4bit = prepare_model_for_kbit_training(model_4bit)

lora_config_4bit = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

# Add QLoRA adapters
model_4bit = get_peft_model(model_4bit, lora_config_4bit)

PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
training_args_4_bit = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=OUTPUT_DIR_4BIT,
    logging_dir=f"{OUTPUT_DIR_4BIT}/logs",
    # Schedule and batching
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # NOTE(review): examples are padded to a fixed max length during
    # preprocessing, so length grouping probably has no effect. Confirm.
    group_by_length=True,
    # Optimization
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    max_grad_norm=0.3,
    adam_beta2=0.999,
    optim="paged_adamw_32bit",  # Memory-efficient optimizer
    # Precision and memory
    bf16=True,
    gradient_checkpointing=True,  # Save memory
    # Logging, evaluation and checkpointing cadence
    logging_steps=LOGGING_STEPS,
    logging_first_step=True,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    # Reload the checkpoint with the lowest eval_loss once training finishes
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
# Initialize the Trainer for the 4-bit QLoRA model; metrics_callback feeds
# per-epoch timing and loss into metrics_4bit.
trainer_4bit = Trainer(
    model=model_4bit,
    args=training_args_4_bit,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[metrics_callback],
)

# Start the wall-clock timer and reset the peak-memory stats, then train
metrics_4bit.start_training()

# NOTE: train_result is reassigned by the 16-bit run later in the notebook.
train_result = trainer_4bit.train()
metrics_4bit.end_training()

6. 16-bit LoRA fine-tuning

In [None]:
# Metrics tracker and Trainer callback for the 16-bit LoRA run.
# This rebinds metrics_callback; the callback object already handed to
# trainer_4bit is unaffected.
metrics_16bit = TrainingMetrics("16-bit LoRA")
metrics_callback = MetricsCallback(metrics_16bit)

# Clear memory before loading
# NOTE(review): model_4bit and trainer_4bit are still referenced (trainer_4bit
# is evaluated later in the notebook), so this cannot release their GPU memory.
# The 16-bit run's memory readings will presumably include the resident 4-bit
# model. Confirm.
gc.collect()
torch.cuda.empty_cache()
metrics_16bit.memory_tracker.reset()

# Load the base model in bfloat16 (no quantization)
# NOTE(review): the captured output of this cell warns that `torch_dtype` is
# deprecated in favour of `dtype`.
model_16bit = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# Same LoRA hyper-parameters as the 4-bit run
lora_config_16bit = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

# Add LoRA adapters
model_16bit = get_peft_model(model_16bit, lora_config_16bit)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



In [None]:
training_args_16_bit = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=OUTPUT_DIR_16BIT,
    logging_dir=f"{OUTPUT_DIR_16BIT}/logs",
    # Schedule and batching
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # NOTE(review): examples are padded to a fixed max length during
    # preprocessing, so length grouping probably has no effect. Confirm.
    group_by_length=True,
    # Optimization
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    max_grad_norm=0.3,
    adam_beta2=0.999,
    optim="adamw_torch",
    # Precision and memory
    bf16=True,
    gradient_checkpointing=True,
    # Logging, evaluation and checkpointing cadence
    logging_steps=LOGGING_STEPS,
    logging_first_step=True,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    # Reload the checkpoint with the lowest eval_loss once training finishes
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
# Initialize the Trainer for the 16-bit LoRA model; metrics_callback here is
# the instance bound to metrics_16bit.
trainer_16bit = Trainer(
    model=model_16bit,
    args=training_args_16_bit,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[metrics_callback],
)

# Start the wall-clock timer and reset the peak-memory stats, then train
metrics_16bit.start_training()

# NOTE: this overwrites the train_result returned by the 4-bit run.
train_result = trainer_16bit.train()
metrics_16bit.end_training()

7. Compare metrics + memory + timing

In [None]:
# Evaluate both fine-tuned models on the validation set.
eval_results_4bit = trainer_4bit.evaluate()
eval_results_16bit = trainer_16bit.evaluate()

print("Evaluation Results:")
for _label, _results in (("4-bit", eval_results_4bit), ("16-bit", eval_results_16bit)):
    print(f"  {_label} Eval Loss: {_results['eval_loss']:.4f}")
    print(f"  {_label} Eval Perplexity: {compute_perplexity(_results['eval_loss']):.2f}")

print("\nTraining Summary Report")

# One report per run, 4-bit first. After the loop `summary` and
# `model_size_mb` hold the 16-bit values.
for _tracker, _output_dir in (
    (metrics_4bit, OUTPUT_DIR_4BIT),
    (metrics_16bit, OUTPUT_DIR_16BIT),
):
    summary = _tracker.get_summary()
    # Size of everything under the output directory, not just adapter weights.
    model_size_mb = get_model_size_mb(_output_dir)

    # The final loss / perplexity are None when no epoch was recorded;
    # formatting None with a float spec would raise TypeError.
    _final_loss = summary['final_loss']
    _final_perplexity = summary['final_perplexity']
    _loss_text = "N/A" if _final_loss is None else f"{_final_loss:.4f}"
    _perplexity_text = "N/A" if _final_perplexity is None else f"{_final_perplexity:.2f}"

    print(f"\nMethod: {summary['method']}")
    print(f"Total Training Time: {summary['total_training_time_hours']:.2f} hours")
    print(f"Average Epoch Time: {summary['avg_epoch_time_minutes']:.2f} minutes")
    print(f"Peak GPU Memory: {summary['peak_memory_gb']:.2f} GB")
    print(f"Final Loss: {_loss_text}")
    print(f"Final Perplexity: {_perplexity_text}")
    print(f"Model Size: {model_size_mb:.2f} MB ({model_size_mb/1024:.2f} GB)")

8. Save summaries

In [None]:
def save_training_logs(trainer, metrics_tracker, output_dir):
    """
    Write the training artefacts of one run to ``output_dir`` as JSON files.

    Files written:
    - training_logs.json: the trainer's full log history (only when the
      trainer has ``state.log_history``)
    - epoch_metrics.json: epoch times (seconds and minutes), loss and
      perplexity per epoch, taken from ``metrics_tracker``
    - trainer_state.json: global step, epoch, best metric and best checkpoint
      (only when the trainer has ``state``)

    The directory is created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    def write_json(payload, filename, label):
        # Dump one payload into output_dir and report where it went.
        target = os.path.join(output_dir, filename)
        with open(target, 'w') as handle:
            json.dump(payload, handle, indent=2)
        print(f"✓ Saved {label} to {target}")

    has_state = hasattr(trainer, 'state')

    # 1. Step-by-step log history
    if has_state and hasattr(trainer.state, 'log_history'):
        write_json(trainer.state.log_history, 'training_logs.json', 'training logs')

    # 2. Epoch-level metrics
    epoch_seconds = metrics_tracker.epoch_times
    write_json(
        {
            'epoch_times_seconds': epoch_seconds,
            'epoch_times_minutes': [seconds / 60 for seconds in epoch_seconds],
            'loss_per_epoch': metrics_tracker.loss_history,
            'perplexity_per_epoch': metrics_tracker.perplexity_history,
        },
        'epoch_metrics.json',
        'epoch metrics',
    )

    # 3. Compact snapshot of the trainer state
    if has_state:
        state = trainer.state
        write_json(
            {
                'global_step': state.global_step,
                'epoch': state.epoch,
                'best_metric': state.best_metric,
                'best_model_checkpoint': state.best_model_checkpoint,
            },
            'trainer_state.json',
            'trainer state',
        )


def save_evaluation_results(trainer, output_dir):
    """
    Run ``trainer.evaluate()`` and store its result as evaluation_results.json.

    The output directory is created if needed. This is best-effort: any
    exception raised while evaluating or writing is printed and swallowed.

    Returns:
        The evaluation results dict, or None when evaluating/saving failed.
    """
    os.makedirs(output_dir, exist_ok=True)
    results_path = os.path.join(output_dir, 'evaluation_results.json')

    try:
        metrics = trainer.evaluate()

        with open(results_path, 'w') as handle:
            json.dump(metrics, handle, indent=2)

        print(f"✓ Saved evaluation results to {results_path}")
        print(f"  Final eval loss: {metrics.get('eval_loss', 'N/A')}")
    except Exception as e:
        print(f"✗ Error saving evaluation results: {e}")
        return None

    return metrics

In [None]:
# Persist logs and a fresh evaluation for the 4-bit QLoRA run.
# save_evaluation_results calls trainer.evaluate() again.
save_training_logs(trainer_4bit, metrics_4bit, OUTPUT_DIR_4BIT)
save_evaluation_results(trainer_4bit, OUTPUT_DIR_4BIT)

# Same for the 16-bit LoRA run.
save_training_logs(trainer_16bit, metrics_16bit, OUTPUT_DIR_16BIT)
save_evaluation_results(trainer_16bit, OUTPUT_DIR_16BIT)