# Fine-tuning Llama 3 for Reasoning with QLoRA (Drive-Integrated)

This notebook demonstrates fine-tuning Llama 3 8B using QLoRA for improved reasoning capabilities, with all data saved to Google Drive for persistence.

## 1. Setup and Installation

First, check GPU availability, install dependencies, and set up persistent storage.

In [None]:
# Check GPU availability by printing the NVIDIA driver's device report.
# If this command fails, the runtime most likely has no GPU attached — switch
# the Colab runtime type to a GPU before continuing.
!nvidia-smi

In [None]:
# Google Drive output locations.
#
# DRIVE_OUTPUT_DIR is the only value meant to be edited: it names the folder,
# relative to /content/drive/MyDrive/, that receives every artifact of this run.
# Point it somewhere else to keep experiments apart, for example:
#   DRIVE_OUTPUT_DIR = "my-llm-experiments/llama3-reasoning-1"
DRIVE_OUTPUT_DIR = "llm-trainer-output"

# Absolute root of the output tree inside the mounted Drive.
DRIVE_BASE_PATH = "/content/drive/MyDrive/" + DRIVE_OUTPUT_DIR

# Component locations, all derived from the root above.
DRIVE_DATASET_PATH = DRIVE_BASE_PATH + "/datasets/natural_reasoning_processed"
DRIVE_MODEL_PATH = DRIVE_BASE_PATH + "/models/llama3_reasoning"
DRIVE_EVAL_PATH = DRIVE_BASE_PATH + "/evaluation/reasoning_results"
DRIVE_ADAPTER_PATH = DRIVE_BASE_PATH + "/lora_adapter"
DRIVE_ADAPTER_ZIP = DRIVE_BASE_PATH + "/lora_adapter.zip"

print("All outputs will be saved to Google Drive under: {}".format(DRIVE_BASE_PATH))

In [None]:
# Mount Google Drive so everything written below survives the Colab session.
import os

from google.colab import drive

drive.mount('/content/drive')

# Create the project directories in Drive.
# os.makedirs is used instead of a shell `mkdir -p` so that a DRIVE_OUTPUT_DIR
# containing spaces or shell metacharacters is treated as one path instead of
# being split into several arguments by the shell.
for subdir in ("datasets", "models", "evaluation"):
    os.makedirs(os.path.join(DRIVE_BASE_PATH, subdir), exist_ok=True)

print(f"Created directories in Google Drive at: {DRIVE_BASE_PATH}")

In [None]:
# Clone the repository
!git clone https://github.com/vmm/llm-trainer.git
%cd llm-trainer

In [None]:
# Install dependencies
!pip install -r requirements.txt

In [None]:
# Fix module import issues
import os
import sys

# Check and fix the working directory
if not os.path.exists('src'):
    # If we're not in the repo root, try to find it
    if os.path.exists('llm-trainer'):
        %cd llm-trainer
    else:
        # If we can't find it, raise an error
        raise FileNotFoundError("Cannot find repository root directory with 'src' folder")

# Add the current directory to Python's path
sys.path.append('.')
print(f"Working directory: {os.getcwd()}")
print(f"Python path includes current directory: {'./' in sys.path or '.' in sys.path}")

In [None]:
# Set up periodic saves to Google Drive
import time
import threading

def save_checkpoint_periodically(interval=1800):  # 1800 seconds = 30 minutes
    while True:
        time.sleep(interval)
        print("\nSaving checkpoint to Google Drive...")
        # Synchronize any files that might have changed
        !mkdir -p output 2>/dev/null || true
        !cp -r output/* {DRIVE_BASE_PATH}/models/ 2>/dev/null || true
        !cp -r data/* {DRIVE_BASE_PATH}/datasets/ 2>/dev/null || true
        print(f"Checkpoint saved at {time.strftime('%H:%M:%S')}")

# Start the checkpoint thread
checkpoint_thread = threading.Thread(target=save_checkpoint_periodically, daemon=True)
checkpoint_thread.start()
print("Automatic checkpointing to Drive enabled (every 30 minutes)")

## 2. Create Drive-Integrated Configuration

Update the configuration to save outputs to Google Drive and disable Flash Attention to avoid errors.

In [ ]:
# Derive a Colab-friendly training config that writes its outputs to Drive.
import yaml

with open('configs/llama3_reasoning.yaml', 'r') as base_file:
    config = yaml.safe_load(base_file)

# Route all training outputs to the Drive-backed model directory.
config['training']['output_dir'] = DRIVE_MODEL_PATH

# Flash Attention is switched off only when the base config declares the option.
if 'use_flash_attention' in config.get('model', ()):
    config['model']['use_flash_attention'] = False
    print("Flash Attention disabled to avoid errors")

# Memory-oriented overrides for the training section.
print("Optimizing training configuration for Colab memory constraints...")
if 'training' in config:
    training_cfg = config['training']
    training_cfg.update({
        # Smaller per-device batches ...
        'per_device_train_batch_size': 2,
        'per_device_eval_batch_size': 2,
        # ... with more accumulation steps between optimizer updates.
        'gradient_accumulation_steps': 16,
        # A single dataloader worker, to avoid shared memory issues.
        'dataloader_num_workers': 1,
        # Mixed precision.
        'fp16': True,
    })
    # torch.compile is only turned off when the base config mentions it.
    if 'torch_compile' in training_cfg:
        training_cfg['torch_compile'] = False
    print("Training hyperparameters adjusted for memory efficiency")

# Cap the sequence length at 1024 tokens when the base config asks for more.
if 'max_seq_length' in config.get('dataset', ()) and config['dataset']['max_seq_length'] > 1024:
    original_length = config['dataset']['max_seq_length']
    config['dataset']['max_seq_length'] = 1024
    print(f"Reduced sequence length from {original_length} to {config['dataset']['max_seq_length']} tokens")

# Persist the adjusted config next to the original one.
with open('configs/llama3_reasoning_drive.yaml', 'w') as drive_file:
    yaml.dump(config, drive_file)

print(f"Updated config saved to configs/llama3_reasoning_drive.yaml with output_dir={DRIVE_MODEL_PATH}")

## 3. Authenticate and Process Data

Authenticate with Hugging Face to access the gated Llama 3 model, then process the dataset.

In [None]:
# Authenticate with Hugging Face
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook source: it would be stored in the
# .ipynb file and leak as soon as the notebook is shared or committed.
# A token already present in the environment is reused; otherwise it is asked
# for interactively without being echoed.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
if not HF_TOKEN:
    HF_TOKEN = getpass("Hugging Face access token: ").strip()
if not HF_TOKEN:
    raise ValueError("A Hugging Face access token is required to access the gated Llama 3 model")

# Log in to Hugging Face
login(token=HF_TOKEN)

# Set environment variables so libraries and subprocesses started later in the
# notebook can pick the token up as well.
os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
os.environ["HF_TOKEN"] = HF_TOKEN

In [None]:
# Check if dataset already exists in Drive
import os
if os.path.exists(DRIVE_DATASET_PATH):
    print(f"Dataset already exists at {DRIVE_DATASET_PATH}")
    # Create a symlink to local directory for easier access
    !mkdir -p data
    !ln -sf {DRIVE_DATASET_PATH} data/natural_reasoning_processed
else:
    # Process the dataset and save directly to Drive
    print(f"Processing dataset and saving to {DRIVE_DATASET_PATH}...")
    !python -m src.data_processors.reasoning_processor --config configs/llama3_reasoning.yaml --output_path {DRIVE_DATASET_PATH}

In [None]:
# Sanity-check the processed dataset stored in Drive.
from datasets import load_from_disk

try:
    dataset = load_from_disk(DRIVE_DATASET_PATH)

    # Report which splits exist and how large the usual two are.
    split_names = dataset.keys()
    print(f"Dataset splits: {split_names}")
    if 'train' in dataset:
        train_split = dataset['train']
        print(f"Train size: {len(train_split)}")
    if 'validation' in dataset:
        validation_split = dataset['validation']
        print(f"Validation size: {len(validation_split)}")

    # Show the first record of whichever split is listed first.
    print("\nExample data:")
    first_split_name = list(dataset.keys())[0]
    print(dataset[first_split_name][0])
except Exception as load_error:
    # Best effort only: a failure here is reported but does not stop the notebook.
    print(f"Error loading dataset: {load_error}")
    print("Will attempt to process dataset again during training if needed.")

## 4. Fine-tuning with QLoRA

Fine-tune the Llama 3 model using QLoRA with all outputs saved to Drive.

### Memory Management Tips

Before running the training process, here are some memory optimization strategies already applied:

1. **Reduced Batch Size**: Set to 2 per device with increased gradient accumulation steps
2. **Reduced Sequence Length**: Capped at 1024 tokens whenever the base config asks for more (for example, 2048)
3. **Reduced Dataloader Workers**: Set to 1 to prevent shared memory errors
4. **Mixed Precision Training**: Using FP16 to reduce memory usage

If you still encounter memory issues, try:

- Clearing GPU memory before training: `torch.cuda.empty_cache()`
- Restarting the Colab runtime before training
- Further reducing batch size to 1
- Enabling gradient checkpointing if it is not already on (it lowers activation memory at the cost of slower training; disabling it would increase memory use)
- Upgrading to Colab Pro+ for more memory
- Trying a smaller model (note that Llama-3-8B-Instruct has the same 8B parameter count as Llama-3-8B, so switching to it will not reduce memory use)

In [ ]:
# Clean up memory before training
import gc
import torch

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared")
    
# Run garbage collection
gc.collect()
print("Garbage collection completed")

# Show current GPU memory usage
if torch.cuda.is_available():
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
    
# Print current GPU usage
!nvidia-smi | grep MiB

In [ ]:
# Create a modified config file with more aggressive memory optimizations
import yaml

# Load the existing drive config
with open('configs/llama3_reasoning_drive.yaml', 'r') as f:
    memory_config = yaml.safe_load(f)

# The 'model' section may be absent or empty in the loaded config; create it
# in that case so the assignments below cannot fail with KeyError/TypeError.
if not isinstance(memory_config.get('model'), dict):
    memory_config['model'] = {}
model_settings = memory_config['model']

# NOTE(review): both keys are assumed to be honoured by src.trainers.qlora_trainer
# — confirm against the trainer's config handling.
model_settings['load_in_4bit'] = True  # Ensure 4-bit quantization is enabled
model_settings['use_nested_quant'] = True  # Enable nested quantization for even more memory savings

# Save as a separate config for low-memory environments
with open('configs/llama3_reasoning_lowmem.yaml', 'w') as f:
    yaml.dump(memory_config, f)

print("Created low-memory configuration with aggressive memory optimizations")
print("If you still encounter memory issues, you can use this config instead:")
# Intentionally not an f-string: the printed line is a notebook command, and
# IPython expands {DRIVE_DATASET_PATH} when it is pasted into a cell.
print("!python -m src.trainers.qlora_trainer configs/llama3_reasoning_lowmem.yaml --dataset_path {DRIVE_DATASET_PATH}")

In [ ]:
# Check if fine-tuned model already exists
import os
if os.path.exists(os.path.join(DRIVE_MODEL_PATH, "adapter_model")):
    print(f"Fine-tuned model already exists at {DRIVE_MODEL_PATH}/adapter_model")
    print("Skipping training step. If you want to retrain, delete this directory from your Drive.")
else:
    # Fine-tune the model
    print(f"Starting fine-tuning process (this may take several hours)...")
    print(f"Model will be saved to {DRIVE_MODEL_PATH}")
    
    # Try with the regular drive config first, but if it fails, use the low memory config
    try:
        print("Using standard optimized configuration...")
        !python -m src.trainers.qlora_trainer configs/llama3_reasoning_drive.yaml --dataset_path {DRIVE_DATASET_PATH}
    except Exception as e:
        print(f"Standard training failed with error: {e}")
        print("Trying with more aggressive memory optimizations...")
        # Clear memory before retrying
        torch.cuda.empty_cache()
        gc.collect()
        !python -m src.trainers.qlora_trainer configs/llama3_reasoning_lowmem.yaml --dataset_path {DRIVE_DATASET_PATH}

## 6. Compare Models

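Compare the performance of the base model vs. the fine-tuned model. The fine-tuned accuracy is read from the evaluation results file in Drive when it exists; otherwise, and always for the base model, the plot uses placeholder values that you should replace with real measurements.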

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Only the parent 'evaluation' folder is created during setup, so make sure the
# results directory itself exists before the plot is written into it.
os.makedirs(DRIVE_EVAL_PATH, exist_ok=True)

# Try to load actual results from evaluation
results_path = os.path.join(DRIVE_EVAL_PATH, "Meta-Llama-3-8B_results.txt")
finetuned_results = {"accuracy": 0.75}  # Placeholder, used when no real value is found
loaded_actual_results = False

if os.path.exists(results_path):
    with open(results_path, 'r') as f:
        for line in f:
            # Expected line shape: "accuracy: <float>"
            if line.startswith("accuracy"):
                try:
                    finetuned_results["accuracy"] = float(line.split(":", 1)[1].strip())
                    loaded_actual_results = True
                except (IndexError, ValueError):
                    print(f"Ignoring unparsable accuracy line: {line.strip()!r}")

# Announce real results only when a value was actually parsed from the file.
if loaded_actual_results:
    print(f"Loaded actual evaluation results: {finetuned_results}")
else:
    print("Using placeholder results - actual evaluation results not found")

# Base model results (placeholder - replace with actual if available)
base_model_results = {"accuracy": 0.65}

# Create comparison dataframe
df = pd.DataFrame({
    "Model": ["Base Llama 3 8B", "Fine-tuned Llama 3 8B"],
    "Accuracy": [base_model_results["accuracy"], finetuned_results["accuracy"]]
})

# Plot comparison. The axes are created explicitly and handed to pandas so the
# requested figure size applies to the bar chart and no empty extra figure is
# left behind.
fig, ax = plt.subplots(figsize=(10, 6))
df.plot.bar(x="Model", y="Accuracy", rot=0, ax=ax)
ax.set_ylim(0, 1.0)
ax.set_ylabel("Accuracy")
ax.set_title("Reasoning Performance Comparison")

# Write each bar's value just above it.
for i, v in enumerate(df["Accuracy"]):
    ax.text(i, v + 0.02, f"{v:.2f}", ha="center")

fig.tight_layout()
plot_path = os.path.join(DRIVE_EVAL_PATH, "model_comparison.png")
fig.savefig(plot_path)
plt.show()

print(f"Comparison plot saved to {plot_path}")

## 7. Package LoRA Adapter for Download

Create a downloadable package of the adapter for later use.

In [None]:
# Define paths
adapter_path = os.path.join(DRIVE_MODEL_PATH, "adapter_model")

if os.path.exists(adapter_path):
    # Create export directory
    !mkdir -p {DRIVE_ADAPTER_PATH}
    
    # Copy adapter files
    !cp -r {adapter_path}/* {DRIVE_ADAPTER_PATH}/
    
    print(f"Adapter exported to {DRIVE_ADAPTER_PATH}")
    
    # Create a zip file for easy download
    !cd {DRIVE_BASE_PATH} && zip -r lora_adapter.zip lora_adapter
    print(f"Adapter ZIP file created at {DRIVE_ADAPTER_ZIP}")
    
    # Display file sizes
    !du -h {DRIVE_ADAPTER_PATH} {DRIVE_ADAPTER_ZIP}
else:
    print(f"Adapter not found at {adapter_path}")

## 8. Test the Fine-tuned Model

Try out the fine-tuned model on custom reasoning questions.

In [None]:
import os

from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Locate the adapter. Other cells of this notebook look for the trained adapter
# in DRIVE_MODEL_PATH/adapter_model, so that folder is used as a fallback when
# DRIVE_MODEL_PATH itself holds no adapter_config.json. When the config file is
# present directly in DRIVE_MODEL_PATH, that directory is used.
adapter_dir = DRIVE_MODEL_PATH
nested_adapter_dir = os.path.join(DRIVE_MODEL_PATH, "adapter_model")
if (not os.path.exists(os.path.join(adapter_dir, "adapter_config.json"))
        and os.path.exists(os.path.join(nested_adapter_dir, "adapter_config.json"))):
    adapter_dir = nested_adapter_dir
print(f"Loading adapter from {adapter_dir}")

# Load the adapter config (it records which base model the adapter belongs to)
config = PeftConfig.from_pretrained(adapter_dir)

# Load base model with authentication
# NOTE(review): passing load_in_8bit directly may be deprecated in the installed
# transformers version in favour of a quantization config — confirm against
# requirements.txt.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager",  # Use eager implementation instead of flash attention
    token=HF_TOKEN
)

# Attach the adapter weights for inference only
model = PeftModel.from_pretrained(base_model, adapter_dir, is_trainable=False)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    trust_remote_code=True,
    token=HF_TOKEN
)
# Fall back to the end-of-sequence token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Create text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

In [None]:
# Test on some custom questions
import os

test_questions = [
    "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?",
    "If no mammals can fly, and all bats can fly, what can we conclude about bats?",
    "If all A are B, and all B are C, what can we conclude about the relationship between A and C?"
]

# Only the parent 'evaluation' folder is created during setup, so make sure the
# results directory exists before opening a file inside it.
os.makedirs(DRIVE_EVAL_PATH, exist_ok=True)

# Create a file to store results (UTF-8, since generated text may contain
# non-ASCII characters)
test_results_path = os.path.join(DRIVE_EVAL_PATH, "custom_test_results.txt")
with open(test_results_path, "w", encoding="utf-8") as f:
    for question in test_questions:
        prompt = f"Question: {question}\n\nAnswer: "
        # return_full_text=False keeps only the newly generated continuation.
        result = pipe(prompt, return_full_text=False)[0]["generated_text"]

        print(f"Question: {question}")
        print(f"Answer: {result}")
        print("-" * 80)

        # Also write to file
        f.write(f"Question: {question}\n")
        f.write(f"Answer: {result}\n")
        f.write("-" * 80 + "\n\n")

print(f"Test results also saved to {test_results_path}")

## 9. Access Your Outputs After Colab Shutdown

All important files are now stored in your Google Drive and will persist even after the Colab session ends. Here's how to find them:

In [None]:
print(f"\n=== ALL OUTPUT LOCATIONS (in Google Drive) ===\n")
print(f"Root directory:     {DRIVE_BASE_PATH}")
print(f"Processed Dataset:  {DRIVE_DATASET_PATH}")
print(f"Fine-tuned Model:   {DRIVE_MODEL_PATH}")
print(f"LoRA Adapter:       {DRIVE_ADAPTER_PATH}")
print(f"LoRA Adapter ZIP:   {DRIVE_ADAPTER_ZIP}")
print(f"Evaluation Results: {DRIVE_EVAL_PATH}")

# List all saved directories in Drive
print("\n=== DIRECTORIES CREATED IN GOOGLE DRIVE ===\n")
!find {DRIVE_BASE_PATH} -type d | sort

In [None]:
# Final checklist: one line per artifact, ticked when its path exists in Drive.
summary_items = (
    ("Dataset", DRIVE_DATASET_PATH),
    ("Trained Model", DRIVE_MODEL_PATH),
    ("LoRA Adapter", DRIVE_ADAPTER_PATH),
    ("Evaluation Results", DRIVE_EVAL_PATH),
)

print("=== LLM Fine-tuning Summary ===")
for item_label, item_path in summary_items:
    status_mark = '✓' if os.path.exists(item_path) else '✗'
    print(f"{item_label}: {status_mark}")

print("\nAll files are stored in your Google Drive and will be available after this Colab session ends.")
print("\nTo use a different Drive location for future runs, just change the DRIVE_OUTPUT_DIR variable at the beginning of the notebook.")