# Fine-tuning Llama 3 for Reasoning with QLoRA (Drive-Integrated)

This notebook demonstrates fine-tuning Llama 3 8B using QLoRA for improved reasoning capabilities, with all data saved to Google Drive for persistence.

## 1. Setup and Installation

First, check GPU availability, install dependencies, and set up persistent storage.

In [None]:
# Check GPU availability: run the NVIDIA status tool to confirm a GPU runtime is attached
# before installing anything or starting training.
!nvidia-smi

In [None]:
# Clone the repository
!git clone https://github.com/vmm/llm-trainer.git
%cd llm-trainer

In [None]:
# Install dependencies
!pip install -r requirements.txt

In [None]:
# Fix module import issues
import os
import sys

# Check and fix the working directory
if not os.path.exists('src'):
    # If we're not in the repo root, try to find it
    if os.path.exists('llm-trainer'):
        %cd llm-trainer
    else:
        # If we can't find it, raise an error
        raise FileNotFoundError("Cannot find repository root directory with 'src' folder")

# Add the current directory to Python's path
sys.path.append('.')
print(f"Working directory: {os.getcwd()}")
print(f"Python path includes current directory: {'./' in sys.path or '.' in sys.path}")

In [None]:
# Mount Google Drive for persistent storage
import os

from google.colab import drive
drive.mount('/content/drive')

# Configure output directory in Google Drive (change this to your preferred location)
DRIVE_OUTPUT_DIR = "llm-trainer-output"  # Will be created under /content/drive/MyDrive/

# You can change this to a different name if you want, e.g.,:
# DRIVE_OUTPUT_DIR = "my-llm-experiments/llama3-reasoning-1"

# Full path to the output directory
DRIVE_BASE_PATH = f"/content/drive/MyDrive/{DRIVE_OUTPUT_DIR}"

# Specific paths for different components
DRIVE_DATASET_PATH = f"{DRIVE_BASE_PATH}/datasets/natural_reasoning_processed"
DRIVE_MODEL_PATH = f"{DRIVE_BASE_PATH}/models/llama3_reasoning"
DRIVE_EVAL_PATH = f"{DRIVE_BASE_PATH}/evaluation/reasoning_results"
DRIVE_ADAPTER_PATH = f"{DRIVE_BASE_PATH}/lora_adapter"
DRIVE_ADAPTER_ZIP = f"{DRIVE_BASE_PATH}/lora_adapter.zip"

# Create project directories in Drive.
# os.makedirs is used instead of `!mkdir -p {path}` so a directory name containing
# spaces is not split by the shell. DRIVE_EVAL_PATH is created too because later
# cells write plots and result files directly into it.
for drive_dir in (
    f"{DRIVE_BASE_PATH}/datasets",
    f"{DRIVE_BASE_PATH}/models",
    f"{DRIVE_BASE_PATH}/evaluation",
    f"{DRIVE_BASE_PATH}/logs",
    DRIVE_EVAL_PATH,
):
    os.makedirs(drive_dir, exist_ok=True)

print(f"All outputs will be saved to Google Drive under: {DRIVE_BASE_PATH}")

In [None]:
# Define a function to perform final backup of all training artifacts
def backup_all_training_artifacts():
    """
    Copy every local artifact directory that exists to Google Drive.

    The local directories `output`, `data`, `logs`, `evaluation` and `runs` are
    copied into `models`, `datasets`, `logs`, `evaluation` and `tensorboard`
    under DRIVE_BASE_PATH respectively. Each directory is reported as backed up
    only when the copy command actually succeeded, so a failed copy is never
    presented as a successful backup.

    Call this at the end of training, or before shutting down the Colab
    instance. Returns None; all results are reported on stdout.
    """
    import os
    import shutil
    import subprocess

    def _copy_contents(local_dir, drive_dir):
        """Copy the contents of local_dir into drive_dir; return True if a copy command succeeded."""
        os.makedirs(drive_dir, exist_ok=True)
        commands = []
        if shutil.which("rsync"):
            # --no-links: symlinks (e.g. data/natural_reasoning_processed, which
            # points back into Drive) are skipped rather than recreated on Drive.
            commands.append(["rsync", "-a", "--no-links", f"{local_dir}/", f"{drive_dir}/"])
        # Fallback when rsync is unavailable or fails. "dir/." copies the
        # directory's contents, hidden files included.
        commands.append(["cp", "-r", f"{local_dir}/.", f"{drive_dir}/"])
        for command in commands:
            # Argument lists (no shell) keep paths with spaces intact.
            finished = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            if finished.returncode == 0:
                return True
        return False

    print(f"\n{'='*40}")
    print(f"PERFORMING FINAL BACKUP TO GOOGLE DRIVE")
    print(f"{'='*40}\n")

    # Create all required directories
    for subdir in ("models", "datasets", "logs", "evaluation"):
        os.makedirs(f"{DRIVE_BASE_PATH}/{subdir}", exist_ok=True)

    # (local directory, Drive destination, label used in messages)
    backup_plan = [
        ("output", f"{DRIVE_BASE_PATH}/models", "output directory"),
        ("data", f"{DRIVE_BASE_PATH}/datasets", "data directory"),
        ("logs", f"{DRIVE_BASE_PATH}/logs", "logs directory"),
        ("evaluation", f"{DRIVE_BASE_PATH}/evaluation", "evaluation directory"),
        ("runs", f"{DRIVE_BASE_PATH}/tensorboard", "tensorboard logs"),
    ]

    failed_dirs = []
    for local_dir, drive_dir, label in backup_plan:
        if not os.path.exists(local_dir):
            continue
        if _copy_contents(local_dir, drive_dir):
            print(f"✓ Backed up {label} to {drive_dir}/")
        else:
            failed_dirs.append(local_dir)
            print(f"✗ Could not fully back up {label} to {drive_dir}/ - please check it manually")

    print(f"\n{'='*40}")
    if failed_dirs:
        print(f"BACKUP FINISHED WITH ERRORS - CHECK: {', '.join(failed_dirs)}")
    else:
        print(f"BACKUP COMPLETED - ALL TRAINING ARTIFACTS SAVED")
    print(f"{'='*40}\n")

    # List all backed up directories
    print("Contents of Drive backup directory:")
    for directory in sorted(root for root, _dirs, _files in os.walk(DRIVE_BASE_PATH)):
        print(directory)

# The function is only defined here; nothing calls it automatically.
print("Final backup function defined. Run 'backup_all_training_artifacts()' at any time to ensure all")
print("training artifacts are completely backed up to Google Drive, or when you're finishing your work.")

In [None]:
# Set up comprehensive logging and periodic saves to Google Drive
import time
import threading
import os
import datetime
import sys
import logging
import glob
import shutil

# Create a log directory in Drive
DRIVE_LOG_PATH = f"{DRIVE_BASE_PATH}/logs"
os.makedirs(DRIVE_LOG_PATH, exist_ok=True)

# Set up logging to both console and file.
# force=True replaces any handlers already installed on the root logger; without
# it basicConfig silently does nothing when the environment has configured
# logging first, and nothing would reach the Drive log file.
log_file = f"{DRIVE_LOG_PATH}/training_log_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout)
    ],
    force=True,
)

# The stdout handler already echoes every record to the notebook, so messages
# are logged once instead of being printed and logged.
logging.info(f"Logging enabled to {log_file}")


def _copy_tree_via_staging(src_dir, dst_dir):
    """Copy src_dir to dst_dir through a hidden staging directory.

    The destination name only appears once the whole copy has finished, so a
    copy interrupted by a disconnect cannot later be mistaken for a complete
    checkpoint. The staging name starts with ".partial-" so it never matches
    "checkpoint-*" patterns.
    """
    staging_dir = os.path.join(os.path.dirname(dst_dir), f".partial-{os.path.basename(dst_dir)}")
    shutil.rmtree(staging_dir, ignore_errors=True)
    shutil.copytree(src_dir, staging_dir)
    os.rename(staging_dir, dst_dir)


def _save_checkpoint_once():
    """Copy the trainer state, new checkpoints, the adapter and new processed datasets to Drive."""
    models_dir = f"{DRIVE_BASE_PATH}/models"
    datasets_dir = f"{DRIVE_BASE_PATH}/datasets"

    # Ensure directories exist
    for drive_dir in (models_dir, datasets_dir, f"{DRIVE_BASE_PATH}/logs"):
        os.makedirs(drive_dir, exist_ok=True)

    local_run_dir = 'output/llama3_reasoning'
    if os.path.exists(local_run_dir):
        # Copy training state file to know where we left off
        state_file = os.path.join(local_run_dir, 'trainer_state.json')
        if os.path.exists(state_file):
            shutil.copy2(state_file, models_dir)
            logging.info("  - Saved trainer state file")

        # Copy checkpoint directories that are not on Drive yet.
        # NOTE(review): a checkpoint the trainer is still writing when this runs
        # is copied as-is; confirm the trainer's save order if that matters.
        for checkpoint in sorted(glob.glob(os.path.join(local_run_dir, 'checkpoint-*'))):
            if not os.path.isdir(checkpoint):
                continue
            checkpoint_name = os.path.basename(checkpoint)
            checkpoint_drive_path = os.path.join(models_dir, checkpoint_name)
            if not os.path.exists(checkpoint_drive_path):
                _copy_tree_via_staging(checkpoint, checkpoint_drive_path)
                logging.info(f"  - Saved new checkpoint: {checkpoint_name}")

        # The adapter is refreshed on every pass (existing files are overwritten)
        local_adapter = os.path.join(local_run_dir, 'adapter_model')
        if os.path.exists(local_adapter):
            shutil.copytree(local_adapter, os.path.join(models_dir, 'adapter_model'), dirs_exist_ok=True)
            logging.info("  - Saved adapter model")

    # Save any processed datasets that are not on Drive yet
    for dataset in sorted(glob.glob('data/*_processed')):
        dataset_name = os.path.basename(dataset)
        dataset_drive_path = os.path.join(datasets_dir, dataset_name)
        if not os.path.exists(dataset_drive_path):
            _copy_tree_via_staging(dataset, dataset_drive_path)
            logging.info(f"  - Saved dataset: {dataset_name}")

    # The log file is written directly inside DRIVE_LOG_PATH, so it needs no
    # separate copy (copying it onto itself only produced a cp error).


# Function to save checkpoints to Google Drive
def save_checkpoint_periodically(interval=900):  # 900 seconds = 15 minutes
    """Run forever, saving training artifacts to Drive every `interval` seconds.

    Intended to run in a daemon thread. A failure in one pass is logged and the
    loop continues, so a transient Drive error does not end periodic saving.
    """
    while True:
        time.sleep(interval)
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        logging.info(f"[{timestamp}] Saving checkpoint to Google Drive...")
        try:
            _save_checkpoint_once()
        except Exception:
            logging.exception(f"[{timestamp}] Checkpoint save failed; will retry on the next interval")
        else:
            logging.info(f"[{timestamp}] Checkpoint save completed")


# Start the checkpoint thread (only once, even if this cell is re-run)
_running_checkpoint_thread = globals().get('checkpoint_thread')
if _running_checkpoint_thread is not None and _running_checkpoint_thread.is_alive():
    logging.info("Automatic checkpointing thread is already running; not starting another one")
else:
    checkpoint_thread = threading.Thread(target=save_checkpoint_periodically, daemon=True)
    checkpoint_thread.start()
    logging.info(f"Automatic checkpointing to Drive enabled (every 15 minutes)")
logging.info(f"All outputs will persist in: {DRIVE_BASE_PATH}")

## 2. Create Drive-Integrated Configuration

Update the configuration to save outputs to Google Drive and disable Flash Attention to avoid errors.

In [None]:
# Update training config to save to Drive and optimize memory usage
import yaml

with open('configs/llama3_reasoning.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Fetch (or create) the training section once, so the output directory update
# and the memory settings below cannot disagree about whether it exists.
training_cfg = config.setdefault('training', {})

# Update output directory to use our Drive path
training_cfg['output_dir'] = DRIVE_MODEL_PATH

# Disable Flash Attention to avoid errors in Colab
if 'model' in config and 'use_flash_attention' in config['model']:
    config['model']['use_flash_attention'] = False
    print("Flash Attention disabled to avoid errors")

# Optimize for Colab memory constraints
print("Optimizing training configuration for Colab memory constraints...")
# Reduce batch sizes
training_cfg['per_device_train_batch_size'] = 2
training_cfg['per_device_eval_batch_size'] = 2
# Increase gradient accumulation to maintain effective batch size
training_cfg['gradient_accumulation_steps'] = 16
# Reduce dataloader workers to avoid shared memory issues
training_cfg['dataloader_num_workers'] = 1
# Enable mixed precision
training_cfg['fp16'] = True
# fp16 and bf16 cannot both be enabled in Hugging Face TrainingArguments.
# NOTE(review): assumes the project trainer forwards these keys to TrainingArguments - confirm.
if training_cfg.get('bf16'):
    training_cfg['bf16'] = False
    print("bf16 disabled because fp16 is being enabled")
# Disable torch compilation which can use more memory
if 'torch_compile' in training_cfg:
    training_cfg['torch_compile'] = False
print("Training hyperparameters adjusted for memory efficiency")

# Reduce sequence length if needed
if 'dataset' in config and 'max_seq_length' in config['dataset']:
    original_length = config['dataset']['max_seq_length']
    if original_length > 1024:
        config['dataset']['max_seq_length'] = 1024
        print(f"Reduced sequence length from {original_length} to {config['dataset']['max_seq_length']} tokens")

# Save updated config
with open('configs/llama3_reasoning_drive.yaml', 'w') as f:
    yaml.dump(config, f)

print(f"Updated config saved to configs/llama3_reasoning_drive.yaml with output_dir={DRIVE_MODEL_PATH}")

## 3. Authenticate and Process Data

Authenticate with Hugging Face to access the gated Llama 3 model, then process the dataset.

In [None]:
# Authenticate with Hugging Face
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook: it would be saved with the file and
# leak when the notebook is shared. Use an existing environment variable when
# one is set, otherwise prompt for the token without echoing it.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
if not HF_TOKEN:
    HF_TOKEN = getpass("Hugging Face access token: ").strip()
if not HF_TOKEN:
    raise ValueError("A Hugging Face access token is required to download the gated Llama 3 model")

# Log in to Hugging Face
login(token=HF_TOKEN)

# Set environment variable for other libraries
os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
os.environ["HF_TOKEN"] = HF_TOKEN

In [None]:
# Create a smaller subset of the dataset for faster training
from datasets import load_from_disk, DatasetDict, Dataset

# Share of each split kept in the subset; it is also embedded in the subset's
# directory name on Drive.
DATASET_SUBSET_PERCENTAGE = 10  # Use only 10% of the data

def compute_subset_size(original_size, percentage, minimum=100):
    """Return how many examples to keep from a split of `original_size` rows.

    The result is `percentage` percent of the split (rounded down), raised to
    `minimum` when that would be smaller, and never more than the split
    actually contains.
    """
    wanted = max(int(original_size * percentage / 100), minimum)
    return min(wanted, original_size)


def create_dataset_subset(dataset_path, subset_path, percentage=10):
    """Create a smaller subset of a dataset for faster training.

    Args:
        dataset_path: Directory of a dataset previously saved with save_to_disk;
            it is iterated as a mapping of split name to split data.
        subset_path: Directory the subset is written to.
        percentage: Share of each split to keep (at least 100 examples per
            split, capped at the split size).

    Returns:
        The subset as a DatasetDict, or None when anything went wrong (the
        error is printed, not raised, so callers must check for None).
    """
    print(f"Creating {percentage}% subset of dataset from {dataset_path}")

    # Load the original dataset
    try:
        original_dataset = load_from_disk(dataset_path)
        print(f"Original dataset loaded with splits: {list(original_dataset.keys())}")

        # Create a subset of each split
        subset_dict = {}
        for split_name, split_data in original_dataset.items():
            original_size = len(split_data)
            # Capped at the split size: asking select() for more rows than a
            # small split holds (fewer than 100 examples) would raise.
            subset_size = compute_subset_size(original_size, percentage)

            # Fixed seed so the same subset is drawn on every run
            subset = split_data.shuffle(seed=42).select(range(subset_size))
            subset_dict[split_name] = subset

            print(f"Split '{split_name}': {original_size} → {subset_size} examples ({percentage}%)")

        # Create a new DatasetDict with the subset
        subset_dataset = DatasetDict(subset_dict)

        # Save the subset to disk
        subset_dataset.save_to_disk(subset_path)
        print(f"Subset saved to {subset_path}")

        return subset_dataset

    except Exception as e:
        # Deliberate best-effort: the caller falls back to the full dataset on None
        print(f"Error creating dataset subset: {e}")
        return None

# Define paths
import os

# The full dataset path is derived from DRIVE_BASE_PATH rather than copied from
# DRIVE_DATASET_PATH: this cell re-points DRIVE_DATASET_PATH at the subset, so
# copying it on a re-run would make "full" silently mean "subset".
FULL_DATASET_PATH = f"{DRIVE_BASE_PATH}/datasets/natural_reasoning_processed"
SUBSET_DATASET_PATH = f"{DRIVE_BASE_PATH}/datasets/natural_reasoning_subset_{DATASET_SUBSET_PERCENTAGE}pct"

# Check if subset already exists
if os.path.exists(SUBSET_DATASET_PATH):
    print(f"Dataset subset already exists at {SUBSET_DATASET_PATH}")
    # Update the dataset path to use the subset
    DRIVE_DATASET_PATH = SUBSET_DATASET_PATH
elif not os.path.exists(FULL_DATASET_PATH):
    # Nothing to sample from yet; the full dataset is produced by a later cell.
    print(f"Full dataset not found at {FULL_DATASET_PATH}. "
          "Process the dataset in the cell below, then re-run this cell to create the subset.")
else:
    # Create the subset
    subset = create_dataset_subset(FULL_DATASET_PATH, SUBSET_DATASET_PATH, DATASET_SUBSET_PERCENTAGE)
    if subset is not None:
        # Update the dataset path to use the subset
        DRIVE_DATASET_PATH = SUBSET_DATASET_PATH
        print(f"✅ Now using dataset subset at: {DRIVE_DATASET_PATH}")

In [None]:
# Build the lightweight config: start from the Drive config, cap the sequence
# length at 1024 tokens and train for a single epoch.
import yaml

with open('configs/llama3_reasoning_drive.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Cap the sequence length (only when the setting is present and larger than 1024)
dataset_section = config['dataset'] if 'dataset' in config else {}
if 'max_seq_length' in dataset_section:
    original_length = dataset_section['max_seq_length']
    if original_length > 1024:
        dataset_section['max_seq_length'] = 1024
        print(f"Reduced sequence length from {original_length} to 1024 tokens")

# Limit training to one epoch (only when the setting is present and larger than 1)
training_section = config['training'] if 'training' in config else {}
if 'num_train_epochs' in training_section:
    original_epochs = training_section['num_train_epochs']
    if original_epochs > 1:
        training_section['num_train_epochs'] = 1
        print(f"Reduced training epochs from {original_epochs} to 1")

# Write the result next to the other configs
with open('configs/llama3_reasoning_lightweight.yaml', 'w') as f:
    yaml.dump(config, f)

print(f"Lightweight training config saved to configs/llama3_reasoning_lightweight.yaml")

In [None]:
# Check if dataset already exists in Drive
import os
from datasets import load_from_disk, DatasetDict

if os.path.exists(DRIVE_DATASET_PATH):
    print(f"Dataset found at {DRIVE_DATASET_PATH}")
    
    # Load the dataset to check for validation split
    dataset = load_from_disk(DRIVE_DATASET_PATH)
    
    # Check if validation split exists
    if 'validation' not in dataset.keys():
        print("No validation split found in dataset. Creating validation split...")
        
        # Create validation split (10% of train data)
        if 'train' in dataset:
            split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
            
            # Create new dataset with validation split
            updated_dataset = DatasetDict({
                'train': split_dataset['train'],
                'validation': split_dataset['test']
            })
            
            # Save the updated dataset back to the same location
            updated_dataset.save_to_disk(DRIVE_DATASET_PATH)
            print(f"✅ Created validation split from train data. Updated dataset saved to {DRIVE_DATASET_PATH}")
            
            # Update the dataset variable
            dataset = updated_dataset
    
    # Create a symlink to local directory for easier access
    !mkdir -p data
    !ln -sf {DRIVE_DATASET_PATH} data/natural_reasoning_processed
    print(f"✅ Using dataset from Drive: {DRIVE_DATASET_PATH}")
    
else:
    # Process the dataset and save directly to Drive
    print(f"Processing dataset and saving to {DRIVE_DATASET_PATH}...")
    !python -m src.data_processors.reasoning_processor --config configs/llama3_reasoning.yaml --output_path {DRIVE_DATASET_PATH}
    
    # Check if validation split was created during processing
    dataset = load_from_disk(DRIVE_DATASET_PATH)
    if 'validation' not in dataset.keys():
        print("No validation split was created during processing. Creating one now...")
        
        # Create validation split (10% of train data)
        if 'train' in dataset:
            split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
            
            # Create new dataset with validation split
            updated_dataset = DatasetDict({
                'train': split_dataset['train'],
                'validation': split_dataset['test']
            })
            
            # Save the updated dataset back to the same location
            updated_dataset.save_to_disk(DRIVE_DATASET_PATH)
            print(f"✅ Created validation split from train data. Updated dataset saved to {DRIVE_DATASET_PATH}")
    
    # Create a symlink to local directory for easier access
    !mkdir -p data
    !ln -sf {DRIVE_DATASET_PATH} data/natural_reasoning_processed

In [None]:
# Sanity-check the processed dataset through the local symlink
from datasets import load_from_disk

try:
    dataset = load_from_disk("data/natural_reasoning_processed")

    # Report which splits exist and how large the two standard ones are
    print(f"Dataset splits: {dataset.keys()}")
    for split_key, split_label in (('train', 'Train'), ('validation', 'Validation')):
        if split_key in dataset:
            print(f"{split_label} size: {len(dataset[split_key])}")

    # Show the first record of the first split
    print("\nExample data:")
    split_names = list(dataset.keys())
    first_split = dataset[split_names[0]]
    print(first_split[0])
except Exception as e:
    # Best-effort check only: a failure here is reported, not raised
    print(f"Error loading dataset: {e}")
    print("Will attempt to process dataset again during training if needed.")

In [None]:
# Setup a heartbeat file to detect if Colab session disconnects
import threading
import time
import os
import datetime

# Create a heartbeat directory
HEARTBEAT_PATH = f"{DRIVE_BASE_PATH}/heartbeat"
os.makedirs(HEARTBEAT_PATH, exist_ok=True)

# The heartbeat file lives directly on Drive, so writing it is the backup;
# no separate copy step is needed.
heartbeat_file = f"{HEARTBEAT_PATH}/heartbeat.txt"

def update_heartbeat():
    """Rewrite the heartbeat file every 60 seconds with the current timestamp.

    Intended to run in a daemon thread. A failed write is reported and retried
    on the next tick rather than ending the thread.
    """
    while True:
        try:
            # Write current timestamp to heartbeat file
            with open(heartbeat_file, 'w') as f:
                timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                f.write(f"Last heartbeat: {timestamp}\n")
                f.write(f"If you're seeing this file, it means the Colab session was running at {timestamp}.\n")
                f.write(f"If this timestamp is old, the session likely disconnected at that time.\n")
        except OSError as e:
            print(f"Heartbeat update failed: {e}")

        # Wait for 60 seconds
        time.sleep(60)

# Start heartbeat thread (only once, even if this cell is re-run)
_running_heartbeat_thread = globals().get('heartbeat_thread')
if _running_heartbeat_thread is None or not _running_heartbeat_thread.is_alive():
    heartbeat_thread = threading.Thread(target=update_heartbeat, daemon=True)
    heartbeat_thread.start()

print(f"Heartbeat monitoring enabled - tracking session activity at {heartbeat_file}")
print(f"If Colab disconnects, you can check when it happened by looking at this file in your Drive.")

## Lightweight Llama 3 Training

This section uses a small subset of the data and reduced training parameters to make Llama 3 fine-tuning feasible on Colab's resources. It:

1. Creates a 10% subset of the original dataset
2. Reduces maximum sequence length
3. Decreases the number of training epochs
4. Uses optimized memory settings

This approach should complete in 3-4 hours instead of days.

In [None]:
# Clean up memory before training
import gc
import torch

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared")
    
# Run garbage collection
gc.collect()
print("Garbage collection completed")

# Show current GPU memory usage
if torch.cuda.is_available():
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
    
# Print current GPU usage
!nvidia-smi | grep MiB

In [None]:
# Create a modified config file with more aggressive memory optimizations
import yaml

# Load the existing drive config
with open('configs/llama3_reasoning_drive.yaml', 'r') as f:
    memory_config = yaml.safe_load(f)

# Add more aggressive memory optimizations if needed.
# setdefault creates the 'model' section when the config has none, instead of
# failing with a KeyError.
model_section = memory_config.setdefault('model', {})
model_section['load_in_4bit'] = True  # Ensure 4-bit quantization is enabled
model_section['use_nested_quant'] = True  # Enable nested quantization for even more memory savings

# Save as a separate config for low-memory environments
with open('configs/llama3_reasoning_lowmem.yaml', 'w') as f:
    yaml.dump(memory_config, f)

print("Created low-memory configuration with aggressive memory optimizations")
print("If you still encounter memory issues, you can use this config instead:")
# Deliberately not an f-string: the braces are printed literally so the command
# can be pasted into a cell, where IPython expands {DRIVE_DATASET_PATH}.
print("!python -m src.trainers.qlora_trainer configs/llama3_reasoning_lowmem.yaml --dataset_path {DRIVE_DATASET_PATH}")

In [None]:
# Function to check for existing training state and set up for resuming
def check_for_resume_point():
    """Check if there's an existing training state to resume from"""
    import os
    import glob
    import json
    import re
    import yaml
    
    # First check if the fine-tuned model already exists (complete training)
    if os.path.exists(os.path.join(DRIVE_MODEL_PATH, "adapter_model")):
        print(f"✓ Fine-tuned model already exists at {DRIVE_MODEL_PATH}/adapter_model")
        print("Skipping training step. If you want to retrain, delete this directory from your Drive.")
        return True
    
    # If not complete, check for checkpoints to resume from
    print("Looking for checkpoints to resume training...")
    checkpoints = glob.glob(f"{DRIVE_MODEL_PATH}/checkpoint-*")
    
    if checkpoints:
        # Find the latest checkpoint by sorting (checkpoint numbers should be sequential)
        checkpoints.sort(key=lambda x: int(re.search(r'checkpoint-(\d+)', x).group(1)), reverse=True)
        latest_checkpoint = checkpoints[0]
        checkpoint_num = re.search(r'checkpoint-(\d+)', latest_checkpoint).group(1)
        
        print(f"✓ Found checkpoint: {latest_checkpoint}")
        
        # Check if trainer state exists
        trainer_state_path = os.path.join(DRIVE_MODEL_PATH, "trainer_state.json")
        if os.path.exists(trainer_state_path):
            try:
                with open(trainer_state_path, 'r') as f:
                    trainer_state = json.load(f)
                total_steps = trainer_state.get('max_steps', 'unknown')
                completed_steps = trainer_state.get('global_step', 0)
                print(f"✓ Training was at step {completed_steps}/{total_steps}")
            except Exception as e:
                print(f"Could not parse trainer state: {e}")
        
        # Create local output dir if needed
        !mkdir -p output/llama3_reasoning
        
        # Copy checkpoint to local storage for use
        local_checkpoint = f"output/llama3_reasoning/checkpoint-{checkpoint_num}"
        if not os.path.exists(local_checkpoint):
            print(f"Copying checkpoint from Drive to local storage for resuming...")
            !mkdir -p {local_checkpoint}
            !cp -r {latest_checkpoint}/* {local_checkpoint}/
        
        # Add resume flag to config
        print(f"Modifying config to resume from checkpoint...")
        with open('configs/llama3_reasoning_resume.yaml', 'r') as f:
            config = yaml.safe_load(f)
        
        # Point to local checkpoint for resuming
        config['model']['adapter_name_or_path'] = local_checkpoint
        
        with open('configs/llama3_reasoning_resume.yaml', 'w') as f:
            yaml.dump(config, f)
            
        print(f"⏳ Resuming training from checkpoint-{checkpoint_num}...")
        print(f"Model will continue saving to {DRIVE_MODEL_PATH}")
        
        try:
            print("Resuming with standard configuration...")
            !python -m src.trainers.qlora_trainer configs/llama3_reasoning_resume.yaml --dataset_path {DRIVE_DATASET_PATH}
        except Exception as e:
            print(f"Resuming with standard config failed: {e}")
            print("Trying with more aggressive memory optimizations...")
            
            # Update low memory config for resuming
            with open('configs/llama3_reasoning_lowmem.yaml', 'r') as f:
                low_config = yaml.safe_load(f)
            
            low_config['model']['adapter_name_or_path'] = local_checkpoint
            
            with open('configs/llama3_reasoning_lowmem_resume.yaml', 'w') as f:
                yaml.dump(low_config, f)
            
            # Clear memory before retrying
            torch.cuda.empty_cache()
            gc.collect()
            !python -m src.trainers.qlora_trainer configs/llama3_reasoning_lowmem_resume.yaml --dataset_path {DRIVE_DATASET_PATH}
        
        return True  # Training was resumed
    
    return False  # No resumption point found

In [None]:
# Start the training process with the lightweight configuration
print("Starting lightweight fine-tuning process (this should take 3-4 hours)...")
print(f"Model will be saved to {DRIVE_MODEL_PATH}")
print(f"Using 10% of the dataset and reduced parameters")

# Use our lightweight config
!python -m src.trainers.qlora_trainer configs/llama3_reasoning_lightweight.yaml

In [None]:
# Run the final backup to ensure all training artifacts are saved.
# NOTE(review): the message below is printed unconditionally; this cell does not
# check whether the preceding training command actually succeeded.
print("Training completed! Performing final backup to ensure all artifacts are saved...")
backup_all_training_artifacts()

In [None]:
# Main training execution - checks for resume point or starts fresh
if not check_for_resume_point():
    print("No existing checkpoints found. Starting new training...")
    print(f"Model will be saved to {DRIVE_MODEL_PATH}")
    
    # Try with the regular drive config first, but if it fails, use the low memory config
    try:
        print("Using standard optimized configuration...")
        !python -m src.trainers.qlora_trainer configs/llama3_reasoning_drive.yaml --dataset_path {DRIVE_DATASET_PATH}
    except Exception as e:
        print(f"Standard training failed with error: {e}")
        print("Trying with more aggressive memory optimizations...")
        # Clear memory before retrying
        torch.cuda.empty_cache()
        gc.collect()
        !python -m src.trainers.qlora_trainer configs/llama3_reasoning_lowmem.yaml --dataset_path {DRIVE_DATASET_PATH}

## 6. Compare Models

Compare the performance of the base model vs. the fine-tuned model.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Make sure the evaluation directory exists before anything is saved into it
os.makedirs(DRIVE_EVAL_PATH, exist_ok=True)

# Try to load actual results from evaluation
results_path = os.path.join(DRIVE_EVAL_PATH, "Meta-Llama-3-8B_results.txt")
finetuned_results = {"accuracy": 0.75}  # Default if file doesn't exist

if os.path.exists(results_path):
    with open(results_path, 'r') as f:
        for line in f:
            if not line.startswith("accuracy"):
                continue
            # Expected form "accuracy...: <number>"; a malformed line is
            # reported and skipped instead of aborting the cell.
            _label, separator, raw_value = line.partition(":")
            try:
                if not separator:
                    raise ValueError("missing ':'")
                finetuned_results["accuracy"] = float(raw_value.strip())
            except ValueError:
                print(f"Skipping unparseable accuracy line: {line.strip()!r}")
    print(f"Loaded actual evaluation results: {finetuned_results}")
else:
    print("Using placeholder results - actual evaluation results not found")

# Base model results (placeholder - replace with actual if available)
base_model_results = {"accuracy": 0.65}

# Create comparison dataframe
df = pd.DataFrame({
    "Model": ["Base Llama 3 8B", "Fine-tuned Llama 3 8B"],
    "Accuracy": [base_model_results["accuracy"], finetuned_results["accuracy"]]
})

# Plot comparison. The axes are created explicitly and handed to pandas;
# calling plt.figure() first would leave an empty extra figure behind, because
# DataFrame.plot opens its own figure unless it is given an `ax`.
fig, ax = plt.subplots(figsize=(10, 6))
df.plot.bar(x="Model", y="Accuracy", rot=0, ax=ax)
ax.set_ylim(0, 1.0)
ax.set_ylabel("Accuracy")
ax.set_title("Reasoning Performance Comparison")

# Label each bar with its value
for i, v in enumerate(df["Accuracy"]):
    ax.text(i, v + 0.02, f"{v:.2f}", ha="center")

fig.tight_layout()
plot_path = os.path.join(DRIVE_EVAL_PATH, "model_comparison.png")
fig.savefig(plot_path)
plt.show()

print(f"Comparison plot saved to {plot_path}")

## 7. Package LoRA Adapter for Download

Create a downloadable package of the adapter for later use.

In [None]:
# Define paths
adapter_path = os.path.join(DRIVE_MODEL_PATH, "adapter_model")

if os.path.exists(adapter_path):
    # Create export directory
    !mkdir -p {DRIVE_ADAPTER_PATH}
    
    # Copy adapter files
    !cp -r {adapter_path}/* {DRIVE_ADAPTER_PATH}/
    
    print(f"Adapter exported to {DRIVE_ADAPTER_PATH}")
    
    # Create a zip file for easy download
    !cd {DRIVE_BASE_PATH} && zip -r lora_adapter.zip lora_adapter
    print(f"Adapter ZIP file created at {DRIVE_ADAPTER_ZIP}")
    
    # Display file sizes
    !du -h {DRIVE_ADAPTER_PATH} {DRIVE_ADAPTER_ZIP}
else:
    print(f"Adapter not found at {adapter_path}")

## 8. Test the Fine-tuned Model

Try out the fine-tuned model on custom reasoning questions.

In [None]:
import os

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel, PeftConfig

# Locate the adapter. Other cells look for it in DRIVE_MODEL_PATH/adapter_model,
# so that location is used when it holds an adapter config; otherwise fall back
# to DRIVE_MODEL_PATH itself.
adapter_dir = os.path.join(DRIVE_MODEL_PATH, "adapter_model")
if not os.path.exists(os.path.join(adapter_dir, "adapter_config.json")):
    adapter_dir = DRIVE_MODEL_PATH

# Load the adapter config
config = PeftConfig.from_pretrained(adapter_dir)

# Load base model with authentication.
# Quantization is requested through BitsAndBytesConfig; passing load_in_8bit
# directly to from_pretrained is deprecated.
# NOTE(review): trust_remote_code=True allows code from the model repository to
# run locally - confirm it is actually needed for this base model.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager",  # Use eager implementation instead of flash attention
    token=HF_TOKEN
)

# Load adapter model (inference only)
model = PeftModel.from_pretrained(base_model, adapter_dir, is_trainable=False)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    trust_remote_code=True,
    token=HF_TOKEN
)
# Generation needs a padding token; reuse end-of-sequence when none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Create text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

In [None]:
# Test on some custom questions
import os

test_questions = [
    "If all roses are flowers and some flowers fade quickly, can we conclude that some roses fade quickly?",
    "If no mammals can fly, and all bats can fly, what can we conclude about bats?",
    "If all A are B, and all B are C, what can we conclude about the relationship between A and C?"
]

# Create a file to store results. The directory is created first so the cell
# does not fail when no evaluation has written to it yet, and the encoding is
# explicit because generated text may contain non-ASCII characters.
os.makedirs(DRIVE_EVAL_PATH, exist_ok=True)
test_results_path = os.path.join(DRIVE_EVAL_PATH, "custom_test_results.txt")
with open(test_results_path, "w", encoding="utf-8") as f:
    for question in test_questions:
        prompt = f"Question: {question}\n\nAnswer: "
        # return_full_text=False keeps only the generated continuation
        result = pipe(prompt, return_full_text=False)[0]["generated_text"]

        print(f"Question: {question}")
        print(f"Answer: {result}")
        print("-" * 80)

        # Also write to file
        f.write(f"Question: {question}\n")
        f.write(f"Answer: {result}\n")
        f.write("-" * 80 + "\n\n")

print(f"Test results also saved to {test_results_path}")

## 10. Analyze Training Progress After Session Restart

If your Colab session was disconnected and you've returned to a new session, run the cells below to get insights into what happened during the previous run.

In [None]:
# Summarize what the previous Colab session left behind in Google Drive:
# the heartbeat file, the trainer state, the saved checkpoints and the newest log.
# Requires DRIVE_BASE_PATH from the Drive setup cell.
import os
import glob
import json
import datetime
from collections import deque


def _checkpoint_step(path):
    """Return the step number from a ``checkpoint-<step>`` path, or None if it has none."""
    parts = os.path.basename(path).split('-')
    if len(parts) > 1 and parts[1].isdecimal():
        return int(parts[1])
    return None


print(f"{'='*50}")
print("PREVIOUS SESSION ANALYSIS")
print(f"{'='*50}\n")

# Check if heartbeat file exists
heartbeat_path = f"{DRIVE_BASE_PATH}/heartbeat/heartbeat.txt"
if os.path.exists(heartbeat_path):
    with open(heartbeat_path, 'r') as f:
        heartbeat_content = f.read()
    print("Last session heartbeat:")
    print(f"{heartbeat_content}\n")
else:
    print("No heartbeat file found. Previous session may not have created one.\n")

# Check for training state
# NOTE(review): the setup cell defines DRIVE_MODEL_PATH as
# f"{DRIVE_BASE_PATH}/models/llama3_reasoning"; confirm that the trainer really
# writes trainer_state.json and checkpoint-* directly under "models/".
trainer_state_path = f"{DRIVE_BASE_PATH}/models/trainer_state.json"
if os.path.exists(trainer_state_path):
    try:
        with open(trainer_state_path, 'r') as f:
            trainer_state = json.load(f)

        # Extract key information
        total_steps = trainer_state.get('max_steps', 'unknown')
        completed_steps = trainer_state.get('global_step', 0)
        # "or [{}]" also covers a log_history that is present but still empty,
        # which would otherwise raise IndexError and hide the progress report.
        log_history = trainer_state.get('log_history') or [{}]
        last_log = log_history[-1]

        # Calculate percentage complete
        if isinstance(total_steps, int) and total_steps > 0:
            pct_complete = (completed_steps / total_steps) * 100
            print(f"Training Progress: {completed_steps}/{total_steps} steps ({pct_complete:.1f}% complete)")
        else:
            print(f"Training Progress: {completed_steps}/{total_steps} steps")

        # Show last metrics
        if last_log:
            print("\nLast logged metrics:")
            for key, value in last_log.items():
                if isinstance(value, float):
                    print(f"  - {key}: {value:.5f}")
                else:
                    print(f"  - {key}: {value}")

        print("")
    except Exception as e:
        print(f"Error parsing trainer state: {e}\n")
else:
    print("No trainer state file found. Training may not have started or saved state.\n")

# List checkpoints
# Keep only entries whose suffix is a step number, so a stray match such as
# "checkpoint-final" cannot crash the numeric sort with ValueError.
checkpoints = [
    path
    for path in glob.glob(f"{DRIVE_BASE_PATH}/models/checkpoint-*")
    if _checkpoint_step(path) is not None
]
if checkpoints:
    checkpoints.sort(key=_checkpoint_step)
    print(f"Found {len(checkpoints)} checkpoints:")
    for i, checkpoint in enumerate(checkpoints[-5:], 1):  # Show last 5 checkpoints
        checkpoint_num = _checkpoint_step(checkpoint)
        checkpoint_time = datetime.datetime.fromtimestamp(os.path.getmtime(checkpoint)).strftime('%Y-%m-%d %H:%M:%S')
        print(f"  {i}. checkpoint-{checkpoint_num} (saved at {checkpoint_time})")

    print(f"\nLatest checkpoint: {os.path.basename(checkpoints[-1])}")
    print("To resume training from this checkpoint, continue with the notebook\n")
else:
    print("No checkpoints found in Drive. Training may not have saved any checkpoints yet.\n")

# Check logs
log_files = glob.glob(f"{DRIVE_BASE_PATH}/logs/*.txt")
if log_files:
    # Newest first, by modification time
    log_files.sort(key=os.path.getmtime, reverse=True)
    latest_log = log_files[0]
    log_time = datetime.datetime.fromtimestamp(os.path.getmtime(latest_log)).strftime('%Y-%m-%d %H:%M:%S')

    print(f"Latest log file: {os.path.basename(latest_log)} (last modified: {log_time})")
    print("Last 10 lines of log:")

    try:
        # deque(maxlen=10) keeps only the tail while streaming the file, so a
        # large log is never held in memory in full; errors='replace' lets a
        # partially written or oddly encoded log still be displayed.
        with open(latest_log, 'r', errors='replace') as f:
            for line in deque(f, maxlen=10):
                print(f"  {line.strip()}")
    except Exception as e:
        print(f"Error reading log file: {e}")
else:
    print("No log files found in Drive.")

print(f"\n{'='*50}")
print("To resume training, continue with the notebook from Cell 3 (Setup)")
print(f"{'='*50}")

## 9. Access Your Outputs After Colab Shutdown

All important files are now stored in your Google Drive and will persist even after the Colab session ends. Here's how to find them:

In [None]:
print(f"\n=== ALL OUTPUT LOCATIONS (in Google Drive) ===\n")
print(f"Root directory:     {DRIVE_BASE_PATH}")
print(f"Processed Dataset:  {DRIVE_DATASET_PATH}")
print(f"Fine-tuned Model:   {DRIVE_MODEL_PATH}")
print(f"LoRA Adapter:       {DRIVE_ADAPTER_PATH}")
print(f"LoRA Adapter ZIP:   {DRIVE_ADAPTER_ZIP}")
print(f"Evaluation Results: {DRIVE_EVAL_PATH}")

# List all saved directories in Drive
print("\n=== DIRECTORIES CREATED IN GOOGLE DRIVE ===\n")
!find {DRIVE_BASE_PATH} -type d | sort

In [None]:
# Report, for each expected output, whether its path exists in Google Drive.
summary_targets = (
    ("Dataset", DRIVE_DATASET_PATH),
    ("Trained Model", DRIVE_MODEL_PATH),
    ("LoRA Adapter", DRIVE_ADAPTER_PATH),
    ("Evaluation Results", DRIVE_EVAL_PATH),
)

print("=== LLM Fine-tuning Summary ===")
for artifact_name, artifact_path in summary_targets:
    # Check mark when the path exists, cross otherwise.
    status_mark = '✓' if os.path.exists(artifact_path) else '✗'
    print(f"{artifact_name}: {status_mark}")

print("\nAll files are stored in your Google Drive and will be available after this Colab session ends.")
print("\nTo use a different Drive location for future runs, just change the DRIVE_OUTPUT_DIR variable at the beginning of the notebook.")