# CubeDiff: Training Pipeline

This notebook demonstrates the training pipeline for CubeDiff:

1. Configure training parameters
2. Initialize the model and training components
3. Train on a small dataset
4. Monitor training progress
5. Save checkpoints

In [None]:
import os
import sys
import numpy as np
import torch
from diffusers import StableDiffusionPipeline, DDPMScheduler
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.auto import tqdm
import json
import time

# Add parent directory to path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import custom modules
from model.architecture import CubeDiffModel
from data.dataset import CubemapDataset, get_dataloader
from training.trainer import CubeDiffTrainer
from training.lora import add_lora_to_model

## 1. Define Training Configuration

In [None]:
# Configuration container for the mini training run
class TrainingConfig:
    """Model, training and logging settings for the CubeDiff mini run.

    Every setting is stored as a plain instance attribute, so individual
    values can be overridden on an instance after construction.
    """

    def __init__(self):
        # Model config
        model_settings = {
            "pretrained_model_name": "runwayml/stable-diffusion-v1-5",
            "lora_rank": 16,
            "lora_alpha": 16,
            "prediction_type": "v_prediction",  # or "epsilon"
        }

        # Training config
        training_settings = {
            "output_dir": "../outputs/cubediff_mini",
            "data_dir": "../data/processed/cubemaps",
            "captions_file": "../data/processed/captions.json",
            "batch_size": 1,
            "learning_rate": 1e-4,
            "min_learning_rate": 1e-6,
            "weight_decay": 0.01,
            "max_grad_norm": 1.0,
            "num_workers": 2,
            "gradient_accumulation_steps": 4,
            "mixed_precision": "fp16",
        }

        # Logging config
        logging_settings = {
            "use_wandb": False,
            "wandb_project": "cubediff",
            "wandb_run_name": "cubediff_mini",
            "log_every_n_steps": 10,
            "save_every_n_steps": 100,
            "eval_every_n_steps": 100,
        }

        # Assign group by group so the attribute order matches the listing above
        for settings_group in (model_settings, training_settings, logging_settings):
            for attr_name, attr_value in settings_group.items():
                setattr(self, attr_name, attr_value)


# Configuration instance read by the cells that follow
config = TrainingConfig()

## 2. Environment Check

Let's confirm that we have the required resources available.

In [None]:
# Report which accelerator (if any) PyTorch can see
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    print("GPU not available, training will be slow on CPU.")
else:
    # Details are reported for device 0 only
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU memory: {device_props.total_memory / 1024 / 1024 / 1024:.2f} GB")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

# Confirm the configured dataset location is present
data_path_exists = os.path.exists(config.data_dir)
print(f"Data directory exists: {data_path_exists}")

# Make sure the output location is there (a no-op when it already exists)
os.makedirs(config.output_dir, exist_ok=True)
print(f"Output directory: {config.output_dir}")

## 3. Prepare Dataset

In [None]:
# Build the training dataset from the configured cubemap folder and captions file
train_dataset = CubemapDataset(data_dir=config.data_dir, captions_file=config.captions_file)

# The mini example has no held-out split: validation reuses the training set
val_dataset = train_dataset

print(f"Training dataset size: {len(train_dataset)}")

# Peek at the first item; it is indexed by 'caption' and 'faces' below
sample = train_dataset[0]
print(f"Sample caption: {sample['caption']}")
print(f"Sample faces shape: {sample['faces'].shape}")

## 4. Initialize Trainer

In [None]:
# Build the trainer from the shared configuration.
# NOTE(review): the model name, output directory, precision mode and accumulation
# steps are passed explicitly *and* are present on `config`; confirm which source
# CubeDiffTrainer actually reads.
trainer = CubeDiffTrainer(
    pretrained_model_name=config.pretrained_model_name,
    mixed_precision=config.mixed_precision,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    output_dir=config.output_dir,
    config=config,
)

## 5. Run Mini-Training

For this demonstration, we'll run a small number of training steps.

In [None]:
# Run a mini training session
num_steps = 250  # Just for demonstration

# NOTE(review): `num_steps` is handed to the trainer as `num_train_epochs`. Whether
# that counts optimizer steps or full passes over the dataset depends on
# CubeDiffTrainer.train — confirm, because the per-step figures printed below
# assume it is a step count.

# perf_counter() is a monotonic clock, so the measured interval cannot be skewed
# by a system clock adjustment while the (long-running) training call executes.
start_time = time.perf_counter()

# Train the model
trainer.train(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    num_train_epochs=num_steps
)

end_time = time.perf_counter()
training_time = end_time - start_time  # elapsed seconds; reused by the estimation cell

# Computed once so the two extrapolations below cannot drift apart
seconds_per_step = training_time / num_steps

print(f"\nTraining completed in {training_time:.2f} seconds")
print(f"Average time per step: {seconds_per_step:.2f} seconds")
print(f"Estimated time for 30000 steps: {seconds_per_step * 30000 / 3600:.2f} hours")

## 6. Visualize Training Progress

In [None]:
# If wandb was used, we can visualize the training curves
if config.use_wandb:
    # wandb is only needed on this branch, so it is imported here and the
    # notebook still runs without it when logging is disabled.
    import wandb

    # Get run history
    # NOTE(review): the run is looked up as "<project>/<run name>"; confirm this
    # matches the run path format expected by wandb.Api().run for your account.
    api = wandb.Api()
    run = api.run(f"{config.wandb_project}/{config.wandb_run_name}")

    # Get loss history
    history = run.history()

    # Plot loss curve (validation loss is drawn only if it was logged)
    plt.figure(figsize=(10, 6))
    plt.plot(history['train/loss'], label='Training Loss')
    if 'val/loss' in history.columns:
        plt.plot(history['val/loss'], label='Validation Loss')
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.title('Training Loss Curve')
    plt.legend()
    plt.grid(True)
    plt.show()
else:
    print("WandB logging was disabled, no training curves available.")

    # Instead, let's look at the checkpoint files
    checkpoint_dir = config.output_dir
    if os.path.exists(checkpoint_dir):
        checkpoints = [d for d in os.listdir(checkpoint_dir) if d.startswith('checkpoint-')]
        print(f"Saved checkpoints: {checkpoints}")

        # Collect (step, size in MB) for every checkpoint that has a model.pt file
        size_by_step = []
        for ckpt in checkpoints:
            ckpt_path = os.path.join(checkpoint_dir, ckpt, "model.pt")
            if not os.path.exists(ckpt_path):
                continue
            try:
                step = int(ckpt.split('-')[1])
            except ValueError:
                # e.g. "checkpoint-final": no step number to place on the x-axis
                continue
            size_by_step.append((step, os.path.getsize(ckpt_path) / (1024 * 1024)))

        # os.listdir() returns entries in arbitrary order; sort by step so the
        # connecting line runs left to right instead of zig-zagging.
        size_by_step.sort()
        steps = [step for step, _ in size_by_step]
        sizes = [size for _, size in size_by_step]

        if sizes:
            plt.figure(figsize=(10, 6))
            plt.plot(steps, sizes, 'o-')
            plt.xlabel('Training Step')
            plt.ylabel('Checkpoint Size (MB)')
            plt.title('Model Checkpoint Size Evolution')
            plt.grid(True)
            plt.show()
        else:
            print("No checkpoint files found.")

## 7. Test Checkpoint Loading

In [None]:
# Get the latest checkpoint
# Keep only "checkpoint-<int>" entries: a name such as "checkpoint-final" has no
# step number and would otherwise make the sort below raise ValueError.
numbered_checkpoints = []
for entry in os.listdir(config.output_dir):
    if not entry.startswith('checkpoint-'):
        continue
    try:
        numbered_checkpoints.append((int(entry.split('-')[1]), entry))
    except ValueError:
        continue

# Sort by step number so the last element is the most recent checkpoint
checkpoints = [entry for _, entry in sorted(numbered_checkpoints)]

if checkpoints:
    latest_checkpoint = os.path.join(config.output_dir, checkpoints[-1], "model.pt")

    print(f"Latest checkpoint: {latest_checkpoint}")

    # Load the checkpoint to test it
    if os.path.exists(latest_checkpoint):
        try:
            # Initialize a new model
            # NOTE(review): this model is built without any LoRA setup; if the
            # trainer saves LoRA weights the state-dict keys may not match — confirm.
            test_model = CubeDiffModel(config.pretrained_model_name)

            # torch.load unpickles the file, so only load checkpoints produced by
            # a trusted run. NOTE(review): consider weights_only=True if the
            # installed PyTorch version supports it.
            test_model.load_state_dict(torch.load(latest_checkpoint, map_location="cpu"))

            print("Successfully loaded checkpoint")
        except Exception as e:
            # Deliberately broad: this cell is a smoke test and reports any failure
            print(f"Error loading checkpoint: {e}")
    else:
        print("Checkpoint file not found")
else:
    print("No checkpoints found")

## 8. Estimate Full Training Resources

Based on the mini-training, let's estimate the resources needed for full training.

In [None]:
# Estimate full training resources
# `training_time` only exists if the mini-training cell completed
if 'training_time' in locals():
    full_run_steps = 30000   # length of the planned full training run
    session_hours = 8        # length of one daily training session
    gpu_hourly_rate = 1.25   # assumed $/hour for one L4 GPU on GCP
    daily_budget = 80        # daily budget in dollars

    # Extrapolate the measured per-step time to the full run
    total_hours = (training_time / num_steps) * full_run_steps / 3600
    days_needed = total_hours / session_hours

    print(f"Estimated resources for full {full_run_steps:,} steps training:")
    print(f"Total training time: {total_hours:.2f} hours")
    print(f"Training days with {session_hours}-hour sessions: {days_needed:.2f} days")

    # Estimate GPU memory usage
    if torch.cuda.is_available():
        # Peak memory allocated on device 0 so far in this process
        memory_used = torch.cuda.max_memory_allocated(0) / 1024 / 1024 / 1024  # GB
        print(f"Peak GPU memory usage: {memory_used:.2f} GB")

        # Check if we need to optimize further
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024  # GB
        if memory_used > 0.9 * total_memory:
            print("WARNING: Memory usage is close to capacity. Consider reducing batch size or model size.")
        else:
            print(f"Memory headroom: {(total_memory - memory_used):.2f} GB")

    # device_count() is 0 on a CPU-only machine, which would make every cost below
    # print as $0.00; price at least one GPU so the estimate stays meaningful.
    billed_gpus = max(torch.cuda.device_count(), 1)
    hourly_rate = gpu_hourly_rate * billed_gpus  # Cost per hour for all GPUs
    estimated_cost = hourly_rate * total_hours

    print(f"Estimated cost (at ${hourly_rate:.2f}/hour): ${estimated_cost:.2f}")

    # Check if within budget
    daily_cost = hourly_rate * session_hours  # Cost for one daily session

    print(f"Daily cost ({session_hours} hours): ${daily_cost:.2f} (Budget: ${daily_budget})")
    if daily_cost > daily_budget:
        print("WARNING: Daily cost exceeds budget.")
else:
    print("No training data available for estimation.")

## 9. Next Steps

Based on this mini-training session, here are the next steps for full CubeDiff implementation:

1. **Expand the dataset**:
   - Collect and process more panorama images
   - Ensure diversity in scene types (indoor/outdoor, natural/urban)
   - Create high-quality captions for all images

2. **Optimize training parameters**:
   - Adjust learning rate and scheduling based on mini-training results
   - Fine-tune LoRA parameters for better efficiency
   - Consider increasing the gradient accumulation steps for a larger effective batch size

3. **Set up long-running training**:
   - Configure automatic checkpointing for 8-hour sessions
   - Implement a robust session management system
   - Ensure training can be resumed from checkpoints

4. **Implement evaluation metrics**:
   - Face consistency measures
   - FID score for panorama quality
   - Text-image alignment metrics for conditional generation

5. **Create inference pipeline**:
   - Build an optimized inference system
   - Implement text-to-panorama and image-to-panorama modes
   - Optimize for high-resolution output