In [None]:
# xBD Pipeline: Model Training

This notebook demonstrates how to:
1. Configure and initialize models
2. Set up training parameters
3. Train models with experiment tracking
4. Monitor training progress


In [None]:
import sys
sys.path.append('..')

import torch
import mlflow
from pathlib import Path
from src.models.localization import UNet
from src.utils.training_utils import calculate_iou, save_checkpoint
from src.utils.tracking import ExperimentTracker
from src.data.dataset import XBDDataset
from torch.utils.data import DataLoader


In [None]:
## 1. Configure Training


In [None]:
# Training configuration
# Everything a reader might want to tune lives in this one dict; the whole
# dict is logged to the experiment tracker below so each run records its setup.
config = {
    'data': {
        'train_dir': '../Data/train',
        'val_dir': '../Data/val',
        'image_size': (512, 512),
        'batch_size': 8,
        'num_workers': 2
    },
    'model': {
        'architecture': 'unet',
        'encoder': 'resnet34',
        'num_classes': 1,
        'learning_rate': 0.001
    },
    'training': {
        'epochs': 10,
        # Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU
        'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        # Seed for PyTorch's random number generators; stored in the config so it
        # is logged alongside the other run parameters
        'seed': 42
    }
}

# Seed PyTorch before any model or DataLoader is created so that weight
# initialisation and shuffling order start from a known state on a fresh
# kernel. NOTE(review): this alone does not guarantee bit-exact runs (GPU
# kernels, and any non-torch randomness inside the dataset's augmentation,
# are not controlled here).
torch.manual_seed(config['training']['seed'])

# Initialize experiment tracking
tracker = ExperimentTracker(
    experiment_name='building_segmentation',
    run_name='unet_training_example'
)

# Log parameters
tracker.log_params(config)


In [None]:
## 2. Prepare Data


In [None]:
# Build the training and validation datasets in that order. Both use the same
# image size; the `augment` flag is switched on for the training split only.
train_dataset, val_dataset = (
    XBDDataset(
        data_dir=Path(config['data'][dir_key]),
        image_size=config['data']['image_size'],
        augment=use_augment
    )
    for dir_key, use_augment in (('train_dir', True), ('val_dir', False))
)

# Wrap each dataset in a DataLoader. Batch size and worker count are shared;
# only the training loader shuffles its samples.
train_loader, val_loader = (
    DataLoader(
        split_dataset,
        batch_size=config['data']['batch_size'],
        shuffle=do_shuffle,
        num_workers=config['data']['num_workers']
    )
    for split_dataset, do_shuffle in ((train_dataset, True), (val_dataset, False))
)

# Report how many samples each split contains
print(
    f'Training samples: {len(train_dataset)}',
    f'Validation samples: {len(val_dataset)}',
    sep='\n'
)


In [None]:
## 3. Initialize Model


In [None]:
# Build the network from the configured encoder / class count and move it to
# the configured device in one expression
model = UNet(
    encoder=config['model']['encoder'],
    num_classes=config['model']['num_classes']
).to(config['training']['device'])

# Adam over all model parameters, using the configured learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=config['model']['learning_rate'])

# The loss function is supplied by the model itself
criterion = model.get_loss_fn()

# Summarise the setup: total parameter element count and the target device
print(
    f'Model parameters: {sum(p.numel() for p in model.parameters())}',
    f'Training on: {config["training"]["device"]}',
    sep='\n'
)


In [None]:
## 4. Training Loop


In [None]:
from tqdm.notebook import tqdm

def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and report averaged metrics.

    Args:
        model: Network to train; switched to training mode on entry.
        loader: Sized iterable yielding ``(images, masks)`` batches.
        criterion: Callable ``criterion(outputs, masks)`` returning the loss.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(mean_loss, mean_iou)``: the per-batch loss and the per-batch
        ``calculate_iou`` result, each summed and divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` contains no batches.
    """
    model.train()
    loss_sum, iou_sum = 0, 0

    for batch_images, batch_masks in tqdm(loader, desc='Training'):
        batch_images, batch_masks = batch_images.to(device), batch_masks.to(device)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_masks)
        batch_loss.backward()
        optimizer.step()

        # Binarise predictions at probability 0.5; detach so the metric does
        # not take part in autograd
        predicted = logits.detach().sigmoid() > 0.5

        loss_sum += batch_loss.item()
        iou_sum += calculate_iou(predicted, batch_masks)

    num_batches = len(loader)
    return loss_sum / num_batches, iou_sum / num_batches

def validate(model, loader, criterion, device):
    """Evaluate ``model`` over ``loader`` without computing gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode on entry.
        loader: Sized iterable yielding ``(images, masks)`` batches.
        criterion: Callable ``criterion(outputs, masks)`` returning the loss.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(mean_loss, mean_iou)``: the per-batch loss and the per-batch
        ``calculate_iou`` result, each summed and divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` contains no batches.
    """
    model.eval()
    loss_sum, iou_sum = 0, 0

    # Gradient tracking is disabled for the whole evaluation pass
    with torch.no_grad():
        for batch_images, batch_masks in tqdm(loader, desc='Validation'):
            batch_images, batch_masks = batch_images.to(device), batch_masks.to(device)

            logits = model(batch_images)
            loss_sum += criterion(logits, batch_masks).item()

            # Binarise predictions at probability 0.5 before scoring
            predicted = logits.sigmoid() > 0.5
            iou_sum += calculate_iou(predicted, batch_masks)

    num_batches = len(loader)
    return loss_sum / num_batches, iou_sum / num_batches

# Training loop
# Runs the configured number of epochs; after each epoch it logs train/val
# metrics to the tracker and saves a checkpoint whenever validation IoU
# strictly improves on the best value seen so far.

# Single source of truth for the checkpoint location, used both for saving
# and for artifact logging
CHECKPOINT_FILE = '../output/checkpoints/best_model.pt'
checkpoint_path = Path(CHECKPOINT_FILE)

# Make sure the target directory exists on a fresh checkout. This is a no-op
# if it is already there (or if save_checkpoint creates it itself).
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

# Best validation IoU so far; starting at 0 means a checkpoint is only
# written once validation IoU is strictly positive
best_val_iou = 0

try:
    for epoch in range(config['training']['epochs']):
        print(f'\nEpoch {epoch + 1}/{config["training"]["epochs"]}')

        # Train
        train_loss, train_iou = train_epoch(
            model, train_loader, criterion, optimizer,
            config['training']['device']
        )

        # Validate
        val_loss, val_iou = validate(
            model, val_loader, criterion,
            config['training']['device']
        )

        # Log metrics (step is the zero-based epoch index)
        metrics = {
            'train_loss': train_loss,
            'train_iou': train_iou,
            'val_loss': val_loss,
            'val_iou': val_iou
        }
        tracker.log_metrics(metrics, step=epoch)

        print(f'Train Loss: {train_loss:.4f}, Train IoU: {train_iou:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val IoU: {val_iou:.4f}')

        # Save best model
        if val_iou > best_val_iou:
            best_val_iou = val_iou
            save_checkpoint(
                checkpoint_path,
                model=model,
                optimizer=optimizer,
                epoch=epoch,
                loss=val_loss,
                config=config
            )
            tracker.log_artifact(CHECKPOINT_FILE)
finally:
    # Always close the tracking run, even when training fails or is
    # interrupted, so no run is left open in the kernel
    tracker.end_run()
