In [2]:
!hostname
!pip install pandas==2.2.3 \
xgboost==2.1.2 \
catboost==1.2.7 \
lightgbm==4.5.0 \
loky==3.4.1 \
scikit-learn==1.5.2 \
joblib==1.4.2 \
seaborn==0.13.2 \
kaggle==1.6.17 \
tqdm==4.66.6 \
colorama==0.4.6 \
biosppy==0.8.0 \
neurokit2==0.2.10 \
imbalanced-learn==0.12.4 \
pywavelets==1.7.0 \
entropy==0.1.5 \
torch==2.5.1 \
torchvision==0.20.1


193-122-153-173
Defaulting to user installation because normal site-packages is not writeable


In [1]:
import sys
import os

base_path = './tibo1/Lab3/'

# Make the project directory importable without changing the working
# directory (presumably this is where the `utils`, `dataset` and `model`
# modules imported below live — TODO confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
_project_root = os.path.abspath(base_path)
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
import re
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader
import torch.optim as optim
from tqdm import tqdm
from utils import get_device
from torch import Tensor
from dataset import MyDataset
import multiprocessing
from model import UNet
import torch

### 1. Training
#### 1a. Load the Expert Data

In [3]:
# Read the four pre-processed expert arrays (train/validation inputs and
# targets) from disk, in the same order as the names on the left.
expert_X_train, expert_y_train, expert_X_val, expert_y_val = (
    np.load(npy_path)
    for npy_path in (
        './tibo1/Lab3/out/preprocessed/expert_X_train.npy',
        './tibo1/Lab3/out/preprocessed/expert_y_train.npy',
        './tibo1/Lab3/out/preprocessed/expert_X_val.npy',
        './tibo1/Lab3/out/preprocessed/expert_y_val.npy',
    )
)

In [10]:


# Runtime configuration shared by the cells below
BATCH_SIZE = 16
# Keep only the device kind as a plain string (presumably get_device() returns
# a torch.device — TODO confirm against utils)
device = get_device().type
# Mixed precision is enabled only when running on CUDA
use_amp = (device == 'cuda')


#### 1b. Create the Dataloaders with Transformations and Data Augmentation

In [14]:
# One DataLoader worker per CPU core reported by the OS
num_workers = multiprocessing.cpu_count()

# The training set is built with transform=True; the validation set is built
# without passing a transform argument
train_dataset = MyDataset(expert_X_train, expert_y_train, transform=True)
val_dataset = MyDataset(expert_X_val, expert_y_val)

# Only the training loader shuffles; everything else is shared
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True,
)

print(f"Using {num_workers} workers for DataLoader")


Using 30 workers for DataLoader


#### 1c. Define the Model (U-Net)

| **Aspect**             | **Concatenation**                         | **Addition**                              |
|-------------------------|-------------------------------------------|-------------------------------------------|
| **Feature Combination** | Combines all features (doubles channels). | Merges features (no change in channels).  |
| **Parameter Efficiency**| More parameters due to increased channels.| Fewer parameters; efficient.              |
| **Memory Usage**        | Higher memory usage.                      | Lower memory usage.                       |
| **Use Case**            | Tasks requiring high precision (e.g., segmentation).| Tasks prioritizing efficiency or residual learning. |


**Dimension Example**

For an input of $1 \times 1 \times 256 \times 256$ (batch size, channels, height, width):

| Step | Output Dimensions (Your UNet and Modular UNet) |
| --- | --- |
| Conv1 | $64 \times 256 \times 256$ |
| Pool1 | $64 \times 128 \times 128$ |
| Conv2 | $128 \times 128 \times 128$ |
| Pool2 | $128 \times 64 \times 64$ |
| Conv3 | $256 \times 64 \times 64$ |
| Pool3 | $256 \times 32 \times 32$ |
| Conv4 | $512 \times 32 \times 32$ |
| Pool4 | $512 \times 16 \times 16$ |
| Conv5 | $1024 \times 16 \times 16$ |
| Up6 + Skip4 | $1024 \times 32 \times 32$ |
| Conv6 | $512 \times 32 \times 32$ |
| Up7 + Skip3 | $512 \times 64 \times 64$ |
| Conv7 | $256 \times 64 \times 64$ |
| Up8 + Skip2 | $256 \times 128 \times 128$ |
| Conv8 | $128 \times 128 \times 128$ |
| Up9 + Skip1 | $128 \times 256 \times 256$ |
| Conv9 | $64 \times 256 \times 256$ |
| Output Conv | $\text{n\_classes} \times 256 \times 256$ |


In [6]:
# Build the U-Net (n_channels=1, n_classes=1, bilinear=False) and place it on
# the selected device in one step
model = UNet(n_classes=1, n_channels=1, bilinear=False).to(device)

#### 1d. Define the Loss Function and Optimizer

In [7]:
def power_jaccard_loss(y_true, y_pred, p=2, smooth=1e-6):
    """
    Power Jaccard loss, ``1 - J_p``, computed over all elements at once.

    From https://www.scitepress.org/Papers/2021/103040/103040.pdf

    Both tensors are cast to float and flattened, so a single score is
    produced for the whole input (no per-sample averaging)::

        J_p = (sum(a * b) + smooth) / (sum(a**p + b**p) - sum(a * b) + smooth)

    The expression is symmetric in its two tensor arguments, so the order in
    which predictions and targets are passed does not change the result.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values with the same number of elements.
        p: exponent applied to each tensor in the denominator.
        smooth: constant added to numerator and denominator to avoid 0/0.

    Returns:
        A scalar tensor holding ``1 - J_p``.
    """
    # reshape() also handles non-contiguous tensors (e.g. after a transpose or
    # permute), where view() raises a RuntimeError.
    y_true = y_true.float().reshape(-1)
    y_pred = y_pred.float().reshape(-1)

    intersection = (y_true * y_pred).sum()
    total = (torch.pow(y_true, p) + torch.pow(y_pred, p)).sum()
    union = total - intersection

    IoU = (intersection + smooth) / (union + smooth)

    return 1 - IoU

# Loss minimised during training
criterion = power_jaccard_loss

# RMSprop over all model parameters
LEARNING_RATE = 3e-4
optimizer = optim.RMSprop(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-8,
    momentum=0.999,
)

# Reduce the learning rate once the monitored value has stopped increasing
# ('max' mode) for more than `patience` epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=5)

#### 1g. Metrics for Evaluation

**Dice Coefficient**
- **Intuition** : Measures the overlap between predicted and ground truth masks. Values range from 0 (no overlap) to 1 (perfect overlap). Useful for imbalanced datasets.

- **Formula** :
$$
 \text{Dice Coefficient} = \frac{2 \cdot |A \cap B|}{|A| + |B|}
$$


---

**Dice Loss**
- **Intuition** : Penalizes poor segmentation overlap. Loss decreases as overlap improves.

- **Formula** :
$$
 \text{Dice Loss} = 1 - \text{Dice Coefficient}
$$


---

**IoU Coefficient**
- **Intuition** : Ratio of overlap to total area. Complements Dice for evaluating segmentation.

- **Formula** :
$$
 \text{IoU} = \frac{|A \cap B|}{|A \cup B|} = \frac{|A \cap B|}{|A| + |B| - |A \cap B|}
$$

In [8]:
# Define the Dice coefficient and Dice loss
def dice_coeff(input: Tensor, target: Tensor, epsilon: float = 1e-6):
    """Dice coefficient of two masks, derived from their IoU.

    The IoU is computed by ``iou_coeff`` (with ``epsilon`` as its smoothing
    term) and converted with the identity ``Dice = 2 * IoU / (1 + IoU)``.

    Args:
        input: first mask tensor.
        target: second mask tensor.
        epsilon: smoothing constant forwarded to ``iou_coeff``.

    Returns:
        Whatever numeric type ``iou_coeff`` returns, transformed as above.
    """
    jaccard = iou_coeff(input, target, smooth=epsilon)
    return 2 * jaccard / (1 + jaccard)

def dice_loss(input: Tensor, target: Tensor):
    """Dice loss: one minus the Dice coefficient of ``input`` and ``target``."""
    overlap = dice_coeff(input, target)
    return 1 - overlap

def iou_coeff(y_true: Tensor, y_pred: Tensor, smooth=1e-6):
    """Intersection-over-Union of two masks, computed over all elements.

    Both tensors are cast to float and flattened, so one score is produced for
    the whole input. The formula is symmetric in ``y_true`` and ``y_pred``.

    Args:
        y_true: first mask tensor.
        y_pred: second mask tensor with the same number of elements.
        smooth: constant added to numerator and denominator; two all-zero
            masks therefore score 1 instead of 0/0.

    Returns:
        A scalar tensor ``(intersection + smooth) / (union + smooth)``.
    """
    # reshape() also handles non-contiguous tensors (e.g. after a transpose or
    # permute), where view() raises a RuntimeError.
    y_true = y_true.float().reshape(-1)
    y_pred = y_pred.float().reshape(-1)

    # For binary masks the element-wise product sums to the true-positive count
    intersection = (y_true * y_pred).sum()
    # |A| + |B| counts the overlap twice, so subtract it once to get the union
    total = (y_true + y_pred).sum()
    union = total - intersection

    return (intersection + smooth) / (union + smooth)

#### 1e. Training Loop

In [15]:
import os
import re
from tqdm import tqdm
import torch
import traceback


# Directory holding per-epoch checkpoints and best-model snapshots
checkpoint_dir = './tibo1/Lab3/out/checkpoints/'
os.makedirs(checkpoint_dir, exist_ok=True)

# Initialize tracking variables
last_checkpoint = None
last_epoch = 0

# Find the checkpoint with the highest epoch number. re.match anchors at the
# start of the name, so 'fine_tune_checkpoint_epoch*.pth' files are ignored.
for file_name in os.listdir(checkpoint_dir):
    match = re.match(r'checkpoint_epoch(\d+)\.pth', file_name)
    if match:
        epoch_num = int(match.group(1))
        if epoch_num > last_epoch:
            last_epoch = epoch_num
            last_checkpoint = os.path.join(checkpoint_dir, file_name)

# Resume training if a checkpoint is found
if last_checkpoint:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    # weights_only is an argument of torch.load; load_state_dict() does not
    # accept it and raised TypeError. map_location lets a checkpoint written
    # on one device be restored on another.
    model.load_state_dict(
        torch.load(last_checkpoint, map_location=device, weights_only=True)
    )
    optimizer.load_state_dict(
        torch.load(
            os.path.join(checkpoint_dir, f'optimizer_epoch{last_epoch}.pth'),
            map_location=device,
            weights_only=True,
        )
    )
    # NOTE(review): the LR scheduler state is not checkpointed, so its plateau
    # counter restarts after a resume.
    print(f"Resumed from epoch {last_epoch}")
else:
    print("No checkpoint found. Starting training from scratch.")

# Update starting epoch
start_epoch = last_epoch + 1

# Number of epochs for training
EPOCHS = 40

if last_epoch == 0:
    # No checkpoint was found, so this is a fresh run: start with empty
    # histories instead of appending to lists left over in the kernel from an
    # earlier, unrelated run.
    train_losses, val_losses, val_dices, val_ious = [], [], [], []
else:
    # Resuming: keep any history already held in the kernel namespace
    train_losses = globals().get('train_losses', [])
    val_losses = globals().get('val_losses', [])
    val_dices = globals().get('val_dices', [])
    val_ious = globals().get('val_ious', [])

# Best validation loss seen so far. On a resume this is the minimum of the
# recorded history, so the first resumed epoch is not automatically saved as
# "best"; with no history it is +inf.
best_val_loss = min(val_losses, default=float('inf'))

# Get device
device = get_device()
print("Running on device:", device)
print("Batch Size: ", BATCH_SIZE)

# Training Loop
# Each epoch: one optimisation pass over train_loader, one evaluation pass over
# val_loader, a scheduler update, metric bookkeeping and checkpointing.
for epoch in range(start_epoch, EPOCHS + 1):
    model.train()
    epoch_loss = 0
    with tqdm(total=len(train_loader), desc=f'Epoch {epoch}/{EPOCHS}', unit='batch') as pbar:
        for images, masks in train_loader:
            images = images.to(device, dtype=torch.float32)
            masks = masks.to(device, dtype=torch.float32)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, masks)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Accumulate the per-batch loss; it is averaged over batches below
            epoch_loss += loss.item()
            pbar.set_postfix({'loss': loss.item()})
            pbar.update()

    # Validation
    model.eval()
    val_loss = 0
    dice_score = 0
    iou_score = 0
    with torch.no_grad():
        for images, masks in val_loader:
            images = images.to(device, dtype=torch.float32)
            masks = masks.to(device, dtype=torch.float32)

            outputs = model(images)
            # The loss is computed on the raw model outputs, before thresholding
            loss = criterion(outputs, masks)

            val_loss += loss.item()

            # Compute metrics
            # Binarise at 0.5 first — assumes the model emits probabilities
            # rather than logits; TODO confirm against the UNet definition
            outputs = (outputs > 0.5).float()
            dice_score += dice_coeff(outputs, masks).item()
            iou_score += iou_coeff(outputs, masks).item()

    # Turn the running sums into per-batch means
    val_loss /= len(val_loader)
    dice_score /= len(val_loader)
    iou_score /= len(val_loader)

    # Update learning rate scheduler
    # The scheduler monitors the mean validation Dice score
    scheduler.step(dice_score)

    # Save metrics
    train_losses.append(epoch_loss / len(train_loader))
    val_losses.append(val_loss)
    val_dices.append(dice_score)
    val_ious.append(iou_score)

    print(f'Epoch {epoch}: Train Loss {train_losses[-1]:.4f}, Val Loss {val_losses[-1]:.4f}, Dice {dice_score:.4f}, IoU {iou_score:.4f}')

    # Save model and optimizer checkpoint
    # Written every epoch so an interrupted run can be resumed
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, f'checkpoint_epoch{epoch}.pth'))
    torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, f'optimizer_epoch{epoch}.pth'))

    # Save the best model
    # A separate file is written for every epoch that improves the validation
    # loss; best_model_path keeps pointing at the most recent one
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_path = os.path.join(checkpoint_dir, f'best_model_epoch_{epoch}.pth')
        torch.save(model.state_dict(), best_model_path)
        print(f"Best model saved with validation loss {best_val_loss:.4f} at epoch {epoch}")

No checkpoint found. Starting training from scratch.
Running on device: cuda
Batch Size:  16


Epoch 1/40: 100%|██████████| 675/675 [01:15<00:00,  8.97batch/s, loss=0.838]


Epoch 1: Train Loss 0.7495, Val Loss 0.7798, Dice 0.2202, IoU 0.2202
Best model saved with validation loss 0.7798 at epoch 1


Epoch 2/40: 100%|██████████| 675/675 [01:13<00:00,  9.21batch/s, loss=0.739]


Epoch 2: Train Loss 0.7517, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 3/40: 100%|██████████| 675/675 [01:22<00:00,  8.14batch/s, loss=0.938]


Epoch 3: Train Loss 0.9178, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 4/40: 100%|██████████| 675/675 [01:40<00:00,  6.72batch/s, loss=1]    


Epoch 4: Train Loss 0.9421, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 5/40: 100%|██████████| 675/675 [01:02<00:00, 10.87batch/s, loss=0.878]


Epoch 5: Train Loss 0.9820, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 6/40: 100%|██████████| 675/675 [01:02<00:00, 10.88batch/s, loss=0.894]


Epoch 6: Train Loss 0.8787, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 7/40: 100%|██████████| 675/675 [01:02<00:00, 10.86batch/s, loss=0.851]


Epoch 7: Train Loss 0.8562, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 8/40: 100%|██████████| 675/675 [01:02<00:00, 10.85batch/s, loss=0.876]


Epoch 8: Train Loss 0.8296, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 9/40: 100%|██████████| 675/675 [01:02<00:00, 10.88batch/s, loss=0.866]


Epoch 9: Train Loss 0.8332, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 10/40: 100%|██████████| 675/675 [01:02<00:00, 10.88batch/s, loss=0.836]


Epoch 10: Train Loss 0.8270, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 11/40: 100%|██████████| 675/675 [01:02<00:00, 10.88batch/s, loss=0.844]


Epoch 11: Train Loss 0.8025, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 12/40: 100%|██████████| 675/675 [01:02<00:00, 10.85batch/s, loss=0.809]


Epoch 12: Train Loss 0.7910, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 13/40: 100%|██████████| 675/675 [01:18<00:00,  8.58batch/s, loss=0.725]


Epoch 13: Train Loss 0.7840, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 14/40: 100%|██████████| 675/675 [01:02<00:00, 10.87batch/s, loss=0.732]


Epoch 14: Train Loss 0.7616, Val Loss 0.7798, Dice 0.2202, IoU 0.2202


Epoch 15/40:  55%|█████▌    | 373/675 [00:34<00:28, 10.72batch/s, loss=0.785]


KeyboardInterrupt: 

#### 1f. Save and Visualize Training Metrics

In [None]:
def save_metrics(train_losses, val_losses, val_dices, val_ious, dir_figures, name='metrics.png'):
    """Plot the training history as three side-by-side panels and save it.

    Panels: train/validation loss, validation Dice coefficient, validation
    IoU. The x-axis of each panel is the position in the given sequence.

    Args:
        train_losses: sequence of training-loss values.
        val_losses: sequence of validation-loss values.
        val_dices: sequence of validation Dice values.
        val_ious: sequence of validation IoU values.
        dir_figures: directory the image is written to (created if missing).
        name: file name of the saved image.
    """
    os.makedirs(dir_figures, exist_ok=True)

    # Three panels for three plots; the previous 1x4 grid left a blank column.
    fig, (ax_loss, ax_dice, ax_iou) = plt.subplots(1, 3, figsize=(15, 5))

    # Training and validation losses share one panel
    ax_loss.plot(train_losses, label='Train Loss')
    ax_loss.plot(val_losses, label='Val Loss')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.set_title('Loss')
    ax_loss.legend()

    # Validation Dice coefficient
    ax_dice.plot(val_dices)
    ax_dice.set_xlabel('Epoch')
    ax_dice.set_ylabel('Dice Coefficient')
    ax_dice.set_title('Validation Dice Coefficient')

    # Validation IoU
    ax_iou.plot(val_ious)
    ax_iou.set_xlabel('Epoch')
    ax_iou.set_ylabel('IoU')
    ax_iou.set_title('Validation IoU')

    fig.tight_layout()
    # Save before showing: some backends discard the figure once it is shown
    fig.savefig(os.path.join(dir_figures, name))
    plt.show()
    plt.close(fig)

In [None]:

# Write the training-history figure to the figures directory
dir_figures = './tibo1/Lab3/figures'
os.makedirs(dir_figures, exist_ok=True)
save_metrics(
    train_losses,
    val_losses,
    val_dices,
    val_ious,
    dir_figures,
    name='metrics.png',
)

### 2. Fine-Tuning with Amateur Data
#### 2a. Load the Amateur Data

In [None]:
# Read the four pre-processed amateur arrays (train/validation inputs and
# targets) from disk, in the same order as the names on the left.
amateur_X_train, amateur_y_train, amateur_X_val, amateur_y_val = (
    np.load(npy_path)
    for npy_path in (
        './tibo1/Lab3/out/preprocessed/amateur_X_train.npy',
        './tibo1/Lab3/out/preprocessed/amateur_y_train.npy',
        './tibo1/Lab3/out/preprocessed/amateur_X_val.npy',
        './tibo1/Lab3/out/preprocessed/amateur_y_val.npy',
    )
)


#### 2b. Load the Pre-Trained Model

In [None]:
# Resolve the pre-trained weights to fine-tune from.
# `best_model_path` is normally set by the training loop when it saves an
# improved model. If no such save happened in this kernel session (e.g. the
# kernel was restarted after training finished) the name does not exist, so
# fall back to the highest-numbered best-model snapshot on disk instead of
# failing with NameError.
if 'best_model_path' not in globals():
    _snapshots = sorted(
        (int(found.group(1)), found.group(0))
        for found in (
            re.fullmatch(r'best_model_epoch_(\d+)\.pth', entry)
            for entry in os.listdir(checkpoint_dir)
        )
        if found
    )
    if not _snapshots:
        raise FileNotFoundError(f"No pre-trained best model found in {checkpoint_dir}")
    best_model_path = os.path.join(checkpoint_dir, _snapshots[-1][1])

# NOTE(review): the fine-tuning loop later rebinds best_model_path to a
# fine-tuned snapshot, so re-running this cell afterwards loads those weights.
print(f"Loading pre-trained weights from {best_model_path}")

# Load weights to CPU first
state_dict = torch.load(best_model_path, map_location=torch.device('cpu'), weights_only=True)

# Load the state dict into the model
model.load_state_dict(state_dict)

# Move the model to the target device
model = model.to(device)

#### 2c. Update the Dataloaders

In [None]:
# Fixed worker count for fine-tuning. Defined once so that the loaders and the
# message below agree; previously the message printed the stale value from the
# expert-data cell while the loaders used a hard-coded 5.
num_workers = 5

# The training set is built with transform=True; the validation set is built
# without passing a transform argument
train_dataset = MyDataset(amateur_X_train, amateur_y_train, transform=True)
val_dataset = MyDataset(amateur_X_val, amateur_y_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers, pin_memory=True)

print(f"Using {num_workers} workers for DataLoader")

#### 2d. Adjust Learning Rate for Fine-Tuning

In [None]:
# Fine-tune with a smaller step size: overwrite the learning rate of every
# parameter group of the existing optimizer in place
LEARNING_RATE = 1e-4
for param_group in optimizer.param_groups:
    param_group.update(lr=LEARNING_RATE)

#### 2e. Continue Training Loop for Fine-Tuning

In [None]:
import os
import re
from tqdm import tqdm
import torch

# Function to get the device
def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device.

    NOTE(review): this definition shadows the ``get_device`` imported from
    ``utils`` at the top of the notebook for every cell run after this one.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

# Directory holding per-epoch checkpoints and best-model snapshots
checkpoint_dir = './tibo1/Lab3/out/checkpoints/'
os.makedirs(checkpoint_dir, exist_ok=True)

last_checkpoint = None
last_epoch = 0

# Find the fine-tuning checkpoint with the highest epoch number
for file_name in os.listdir(checkpoint_dir):
    match = re.match(r'fine_tune_checkpoint_epoch(\d+)\.pth', file_name)
    if match:
        epoch_num = int(match.group(1))
        if epoch_num > last_epoch:
            last_epoch = epoch_num
            last_checkpoint = os.path.join(checkpoint_dir, file_name)

# Resume fine-tuning if a checkpoint exists
if last_checkpoint:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    # weights_only=True restricts unpickling to tensors and plain containers,
    # matching how the pre-training checkpoints are loaded; map_location lets
    # a checkpoint written on one device be restored on another.
    model.load_state_dict(
        torch.load(last_checkpoint, map_location=device, weights_only=True)
    )
    optimizer.load_state_dict(
        torch.load(
            os.path.join(checkpoint_dir, f'fine_tune_optimizer_epoch{last_epoch}.pth'),
            map_location=device,
            weights_only=True,
        )
    )
    print(f"Resumed from epoch {last_epoch}")
else:
    # The model is not re-initialised here: fine-tuning starts from whatever
    # weights it currently holds, so "from scratch" would be misleading.
    print("No fine-tuning checkpoint found. Starting fine-tuning from the currently loaded weights.")

# Update starting epoch
start_epoch = last_epoch + 1
EPOCHS_FINE_TUNE = 20  # Total number of epochs

best_val_loss = float('inf')  # Reset best validation loss
# Reuse metric lists already present in the kernel, so fine-tuning epochs are
# appended after any history recorded earlier in this session
train_losses = globals().get('train_losses', [])
val_losses = globals().get('val_losses', [])
val_dices = globals().get('val_dices', [])
val_ious = globals().get('val_ious', [])

# Get device and set up the gradient scaler for mixed precision
device = get_device()
use_amp = device.type == 'cuda'
# torch.cuda.amp.GradScaler() is deprecated in the pinned torch release; the
# device-agnostic torch.amp.GradScaler('cuda') is its replacement.
scaler = torch.amp.GradScaler('cuda') if use_amp else None

print(f"Running fine-tuning on device: {device}")

# Fine-tuning loop
# Each epoch: one optimisation pass over train_loader, one evaluation pass over
# val_loader, a scheduler update, metric bookkeeping and checkpointing.
# torch.amp.autocast('cuda') replaces the deprecated torch.cuda.amp.autocast().
for epoch in range(start_epoch, EPOCHS_FINE_TUNE + 1):
    model.train()
    epoch_loss = 0
    with tqdm(total=len(train_loader), desc=f'Fine-Tune Epoch {epoch}/{EPOCHS_FINE_TUNE}', unit='batch') as pbar:
        for images, masks in train_loader:
            images = images.to(device, dtype=torch.float32)
            masks = masks.to(device, dtype=torch.float32)

            optimizer.zero_grad()

            # Mixed precision training
            if use_amp:
                with torch.amp.autocast('cuda'):
                    outputs = model(images)
                    loss = criterion(outputs, masks)
                # Scale the loss before backward, then let the scaler unscale
                # the gradients, step the optimizer and adapt its scale factor
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(images)
                loss = criterion(outputs, masks)
                loss.backward()
                optimizer.step()

            # Accumulate the per-batch loss; it is averaged over batches below
            epoch_loss += loss.item()
            pbar.set_postfix({'loss': loss.item()})
            pbar.update()

    # Validation
    model.eval()
    val_loss = 0
    dice_score = 0
    iou_score = 0
    with torch.no_grad():
        for images, masks in val_loader:
            images = images.to(device, dtype=torch.float32)
            masks = masks.to(device, dtype=torch.float32)

            # Mixed precision validation
            if use_amp:
                with torch.amp.autocast('cuda'):
                    outputs = model(images)
                    loss = criterion(outputs, masks)
            else:
                outputs = model(images)
                loss = criterion(outputs, masks)

            val_loss += loss.item()

            # Compute metrics
            # Binarise at 0.5 first — assumes the model emits probabilities
            # rather than logits; TODO confirm against the UNet definition
            outputs = (outputs > 0.5).float()
            dice_score += dice_coeff(outputs, masks).item()
            iou_score += iou_coeff(outputs, masks).item()

    # Turn the running sums into per-batch means
    val_loss /= len(val_loader)
    dice_score /= len(val_loader)
    iou_score /= len(val_loader)

    # Update learning rate scheduler with the mean validation Dice score
    scheduler.step(dice_score)

    # Save metrics
    train_losses.append(epoch_loss / len(train_loader))
    val_losses.append(val_loss)
    val_dices.append(dice_score)
    val_ious.append(iou_score)

    print(f'Fine-Tune Epoch {epoch}: Train Loss {train_losses[-1]:.4f}, Val Loss {val_losses[-1]:.4f}, '
          f'Dice {dice_score:.4f}, IoU {iou_score:.4f}')

    # Save model and optimizer checkpoint every epoch so the run can be resumed
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, f'fine_tune_checkpoint_epoch{epoch}.pth'))
    torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, f'fine_tune_optimizer_epoch{epoch}.pth'))

    # Save the best model: a separate file is written for every epoch that
    # improves the validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_path = os.path.join(checkpoint_dir, f'best_fine_tuned_model_epoch{epoch}.pth')
        torch.save(model.state_dict(), best_model_path)
        print(f'Best model saved at epoch {epoch} with validation loss {best_val_loss:.4f}')


In [None]:
# Write the metric-history figure for the fine-tuning stage
dir_figures = './tibo1/Lab3/figures'
os.makedirs(dir_figures, exist_ok=True)
save_metrics(
    train_losses,
    val_losses,
    val_dices,
    val_ious,
    dir_figures,
    name='metrics_finetuned.png',
)