In [1]:
import os
import torch
import torchvision
import xml.etree.ElementTree as ET
from PIL import Image
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import transforms
from torchvision.models.detection import FasterRCNN, fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Set random seeds for reproducibility.
# The same seed is reused below as `random_state` for the train/validation split.
random_seed = 42
np.random.seed(random_seed)  # NumPy's global RNG, in case any library code draws from it
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7fbc4c0ef2f0>

In [2]:
# Paths
train_val_path = "train"
test_path = "valid"


def _list_aligned_pairs(directory):
    """Return ``(image_paths, annotation_paths)`` for ``directory``, aligned index-by-index.

    Every ``.jpg`` file is paired with the ``.xml`` file that shares its stem.
    Sorting the two file lists independently only yields aligned lists when
    every image has exactly one annotation; a single unpaired file would shift
    all following pairs. Here images without an annotation (and annotations
    without an image) are left out and reported instead.

    The returned lists are ordered by image file name.
    """
    filenames = set(os.listdir(directory))
    image_paths, annotation_paths = [], []
    unpaired_images = 0

    for image_name in sorted(name for name in filenames if name.endswith('.jpg')):
        annotation_name = os.path.splitext(image_name)[0] + '.xml'
        if annotation_name not in filenames:
            unpaired_images += 1
            continue
        image_paths.append(os.path.join(directory, image_name))
        annotation_paths.append(os.path.join(directory, annotation_name))

    xml_count = sum(name.endswith('.xml') for name in filenames)
    unpaired_annotations = xml_count - len(annotation_paths)
    if unpaired_images or unpaired_annotations:
        print(
            f"Warning: in '{directory}' skipped {unpaired_images} image(s) without an annotation "
            f"and {unpaired_annotations} annotation(s) without an image."
        )

    return image_paths, annotation_paths


# Images and annotations for the training/validation pool
images, annotations = _list_aligned_pairs(train_val_path)

# Images and annotations for the held-out test set
test_images, test_annotations = _list_aligned_pairs(test_path)


In [3]:
# Pascal VOC category names in label order. Labels are numbered from 1;
# the detector built later in this notebook uses 21 classes (these 20 plus background).
_VOC_CLASS_NAMES = (
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
)

# Category name -> numerical label
VOC_CLASSES = {name: label for label, name in enumerate(_VOC_CLASS_NAMES, start=1)}

class PascalVOCDataset(Dataset):
    """Detection dataset backed by parallel lists of image and Pascal VOC XML files.

    Args:
        image_paths: sequence of image file paths.
        annotation_paths: sequence of XML annotation paths; entry ``i`` must
            describe image ``i``.
        transform: optional callable applied to the loaded PIL image.

    Each item is ``(image, target)`` where ``target`` is a dict with
    ``"boxes"`` (float32 tensor of shape ``(N, 4)`` as ``[xmin, ymin, xmax, ymax]``)
    and ``"labels"`` (int64 tensor of shape ``(N,)`` with ``VOC_CLASSES`` ids).
    """

    def __init__(self, image_paths, annotation_paths, transform=None):
        self.image_paths = image_paths
        self.annotation_paths = annotation_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        annotation_path = self.annotation_paths[idx]
        image = Image.open(img_path).convert("RGB")
        original_size = image.size  # PIL reports (width, height)

        # Load annotations (bounding boxes and labels)
        boxes, labels = self.load_annotations(annotation_path)

        if self.transform:
            image = self.transform(image)
            # The annotations are in the original image's pixel frame; if the
            # transform changed the image size the boxes must follow.
            boxes = self._rescale_boxes(boxes, original_size, image)

        target = {
            "boxes": boxes,
            "labels": labels
        }

        return image, target

    @staticmethod
    def _rescale_boxes(boxes, original_size, transformed_image):
        """Scale ``boxes`` from ``original_size`` (width, height) to the transformed image's size.

        Assumes the transform only resized the whole image (no crop, flip or
        padding); geometric transforms of that kind need their own box handling.
        Boxes are returned unchanged when the size did not change, when there
        are no boxes, or when the transformed image's size cannot be determined.
        """
        if isinstance(transformed_image, torch.Tensor):
            if transformed_image.dim() < 2:
                return boxes
            new_height, new_width = transformed_image.shape[-2:]
        elif isinstance(getattr(transformed_image, "size", None), tuple):
            new_width, new_height = transformed_image.size  # PIL image
        else:
            return boxes

        orig_width, orig_height = original_size
        if boxes.numel() == 0 or (new_width, new_height) == (orig_width, orig_height):
            return boxes

        scale_x = new_width / orig_width
        scale_y = new_height / orig_height
        scale = torch.tensor([scale_x, scale_y, scale_x, scale_y], dtype=boxes.dtype)
        return boxes * scale

    def load_annotations(self, annotation_path):
        """Parse a Pascal VOC XML file into ``(boxes, labels)`` tensors.

        Returns:
            boxes: float32 tensor of shape ``(N, 4)``; ``(0, 4)`` when the file
                contains no ``<object>`` elements.
            labels: int64 tensor of shape ``(N,)``.

        Raises:
            KeyError: if an object name is not a key of ``VOC_CLASSES``.
        """
        boxes = []
        labels = []
        root = ET.parse(annotation_path).getroot()

        for obj in root.findall("object"):
            label = obj.find("name").text
            bbox = obj.find("bndbox")
            # float() accepts both "48" and "48.0"; the boxes end up as float32 anyway
            boxes.append([float(bbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")])
            # Assign the correct label based on the object name
            labels.append(VOC_CLASSES[label])

        # reshape keeps the (N, 4) layout even when there are no objects
        box_tensor = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        label_tensor = torch.tensor(labels, dtype=torch.int64)
        return box_tensor, label_tensor

# This handles multi-class detection and retrieves annotations for all classes.


In [4]:
# Image preprocessing pipeline (resizing and tensor conversion).
# Built once here rather than on every call.
# No Normalize step: torchvision's detection models expect inputs in the 0-1
# range and apply ImageNet mean/std normalisation internally, so normalising
# here as well would normalise the image twice.
_IMAGE_PIPELINE = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224
    transforms.ToTensor(),  # Convert image to a float tensor in [0, 1]
])


def transform(image):
    """Resize ``image`` to 224x224 and convert it to a ``[0, 1]`` float tensor.

    Note that resizing changes the pixel coordinate frame: bounding boxes that
    belong to ``image`` have to be scaled by the same factors to stay valid.
    """
    return _IMAGE_PIPELINE(image)


# Build the two datasets with identical preprocessing:
# the held-out test set and the pool that is later split into train/validation.
test_dataset = PascalVOCDataset(
    image_paths=test_images,
    annotation_paths=test_annotations,
    transform=transform,
)
full_dataset = PascalVOCDataset(
    image_paths=images,
    annotation_paths=annotations,
    transform=transform,
)

# Batch assembly used by the data loaders
def collate_fn(batch):
    """Transpose a list of samples into one tuple per field.

    ``[(img0, tgt0), (img1, tgt1)]`` becomes ``((img0, img1), (tgt0, tgt1))``.
    Nothing is stacked, so samples may differ in size and in number of boxes.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Mean IoU over matching prediction / ground-truth pairs
def calculate_iou(outputs, targets, threshold=0.5):
    """Average the IoU of same-label box pairs that overlap by at least ``threshold``.

    Args:
        outputs: per-image dicts with ``'boxes'`` and ``'labels'`` tensors (predictions).
        targets: per-image dicts with ``'boxes'`` and ``'labels'`` tensors (ground truth),
            indexed in parallel with ``outputs``.
        threshold: minimum IoU for a pair to be counted.

    Returns:
        The mean IoU over all counted pairs, or ``0`` when no pair qualifies.
        Every (prediction, ground-truth) combination with equal labels is
        considered, so one box can contribute to several pairs.
    """
    kept = []
    for image_idx, output in enumerate(outputs):
        target = targets[image_idx]
        pred_boxes = output['boxes'].cpu().numpy()
        true_boxes = target['boxes'].cpu().numpy()
        pred_labels = output['labels'].cpu().numpy()
        true_labels = target['labels'].cpu().numpy()

        for pred_idx in range(len(pred_boxes)):
            pred_label = pred_labels[pred_idx]
            for true_idx in range(len(true_boxes)):
                # Boxes of different classes never count as a match
                if pred_label != true_labels[true_idx]:
                    continue
                overlap = compute_iou(pred_boxes[pred_idx], true_boxes[true_idx])
                if overlap >= threshold:
                    kept.append(overlap)

    if not kept:
        return 0
    return sum(kept) / len(kept)

# Compute the IoU of two bounding boxes
def compute_iou(box1, box2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Widths and heights are measured as ``max - min + 1``, i.e. both corner
    coordinates are treated as belonging to the box.

    Degenerate boxes (zero or negative extent) are treated as having zero
    area, and ``0.0`` is returned when the union is empty instead of dividing
    by zero.
    """
    x1, y1, x2, y2 = box1
    x1g, y1g, x2g, y2g = box2

    # Overlap rectangle; clamped so disjoint boxes give zero area
    inter_w = max(0, min(x2, x2g) - max(x1, x1g) + 1)
    inter_h = max(0, min(y2, y2g) - max(y1, y1g) + 1)
    inter_area = inter_w * inter_h

    # Clamp each side so an inverted box cannot produce a negative (or
    # spuriously positive) area
    box1_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)
    box2_area = max(0, x2g - x1g + 1) * max(0, y2g - y1g + 1)

    union_area = box1_area + box2_area - inter_area
    if union_area <= 0:
        return 0.0

    return inter_area / union_area


In [5]:
# Generate and evaluate different train/valid splits
def evaluate_mean_iou(model, loader, device):
    """Return the average of the per-batch ``calculate_iou`` scores over ``loader``.

    The model is switched to eval mode and run without gradients; in that mode
    it is called with images only and returns predictions.
    Returns ``0.0`` for an empty loader.
    """
    model.eval()
    total_iou = 0.0
    with torch.no_grad():
        for batch_images, batch_targets in loader:
            batch_images = [image.to(device) for image in batch_images]
            outputs = model(batch_images)
            total_iou += calculate_iou(outputs, batch_targets)
    return total_iou / len(loader) if len(loader) else 0.0


results = {}
test_results = {}

# Taken from the dataset rather than from the `images` path list so the count
# cannot be affected by other cells rebinding that name.
n_samples = len(full_dataset)
print(f"Total number of samples: {n_samples}")

# Settings and objects that do not change between splits
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
num_classes = 21  # 20 object classes + background
num_epochs = 10
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

for i in range(1, 10):
    train_size = i * 0.1
    print(f"\nIteration {i}, train_size: {train_size:.1f}")

    # Integer arithmetic avoids float truncation surprises (e.g. 0.1 * 3 != 0.3);
    # the training set always gets at least one sample.
    adjusted_train_size = max(n_samples * i // 10, 1)
    adjusted_valid_size = n_samples - adjusted_train_size

    if adjusted_valid_size <= 0:
        print(f"Skipping iteration {i} as there are no validation samples.")
        continue

    print(f"Adjusted train size: {adjusted_train_size}, validation size: {adjusted_valid_size}")

    # Split the dataset
    train_indices, valid_indices = train_test_split(range(n_samples), train_size=adjusted_train_size, random_state=random_seed)
    print(f"Number of training samples: {len(train_indices)}")
    print(f"Number of validation samples: {len(valid_indices)}")

    train_loader = DataLoader(Subset(full_dataset, train_indices), batch_size=4, shuffle=True, collate_fn=collate_fn)
    valid_loader = DataLoader(Subset(full_dataset, valid_indices), batch_size=4, shuffle=False, collate_fn=collate_fn)

    # Start every split from the same pre-trained Faster R-CNN
    model = fasterrcnn_resnet50_fpn(pretrained=True)

    # Replace the box predictor with a new one for Pascal VOC's 20 classes + background
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # Move model to GPU if available
    model.to(device)

    # Fine-tune the model
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        batches_used = 0
        # Loop variables are deliberately NOT called `images`/`targets`: `images`
        # is the notebook-level list of image paths and must not be overwritten.
        for batch_images, batch_targets in train_loader:
            # Only continue if every image in the batch has at least one bounding box
            if any(len(t['boxes']) == 0 for t in batch_targets):
                print(f"Skipping batch with empty targets: {[len(t['boxes']) for t in batch_targets]}")
                continue

            batch_images = [image.to(device) for image in batch_images]
            batch_targets = [{k: v.to(device) for k, v in t.items()} for t in batch_targets]

            optimizer.zero_grad()
            # In train mode the detector returns a dict of loss terms
            loss_dict = model(batch_images, batch_targets)
            losses = sum(loss for loss in loss_dict.values())
            losses.backward()
            optimizer.step()
            running_loss += losses.item()
            batches_used += 1

        # Average over the batches that were actually trained on (skipped ones excluded)
        print(f"Epoch {epoch+1}, Loss: {running_loss / max(batches_used, 1)}")

    # Evaluate the model on the validation set
    val_mean_iou = evaluate_mean_iou(model, valid_loader, device)
    print(f"Validation Mean IoU: {val_mean_iou}")
    results[f"split_{i}"] = val_mean_iou

    # Evaluate the model on the test set
    test_mean_iou = evaluate_mean_iou(model, test_loader, device)
    print(f"Test Mean IoU: {test_mean_iou}")
    test_results[f"split_{i}"] = test_mean_iou


Total number of samples: 13690

Iteration 1, train_size: 0.1
Adjusted train size: 1369, validation size: 12321
Number of training samples: 1369
Number of validation samples: 12321




Image train/2011_001592_jpg.rf.0c269fc0752f4137d8fc3f3bb06aea22.jpg, boxes: tensor([[  1.,   2., 434., 319.]]), labels: tensor([4])
Image train/2008_003466_jpg.rf.6a4560016bfc94ba8e48b098b0636fc3.jpg, boxes: tensor([[128.,  73., 357., 287.],
        [448.,   1., 500., 262.]]), labels: tensor([20, 16])
Image train/2008_003252_jpg.rf.baaf66bd733eacccc78dc04ab366f75f.jpg, boxes: tensor([[136.,  77., 436., 373.]]), labels: tensor([9])
Image train/2012_003662_jpg.rf.d37319455f377f77d3c518649920e641.jpg, boxes: tensor([[250., 101., 296., 175.]]), labels: tensor([15])
Image train/2012_003987_jpg.rf.6084911012ad2a1e071986e42f764a9a.jpg, boxes: tensor([[235.,  97., 415., 369.],
        [140., 173., 257., 365.],
        [ 58., 161., 139., 410.]]), labels: tensor([15, 15, 15])
Image train/2008_002697_jpg.rf.3c748708cd5deaa2f18398958ff66124.jpg, boxes: tensor([[ 69., 116., 333., 347.]]), labels: tensor([13])
Image train/2009_001075_jpg.rf.f35cdcc711c713a7ec45e647d9a4d788.jpg, boxes: tensor([[177.,

KeyboardInterrupt: 

In [None]:
# Extract train percentages and metrics from the results.
# Only splits that actually finished are used: a split can be missing because
# it was skipped or because the training cell was interrupted, and indexing a
# missing key would raise KeyError.
completed_splits = [
    i for i in range(1, 10)
    if f"split_{i}" in results and f"split_{i}" in test_results
]
train_percentages = [i * 10 for i in completed_splits]
val_metrics = [results[f"split_{i}"] for i in completed_splits]
test_metrics = [test_results[f"split_{i}"] for i in completed_splits]

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(train_percentages, val_metrics, marker='o', label='Validation Mean IoU', color='blue')
plt.plot(train_percentages, test_metrics, marker='x', label='Test Mean IoU', color='green')

# Customize plot appearance
plt.xlabel('Train Percentage (%)')
plt.ylabel('Mean IoU')
plt.title('Validation and Test Mean IoU vs. Train Percentage')
plt.legend()
plt.grid(True)  # Adding grid for better readability
plt.xticks(train_percentages)  # Set x-axis ticks at each train percentage

# Display the plot
plt.show()


In [None]:

# Summarise the measured metrics as a table.
# The values are read straight from `results` / `test_results`, restricted to
# the splits that finished, so the columns always have matching lengths.
completed_splits = [
    i for i in range(1, 10)
    if f"split_{i}" in results and f"split_{i}" in test_results
]
train_percentages = [i * 10 for i in completed_splits]

# Creating a DataFrame for the table
df = pd.DataFrame({
    "Train Percentage": train_percentages,
    "Validation Mean IoU": np.round([results[f"split_{i}"] for i in completed_splits], 2),
    "Test Mean IoU": np.round([test_results[f"split_{i}"] for i in completed_splits], 2)
}).set_index("Train Percentage")

In [None]:
# Show the per-split IoU table as this cell's output
df

# Pascal VOC 2012 Dataset with Multiple Classes

In [None]:

# Step 1: Load Pascal VOC 2012 dataset and handle multiple classes
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Pascal VOC specific transformations.
# Named `voc_transform` so it does not replace the `transform` function
# defined earlier in the notebook.
voc_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

VOC_ROOT = 'path_to_data'  # Replace with the correct path to Pascal VOC 2012


def voc_collate_fn(batch):
    """Stack the images of a batch and keep the annotations as a per-image list.

    All images share one size after ``voc_transform`` and can be stacked into
    a single ``(B, 3, 224, 224)`` tensor. The VOC annotation dicts contain a
    variable-length list of objects per image, which the DataLoader's default
    collate function cannot merge, so they are returned un-collated.
    """
    batch_images, batch_targets = zip(*batch)
    return torch.stack(batch_images), list(batch_targets)


# Load Pascal VOC 2012 dataset
train_dataset = datasets.VOCDetection(
    root=VOC_ROOT,
    year='2012',
    image_set='train',
    download=True,
    transform=voc_transform
)

val_dataset = datasets.VOCDetection(
    root=VOC_ROOT,
    year='2012',
    image_set='val',
    download=True,
    transform=voc_transform
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=voc_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=voc_collate_fn)


In [None]:

# Step 2: Modify the model architecture for multi-class detection
import torchvision.models as models
from torch import nn

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pre-trained ResNet-50. This is an image classifier: it outputs one score
# vector per image, not bounding boxes.
model = models.resnet50(pretrained=True)

# Replace the final fully connected layer with one sized for VOC's 20 classes
num_classes = 20  # Pascal VOC has 20 classes
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_classes)

model = model.to(device)


In [None]:

# Step 3: Define the training loop and evaluation with IoU for each class
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from sklearn.metrics import jaccard_score

# Loss function and optimizer; `train_model` below reads both as globals
criterion = CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=0.001)

# Training loop and evaluation
def _voc_image_labels(targets):
    """Turn per-image VOC annotation dicts into a tensor of 0-based class indices.

    ``CrossEntropyLoss`` needs exactly one integer class per image, but a VOC
    image can contain several objects. The class of the FIRST listed object is
    used as the image label.
    NOTE(review): this is a simplification; confirm it is the intended labelling
    (alternatives: largest box, or a multi-label loss).

    ``VOC_CLASSES`` ids start at 1, the classifier head has 20 outputs indexed
    from 0, hence the ``- 1``.
    """
    class_indices = []
    for target in targets:
        objects = target['annotation']['object']
        if isinstance(objects, dict):  # a single object may be stored without a list
            objects = [objects]
        class_indices.append(VOC_CLASSES[objects[0]['name']] - 1)
    return torch.tensor(class_indices, dtype=torch.long)


def _as_image_batch(images):
    """Return ``images`` as one batched tensor, stacking a sequence of tensors if needed."""
    return images if torch.is_tensor(images) else torch.stack(list(images))


def train_model(model, train_loader, val_loader, num_epochs=10):
    """Train ``model`` as an image classifier and print per-class Jaccard scores each epoch.

    Args:
        model: network returning one score per class for each image.
        train_loader, val_loader: iterables yielding ``(images, targets)`` where
            ``images`` is a batched tensor (or a sequence of equally sized image
            tensors) and ``targets`` is a sequence of per-image VOC annotation
            dicts (``target['annotation']['object']``). The DataLoader's default
            collate function cannot produce this for VOC annotations; the
            loaders need a collate function that leaves the targets un-merged.
        num_epochs: number of passes over ``train_loader``.

    Uses the notebook-level ``optimizer``, ``criterion`` and ``device``.
    """
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        # Training phase
        for images, targets in train_loader:
            images = _as_image_batch(images).to(device)
            labels = _voc_image_labels(targets).to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / max(len(train_loader), 1)}")

        # Evaluation on validation set with IoU for each class
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for images, targets in val_loader:
                images = _as_image_batch(images).to(device)
                outputs = model(images)

                all_preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())
                all_labels.extend(_voc_image_labels(targets).tolist())

        # Jaccard index between predicted and true image labels, one value per class
        iou_score = jaccard_score(all_labels, all_preds, average=None)
        print(f"Validation IoU per class: {iou_score}")
