Models Used
- ResNet-50
- VGG-16
- EfficientNet-B0

In [None]:
import os
import zipfile
import torch
import torchvision.transforms as transforms
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader
import tarfile

print("Files in /content:", os.listdir("/content"))
tar_path = "/content/VOC2008.tar"
extract_path = "/content/VOC2008"

# Unpack the VOC2008 archive into extract_path when the .tar file is present.
if os.path.exists(tar_path):
    print("Extracting dataset...")
    with tarfile.open(tar_path, "r") as tar:
        # Prefer the "data" extraction filter when this interpreter provides it:
        # it refuses members that would land outside extract_path (absolute
        # paths, ".." components, unsafe links). Interpreters without the
        # feature fall back to the plain call the notebook used before.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_path, filter="data")
        else:
            tar.extractall(path=extract_path)
    print("Extraction complete!")
else:
    print("ERROR: .tar file not found.")


Files in /content: ['.config', 'VOC2008.tar', 'VOC2008', '.ipynb_checkpoints', 'sample_data']
Extracting dataset...
Extraction complete!


In [None]:
import torchvision.transforms as transforms
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader


data_root = "/content/VOC2008"


# Training pipeline: resize, random horizontal flip (augmentation), tensor
# conversion, per-channel normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Validation pipeline: identical except that the random flip is left out, so
# validation images are preprocessed the same way on every pass.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


train_dataset = VOCDetection(root=data_root, year="2008", image_set="train", transform=transform)
val_dataset = VOCDetection(root=data_root, year="2008", image_set="val", transform=val_transform)


# NOTE(review): no collate_fn is passed here, so the default collation is
# applied to the annotation targets as well -- confirm batches can actually be
# formed before iterating these loaders.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

print("Dataset loaded successfully!")


Dataset loaded successfully!


In [None]:
import torch

# Serialise both DataLoader objects to disk with torch.save, training loader first.
for loader_obj, destination in (
    (train_loader, "/content/train_loader.pth"),
    (val_loader, "/content/val_loader.pth"),
):
    torch.save(loader_obj, destination)

print("DataLoaders saved!")


DataLoaders saved!


In [None]:
import torchvision.transforms as transforms
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader


data_root = "/content/VOC2008"


# Training pipeline: resize, random horizontal flip (augmentation), tensor
# conversion, per-channel normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Validation pipeline: same steps without the random flip, so validation
# inputs are not randomly augmented between passes.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


train_dataset = VOCDetection(root=data_root, year="2008", image_set="train", transform=transform)
val_dataset = VOCDetection(root=data_root, year="2008", image_set="val", transform=val_transform)


# The loaders are rebuilt from the dataset on disk rather than read back from
# a saved file.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

print("DataLoaders successfully reloaded!")



DataLoaders successfully reloaded!


In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import VOCDetection
import torchvision.transforms as transforms


data_root = "/content/VOC2008"


# Deterministic preprocessing: fixed 224x224 resize, tensor conversion, then
# per-channel normalisation with the mean/std given below.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


# The 20 object category names recognised by the collate function; a name's
# position in this list is its column in the label vector.
VOC_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]
class_to_idx = dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))


def voc_collate_fn(batch):
    """Collate ``(image, target)`` samples into an image batch and multi-hot labels.

    Args:
        batch: sequence of ``(image_tensor, target)`` pairs where
            ``target['annotation']`` may hold an ``'object'`` entry that is
            either a single dict or a list of dicts, each with a ``'name'``.

    Returns:
        ``(images, labels)``: the image tensors stacked along a new first
        dimension, and a float32 tensor with one row per sample and one column
        per entry of ``VOC_CLASSES`` (1.0 where that class name occurs).
        Names not in ``class_to_idx`` are ignored.
    """
    stacked_inputs = []
    multi_hot_rows = []

    for image, target in batch:
        stacked_inputs.append(image)

        annotated = target['annotation'].get('object', [])
        # A lone object may arrive as a bare dict; treat it as a one-item list.
        if isinstance(annotated, dict):
            annotated = [annotated]

        row = torch.zeros(len(VOC_CLASSES), dtype=torch.float32)
        for entry in annotated:
            column = class_to_idx.get(entry['name'])
            if column is not None:
                row[column] = 1.0

        multi_hot_rows.append(row)

    return torch.stack(stacked_inputs, dim=0), torch.stack(multi_hot_rows, dim=0)


# Build the train and val datasets (in that order) with the shared transform.
train_dataset, val_dataset = (
    VOCDetection(root=data_root, year="2008", image_set=split_name, transform=transform)
    for split_name in ("train", "val")
)


batch_size = 16
# Settings common to both loaders; only the shuffle flag differs per split.
shared_loader_options = dict(batch_size=batch_size, num_workers=2, collate_fn=voc_collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_options)

print("✅ DataLoader is set up correctly!")


✅ DataLoader is set up correctly!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    """Train ``model`` and print the mean training (and validation) loss per epoch.

    Args:
        model: module to optimise; it is moved to the module-level ``device``.
        train_loader: iterable of ``(images, labels)`` batches used for updates.
        val_loader: iterable of ``(images, labels)`` batches scored after each
            epoch without gradient tracking. ``None`` or an empty iterable
            skips the validation report.
        criterion: callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: optimizer already bound to the parameters to update.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        The same ``model`` object that was passed in.
    """
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Average over the batches actually seen; an empty loader reports 0.0
        # instead of raising ZeroDivisionError.
        train_loss = running_loss / max(num_batches, 1)
        summary = f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}"

        if val_loader is not None:
            model.eval()
            val_total, val_batches = 0.0, 0
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(device), labels.to(device)
                    val_total += criterion(model(images), labels).item()
                    val_batches += 1
            # Return to training mode so the mode seen by the caller matches
            # what the training pass left behind.
            model.train()
            if val_batches:
                summary += f", Val Loss: {val_total / val_batches:.4f}"

        print(summary)

    print("✅ Training complete!")
    return model


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.models import efficientnet_b0
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader, Subset
import numpy as np

# Use the GPU when one is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


data_root = "/content/VOC2008"


# Preprocessing for this experiment: 128x128 resize, tensor conversion, then
# per-channel normalisation with the mean/std given below.
transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


# Category names used for the multi-label targets; list position == label column.
VOC_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog",
    "horse", "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor",
]
class_to_idx = {name: position for position, name in enumerate(VOC_CLASSES)}


def voc_collate_fn(batch):
    """Turn a list of ``(image, target)`` samples into ``(images, labels)`` tensors.

    ``images`` is the stack of the per-sample image tensors. ``labels`` is a
    float32 tensor with one row per sample and ``len(VOC_CLASSES)`` columns;
    a column is 1.0 when ``target['annotation']['object']`` lists an object of
    that class name and 0.0 otherwise. Unknown names are skipped, and a
    missing ``'object'`` entry yields an all-zero row.
    """
    image_list = []
    label_list = []

    for picture, target in batch:
        image_list.append(picture)

        found = target['annotation'].get('object', [])
        # Normalise the single-object case (a bare dict) to a list.
        found = [found] if isinstance(found, dict) else found

        encoded = torch.zeros(len(VOC_CLASSES), dtype=torch.float32)
        known_names = (item['name'] for item in found if item['name'] in class_to_idx)
        for class_name in known_names:
            encoded[class_to_idx[class_name]] = 1.0

        label_list.append(encoded)

    images = torch.stack(image_list, dim=0)
    labels = torch.stack(label_list, dim=0)
    return images, labels

train_dataset = VOCDetection(root=data_root, year="2008", image_set="train", transform=transform)
val_dataset = VOCDetection(root=data_root, year="2008", image_set="val", transform=transform)

subset_size = 2000
# Draw the subsets from a seeded generator so every run trains and evaluates
# on the same images; an unseeded draw makes the reported numbers vary
# between runs.
subset_rng = np.random.default_rng(42)
# Sampling without replacement cannot return more items than exist, so cap
# each request at the size of its split.
train_count = min(subset_size, len(train_dataset))
val_count = min(int(subset_size * 0.2), len(val_dataset))
train_indices = subset_rng.choice(len(train_dataset), train_count, replace=False)
val_indices = subset_rng.choice(len(val_dataset), val_count, replace=False)

# Subset receives plain Python ints rather than numpy scalars.
train_dataset = Subset(train_dataset, train_indices.tolist())
val_dataset = Subset(val_dataset, val_indices.tolist())

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, collate_fn=voc_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=voc_collate_fn)

print("✅ DataLoader Ready!")


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=1):
    """Train ``model`` with optional CUDA mixed precision and report epoch losses.

    Args:
        model: module to optimise; moved to the module-level ``device``.
        train_loader: iterable of ``(images, labels)`` batches used for updates.
        val_loader: iterable of ``(images, labels)`` batches scored after each
            epoch without gradient tracking. ``None`` or an empty iterable
            skips the validation report.
        criterion: callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: optimizer already bound to the parameters to update.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        The model returned by ``model.to(device)``.
    """
    model = model.to(device)
    # Mixed precision is only switched on for CUDA. On any other device the
    # scaler and autocast context are created disabled, so the loop runs in
    # full precision and the scaler calls below become pass-throughs.
    use_amp = device.type == "cuda"
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()

            with torch.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(images)
                loss = criterion(outputs, labels)

            # Scale the loss before backward, then let the scaler drive the
            # optimizer step and refresh its scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
            num_batches += 1

        # Average over the batches actually seen; an empty loader reports 0.0
        # instead of raising ZeroDivisionError.
        avg_train_loss = running_loss / max(num_batches, 1)
        print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {avg_train_loss:.4f}")

        if val_loader is not None:
            model.eval()
            val_total, val_batches = 0.0, 0
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(device), labels.to(device)
                    with torch.autocast(device_type=device.type, enabled=use_amp):
                        val_total += criterion(model(images), labels).item()
                    val_batches += 1
            # Back to training mode so the caller sees the same mode the
            # training pass left behind.
            model.train()
            if val_batches:
                print(f"Epoch {epoch+1}/{num_epochs} - Validation Loss: {val_total / val_batches:.4f}")

    print("✅ Training complete!")
    return model


resnet_model = models.resnet50(pretrained=True)

# Freeze every pretrained weight first, then attach the replacement head.
# The new Linear layer is created after the freeze, so its parameters keep
# the default requires_grad=True and are the only trainable ones.
resnet_model.requires_grad_(False)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, len(VOC_CLASSES))


# One independent logit per class, scored with binary cross-entropy on logits;
# only the head's parameters are handed to the optimizer.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(resnet_model.fc.parameters(), lr=0.001)


resnet_model = train_model(
    resnet_model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    num_epochs=1,
)
torch.save(resnet_model.state_dict(), "/content/resnet_model.pth")
print("✅ ResNet-50 model saved!")


efficientnet_model = efficientnet_b0(pretrained=True)
num_ftrs = efficientnet_model.classifier[1].in_features
efficientnet_model.classifier[1] = nn.Linear(num_ftrs, len(VOC_CLASSES))


# Single pass over all parameters: those registered under `classifier` stay
# trainable, everything else is frozen.
for param_name, param in efficientnet_model.named_parameters():
    param.requires_grad = param_name.startswith("classifier.")


# Binary cross-entropy on logits, optimising only the classifier parameters.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(efficientnet_model.classifier.parameters(), lr=0.001)

efficientnet_model = train_model(
    efficientnet_model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    num_epochs=1,
)
torch.save(efficientnet_model.state_dict(), "/content/efficientnet_model.pth")
print("✅ EfficientNet-B0 model saved!")


vgg_model = models.vgg16(pretrained=True)
num_ftrs = vgg_model.classifier[6].in_features
vgg_model.classifier[6] = nn.Linear(num_ftrs, len(VOC_CLASSES))


# Freeze the whole network, then re-enable only the new output layer.
# Freezing just `features` would leave classifier[0] and classifier[3]
# requiring gradients even though the optimizer below never updates them, so
# backward would compute gradients that are simply discarded.
for param in vgg_model.parameters():
    param.requires_grad = False
for param in vgg_model.classifier[6].parameters():
    param.requires_grad = True


# Binary cross-entropy on logits; only classifier[6] is optimised.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(vgg_model.classifier[6].parameters(), lr=0.001)


vgg_model = train_model(vgg_model, train_loader, val_loader, criterion, optimizer, num_epochs=1)
torch.save(vgg_model.state_dict(), "/content/vgg_model.pth")
print("✅ VGG-16 model saved!")


✅ DataLoader Ready!


  scaler = torch.cuda.amp.GradScaler()  # Enable mixed precision
  with torch.cuda.amp.autocast():  # Enable FP16


Epoch 1/1 - Training Loss: 0.2224
✅ Training complete!
✅ ResNet-50 model saved!
Epoch 1/1 - Training Loss: 0.3049
✅ Training complete!
✅ EfficientNet-B0 model saved!


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:05<00:00, 92.8MB/s]


Epoch 1/1 - Training Loss: 0.1885
✅ Training complete!
✅ VGG-16 model saved!


COMPARISON BETWEEN MODELS

In [None]:
import torch


# Reload the saved weights into the existing model objects. map_location
# remaps the stored tensors onto the current `device`, so checkpoints written
# on a GPU runtime still load when only a CPU is available.
resnet_model.load_state_dict(torch.load("/content/resnet_model.pth", map_location=device))
efficientnet_model.load_state_dict(torch.load("/content/efficientnet_model.pth", map_location=device))
vgg_model.load_state_dict(torch.load("/content/vgg_model.pth", map_location=device))


# Switch all three models to evaluation mode before they are scored.
resnet_model.eval()
efficientnet_model.eval()
vgg_model.eval()


def evaluate_model(model, val_loader):
    """Return the fraction of individual label entries predicted correctly.

    Each output logit is passed through a sigmoid and thresholded at 0.5, and
    the result is compared element-wise with ``labels``. Every (sample, class)
    entry counts once, so a class that is absent and predicted absent counts
    as a correct entry.

    Args:
        model: module mapping an image batch to one logit per label column.
        val_loader: iterable of ``(images, labels)`` batches.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when ``val_loader`` yields
        no label entries.
    """
    # Send each batch to wherever the model's parameters already live, so the
    # function does not depend on a global device matching the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Evaluate in eval mode regardless of how the caller left the model, and
    # restore the previous mode afterwards.
    was_training = model.training
    model.eval()

    correct, total = 0, 0
    try:
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(target_device), labels.to(target_device)
                outputs = model(images)
                predicted = (torch.sigmoid(outputs) > 0.5).float()
                correct += (predicted == labels).sum().item()
                total += labels.numel()
    finally:
        model.train(was_training)

    # An empty loader has nothing to score; report 0.0 instead of dividing by zero.
    return correct / total if total else 0.0


# Score the three models on the validation loader, in the same order they are
# reported below.
resnet_acc, efficientnet_acc, vgg_acc = (
    evaluate_model(candidate, val_loader)
    for candidate in (resnet_model, efficientnet_model, vgg_model)
)

print(f"📊 Model Comparison Results:")
print(f"✅ ResNet-50 Accuracy: {resnet_acc:.4f}")
print(f"✅ EfficientNet-B0 Accuracy: {efficientnet_acc:.4f}")
print(f"✅ VGG-16 Accuracy: {vgg_acc:.4f}")


📊 Model Comparison Results:
✅ ResNet-50 Accuracy: 0.9521
✅ EfficientNet-B0 Accuracy: 0.9189
✅ VGG-16 Accuracy: 0.9610
