In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import pandas as pd
from pathlib import Path
import os

class FilteredDataset(ImageFolder):
    """
    A modified version of torchvision.datasets.ImageFolder that filters out samples whose filenames
    are listed in a given CSV file.
    
    See: https://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html
    
    Args:
        root_dir (string): Root directory path of the dataset.
        csv_path (string, optional): Path to a CSV file containing a list of excluded filenames.
                                     Default: None.
        transform (callable, optional): A function/transform that takes in a PIL image and returns a
                                         transformed version. E.g, ``transforms.RandomCrop``
                                         Default: None.
        target_transform (callable, optional): A function/transform that takes in the target and
                                                transforms it. Default: None.

    Example usage:

        # Load the dataset and exclude certain samples
        dataset = FilteredDataset('data/train_set', 'data/excluded_samples.csv', transform=transforms.ToTensor())

        # Create a dataloader
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    """
    
    def __init__(self, root_dir, csv_path=None, transform=None, target_transform=None):
        root_dir = Path(root_dir)
        super().__init__(root_dir, transform=transform, target_transform=target_transform)

        if csv_path:
            self.excluded_files = set(pd.read_csv(csv_path, header=0, names=['filename'])['filename'])
            print(f"Original Samples: {len(self.samples)} in {root_dir}")
            print(f"Excluded: {len(self.excluded_files)} in {root_dir}")
            self.samples = [(path, target) for path, target in self.samples if Path(path) not in self.excluded_files]
            print(f"Cleaned Samples: {len(self.samples)} in {root_dir}")


In [2]:
# Define the preprocessing transforms
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(150),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
valid_transform = transforms.Compose([
    transforms.Resize(156),
    transforms.CenterCrop(150),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [3]:
train_dataset = FilteredDataset('data/train', 'duplicates.csv', transform=train_transform)
valid_dataset = FilteredDataset('data/val', 'duplicates.csv', transform=valid_transform)

Original Samples: 14034 in data/train
Excluded: 11 in data/train
Cleaned Samples: 14034 in data/train
Original Samples: 3000 in data/val
Excluded: 11 in data/val
Cleaned Samples: 3000 in data/val


In [4]:
train_dataset.excluded_files

{'data/train/forest/18807.jpg',
 'data/train/forest/8689.jpg',
 'data/train/mountain/15770.jpg',
 'data/train/mountain/17775.jpg',
 'data/train/mountain/19959.jpg',
 'data/train/mountain/6518.jpg',
 'data/train/mountain/7654.jpg',
 'data/train/mountain/7865.jpg',
 'data/train/sea/6337.jpg',
 'data/train/street/1495.jpg',
 'data/train/street/2764.jpg'}

In [5]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=96, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=96, shuffle=True)

In [6]:
# Define the model architecture
model = torchvision.models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(train_dataset.classes))



In [7]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [8]:
# Train the model
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {running_loss/len(train_loader)}")

Epoch 1 - Loss: 0.5492746412348585
Epoch 2 - Loss: 0.4334449029090453
Epoch 3 - Loss: 0.411048455505955
Epoch 4 - Loss: 0.39966326100485666
Epoch 5 - Loss: 0.3724620054368259
Epoch 6 - Loss: 0.35996732409714033
Epoch 7 - Loss: 0.34730940438857694
Epoch 8 - Loss: 0.34328323308707903
Epoch 9 - Loss: 0.32826554096069466
Epoch 10 - Loss: 0.31970550099603173


In [9]:
# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for data in valid_loader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}")

Accuracy: 90.0
