In [53]:
import os
import glob
import torch
import pandas as pd
import numpy as np 
import xml.etree.ElementTree as ET
from scipy.io import loadmat
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm.notebook import tqdm
from sklearn import metrics

In [60]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_dataset():

    # Get train list
    f = loadmat("lists/train_list.mat")
    train_images = [x[0][0] for x in f['file_list']]
    train_labels = [x[0] for x in f['labels']]

    # Get file list
    f = loadmat("lists/test_list.mat")
    test_images = [x[0][0] for x in f['file_list']]
    test_labels = [x[0] for x in f['labels']]

    # Gather data
    train_data = []
    test_data = []

    # Record category ids
    categories = {}

    for i in range(len(train_images) + len(test_images)):

        # Determine if train or test
        image = train_images[i] if i < len(train_images) else test_images[i - len(train_images)]
        label = train_labels[i] if i < len(train_images) else test_labels[i - len(train_images)]
        label_name = os.path.split(image)[0]
        # Label -1 to make it 0-indexed
        categories[label_name] = label-1
        annotation_path = os.path.join("Annotation", image.replace(".jpg", ""))

        # Read XML
        tree = ET.parse(annotation_path)
        root = tree.getroot()

        width = int(root.find("size").find("width").text)
        height = int(root.find("size").find("height").text)

        bndbox = root.find("object").find("bndbox")
        xmin = int(bndbox.find("xmin").text)
        ymin = int(bndbox.find("ymin").text)
        xmax = int(bndbox.find("xmax").text)
        ymax = int(bndbox.find("ymax").text)

        # Append to data
        if i < len(train_images):
            train_data.append(dict(
                image=os.path.join("Images", image),
                label=label-1,
                label_name=label_name,
                width=width,
                height=height,
                xmin=xmin,
                ymin=ymin,
                xmax=xmax,
                ymax=ymax
            ))
        else:
            test_data.append(dict(
                image=os.path.join("Images", image),
                label=label-1,
                label_name=label_name,
                width=width,
                height=height,
                xmin=xmin,
                ymin=ymin,
                xmax=xmax,
                ymax=ymax
            ))


    return train_data, test_data, categories

# Read dataset and gather into dataframe
train_data, test_data, categories = load_dataset()
dftrain = pd.DataFrame(train_data)
dftest = pd.DataFrame(test_data)

In [61]:
# Get the classes summary
print("Number of classes: ", len(categories))
print("Number of training samples: ", len(dftrain))
print("Number of testing samples: ", len(dftest))

Number of classes:  120
Number of training samples:  12000
Number of testing samples:  8580


In [64]:
# Inherit from Dataset
class CustomDataset(Dataset):

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(row['image'])
        image = image.convert('RGB')
        label = row['label']
        if self.transform:
            image = self.transform(image)
        return image, label
        

train_transforms = transforms.Compose([
    # Randomly resize and crop the image to 224
    transforms.RandomResizedCrop(224),
    # Randomly flip the image horizontally
    transforms.RandomHorizontalFlip(),
    # Convert the image to a PyTorch Tensor
    transforms.ToTensor(),
    # Normalize the image
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transforms = transforms.Compose([
    # Resize the image to 256
    transforms.Resize(256),
    # Crop the center of the image
    transforms.CenterCrop(224),
    # Convert the image to a PyTorch Tensor
    transforms.ToTensor(),
    # Normalize the image
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])



train_dataset = CustomDataset(dftrain, transform=train_transforms)
test_dataset = CustomDataset(dftest, transform=test_transforms)

# Test
train_dataset[0]

(tensor([[[-1.5699, -1.6384, -1.6727,  ..., -1.3644, -1.3644, -1.4158],
          [-1.4500, -1.5699, -1.6213,  ..., -1.3644, -1.3644, -1.4158],
          [-1.3644, -1.4843, -1.5528,  ..., -1.3644, -1.3644, -1.4158],
          ...,
          [ 0.3823,  0.3138,  0.3481,  ..., -0.4568, -0.5082, -0.2513],
          [ 0.3823,  0.3138,  0.3481,  ..., -0.4739, -0.5424, -0.2513],
          [ 0.3652,  0.3138,  0.3481,  ..., -0.3541, -0.4397, -0.3883]],
 
         [[-1.6506, -1.5980, -1.5455,  ..., -1.3704, -1.3704, -1.4230],
          [-1.5980, -1.5630, -1.5105,  ..., -1.3704, -1.3704, -1.4230],
          [-1.5455, -1.5455, -1.4930,  ..., -1.3704, -1.3704, -1.4230],
          ...,
          [-0.0924, -0.1450, -0.1099,  ...,  0.2227,  0.1702,  0.4328],
          [-0.0924, -0.1450, -0.1099,  ...,  0.2052,  0.1352,  0.4328],
          [-0.0924, -0.1450, -0.1099,  ...,  0.2927,  0.2052,  0.2577]],
 
         [[-1.6302, -1.5779, -1.4733,  ..., -1.5604, -1.5604, -1.6127],
          [-1.5953, -1.5430,

In [66]:
from torchvision.models import resnet18

# Load the pre-trained model
model = resnet18(weights=None)
model.load_state_dict(torch.load("resnet18-f37072fd.pth"))

# Change the output layer
model.fc = torch.nn.Linear(512, len(categories))
model = model.to(device)

# Define the loss function
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1)

# Define the data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Train the model
model.train()

for epoch in range(15):
    for images, labels in tqdm(train_loader, desc="Train Epoch " + str(epoch)):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

    # Evaluate the model
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc="Test Epoch " + str(epoch)):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    model.train()
    print("Epoch: ", epoch)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("F1 Score: ", metrics.f1_score(y_true, y_pred, average='macro'))
    print("Confusion Matrix: ", metrics.confusion_matrix(y_true, y_pred))

Train Epoch 0:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 0:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  0
Accuracy:  0.6665501165501165
F1 Score:  0.6416622809566006
Confusion Matrix:  [[ 28   0   0 ...   1   0   0]
 [  0  74   0 ...   0   0   0]
 [  0   0 112 ...   0   0   0]
 ...
 [  1   0   0 ...  41   6   0]
 [  0   0   0 ...   1  44   0]
 [  0   0   0 ...   0   1  63]]


Train Epoch 1:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 1:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  1
Accuracy:  0.7092074592074592
F1 Score:  0.6947704937888362
Confusion Matrix:  [[ 33   0   0 ...   1   0   0]
 [  0  74   0 ...   0   0   0]
 [  0   0 132 ...   0   0   0]
 ...
 [  0   0   0 ...  40  10   0]
 [  0   0   0 ...   1  46   1]
 [  0   0   0 ...   0   2  61]]


Train Epoch 2:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 2:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  2
Accuracy:  0.7403263403263404
F1 Score:  0.728980014995554
Confusion Matrix:  [[ 39   0   0 ...   0   0   0]
 [  0  75   0 ...   0   0   0]
 [  0   0 123 ...   0   0   0]
 ...
 [  0   0   0 ...  43   3   0]
 [  0   0   0 ...   1  47   0]
 [  0   0   0 ...   0   1  58]]


Train Epoch 3:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 3:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  3
Accuracy:  0.7452214452214452
F1 Score:  0.7332687400667351
Confusion Matrix:  [[ 36   0   0 ...   1   0   0]
 [  0  78   0 ...   0   0   0]
 [  0   0 118 ...   0   0   0]
 ...
 [  0   0   0 ...  42   7   0]
 [  0   0   0 ...   1  48   1]
 [  0   0   0 ...   0   2  64]]


Train Epoch 4:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 4:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  4
Accuracy:  0.7407925407925408
F1 Score:  0.7306036438579546
Confusion Matrix:  [[ 37   0   0 ...   2   0   0]
 [  0  81   0 ...   0   0   0]
 [  0   0 105 ...   0   0   0]
 ...
 [  0   0   0 ...  51   0   0]
 [  0   0   0 ...   3  43   1]
 [  0   0   0 ...   0   2  65]]


Train Epoch 5:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 5:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  5
Accuracy:  0.7508158508158508
F1 Score:  0.7383518863885412
Confusion Matrix:  [[ 36   0   0 ...   0   0   0]
 [  0  74   0 ...   0   0   0]
 [  0   0 135 ...   0   0   0]
 ...
 [  0   0   0 ...  44   5   0]
 [  0   0   0 ...   1  47   1]
 [  0   0   0 ...   0   2  60]]


Train Epoch 6:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 6:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  6
Accuracy:  0.7458041958041958
F1 Score:  0.7355986792846984
Confusion Matrix:  [[ 35   0   0 ...   0   0   0]
 [  0  73   0 ...   0   0   0]
 [  0   0 128 ...   0   0   0]
 ...
 [  0   0   0 ...  33  12   0]
 [  0   0   0 ...   0  48   0]
 [  0   0   0 ...   0   4  60]]


Train Epoch 7:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 7:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  7
Accuracy:  0.7353146853146854
F1 Score:  0.7255668359090135
Confusion Matrix:  [[ 39   0   0 ...   0   0   0]
 [  0  75   0 ...   0   0   0]
 [  0   0 126 ...   0   0   0]
 ...
 [  1   0   0 ...  34  10   0]
 [  0   0   0 ...   1  47   1]
 [  0   0   0 ...   0   3  61]]


Train Epoch 8:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 8:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  8
Accuracy:  0.7567599067599068
F1 Score:  0.7484446624604898
Confusion Matrix:  [[ 40   0   0 ...   0   0   0]
 [  0  78   0 ...   0   0   0]
 [  0   0 126 ...   0   0   0]
 ...
 [  0   0   0 ...  48   1   1]
 [  0   0   0 ...   4  42   2]
 [  0   0   0 ...   0   0  67]]


Train Epoch 9:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 9:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  9
Accuracy:  0.7580419580419581
F1 Score:  0.750557764118338
Confusion Matrix:  [[ 43   0   0 ...   1   0   0]
 [  0  76   0 ...   0   0   0]
 [  0   0 133 ...   0   0   0]
 ...
 [  0   0   0 ...  45   4   0]
 [  0   0   0 ...   2  45   1]
 [  1   0   0 ...   0   1  66]]


Train Epoch 10:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 10:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  10
Accuracy:  0.7510489510489511
F1 Score:  0.7399803376737685
Confusion Matrix:  [[ 41   0   1 ...   0   0   0]
 [  0  76   0 ...   0   0   0]
 [  0   0 131 ...   0   0   0]
 ...
 [  0   0   0 ...  41   6   0]
 [  0   0   0 ...   1  48   1]
 [  0   0   0 ...   0   1  63]]


Train Epoch 11:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 11:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  11
Accuracy:  0.7487179487179487
F1 Score:  0.7416898027998126
Confusion Matrix:  [[ 40   0   0 ...   1   0   0]
 [  0  77   0 ...   0   0   0]
 [  0   0 118 ...   0   0   0]
 ...
 [  0   0   0 ...  48   0   0]
 [  0   0   0 ...   3  45   1]
 [  0   0   0 ...   1   2  60]]


Train Epoch 12:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 12:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  12
Accuracy:  0.7524475524475525
F1 Score:  0.7439707795525562
Confusion Matrix:  [[ 33   0   0 ...   1   0   0]
 [  0  73   0 ...   0   0   0]
 [  0   0 130 ...   0   0   0]
 ...
 [  0   0   0 ...  41   2   0]
 [  0   0   0 ...   3  45   1]
 [  0   0   0 ...   0   2  62]]


Train Epoch 13:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 13:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  13
Accuracy:  0.7525641025641026
F1 Score:  0.7422151185298976
Confusion Matrix:  [[ 26   0   0 ...   0   0   0]
 [  0  76   0 ...   0   0   0]
 [  0   0 123 ...   0   0   0]
 ...
 [  0   0   0 ...  45   1   0]
 [  0   0   0 ...   1  47   0]
 [  0   0   0 ...   0   2  63]]


Train Epoch 14:   0%|          | 0/375 [00:00<?, ?it/s]

Test Epoch 14:   0%|          | 0/269 [00:00<?, ?it/s]

Epoch:  14
Accuracy:  0.7573426573426574
F1 Score:  0.7488799597502518
Confusion Matrix:  [[ 37   0   0 ...   0   0   1]
 [  0  74   0 ...   0   0   0]
 [  0   0 116 ...   0   0   0]
 ...
 [  0   0   0 ...  45   0   0]
 [  0   0   0 ...   1  43   2]
 [  0   0   0 ...   0   2  65]]


In [67]:
# Evaluate on the test set

model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for images, labels in tqdm(test_loader):
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))

  0%|          | 0/269 [00:00<?, ?it/s]

Accuracy:  0.7573426573426574


In [68]:
print(metrics.classification_report(y_true, y_pred, target_names=list(categories.keys())))

                                          precision    recall  f1-score   support

                     n02085620-Chihuahua       0.64      0.71      0.67        52
              n02085782-Japanese_spaniel       0.82      0.87      0.85        85
                   n02085936-Maltese_dog       0.87      0.76      0.81       152
                      n02086079-Pekinese       0.73      0.78      0.75        49
                      n02086240-Shih-Tzu       0.69      0.72      0.70       114
              n02086646-Blenheim_spaniel       0.87      0.86      0.87        88
                      n02086910-papillon       0.97      0.91      0.94        96
                   n02087046-toy_terrier       0.83      0.67      0.74        72
           n02087394-Rhodesian_ridgeback       0.72      0.40      0.52        72
                  n02088094-Afghan_hound       0.95      0.95      0.95       139
                        n02088238-basset       0.71      0.80      0.75        75
               