In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, ImageFolder
from torchvision import transforms
from torchvision.models import vgg16
import random
from sklearn.metrics import confusion_matrix, accuracy_score



## 1. preprocessing

In [2]:
"""
plan:
images and box labels are in separate files. -> we have to connect them -> create a dataset for dataloading
"""

'\nplan:\nimages and box labels are in separate files. -> we have to connect them -> create a dataset for dataloading\n'

In [3]:
import scipy
import PIL

In [4]:
# insights on matfile

mat_file = scipy.io.loadmat("./caltech-101/Annotations/Airplanes_Side_2/annotation_0001.mat")
print(mat_file)

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN, Created on: Tue Dec 14 11:03:29 2004', '__version__': '1.0', '__globals__': [], 'box_coord': array([[ 30, 137,  49, 349]], dtype=uint16), 'obj_contour': array([[  8.54082661,  11.87852823,   1.86542339,   1.56199597,
         31.60131048,  27.65675403,  23.71219758,  18.85735887,
         18.85735887,  31.60131048,  47.68296371,  51.32409274,
         59.51663306,  60.1234879 ,  56.78578629,  78.02570565,
         91.07308468, 178.46018145, 179.97731855, 222.15372984,
        225.79485887, 239.75252016, 265.84727823, 298.92086694,
        300.13457661, 298.3140121 , 265.54385081, 264.63356855,
        270.39868952, 268.88155242, 265.84727823, 264.02671371,
        260.08215726, 255.83417339, 257.6547379 , 261.90272177,
        261.90272177, 160.25453629, 160.25453629, 156.00655242,
        155.39969758, 149.33114919, 142.04889113, 139.31804435,
        139.92489919, 143.26260081, 136.28377016, 128.09122984,
        124.45010081, 12

## 2. build the network

In [5]:
"""
vgg16 requires 224x224 px input images.
7x7x512 on the final conv layer. -> 25088
we chop up the heads, and use only the conv layers.
"""

# Load torchvision's VGG16 with its default pretrained weights and keep only
# the `.features` sub-module (the convolutional part, per the plan above).
# This object is module-level state: Network.__init__ wraps this very instance,
# so every Network created in this notebook shares the same backbone weights.
vgg_model = vgg16(weights="DEFAULT").features
# Freeze the backbone: with requires_grad disabled no gradients are computed
# for these parameters, so only the two heads are trained.
for param in vgg_model.parameters():
    param.requires_grad = False


class Network(nn.Module):
    """
    Two-headed model on top of the shared, frozen `vgg_model` feature extractor.

    n_classes: number of outputs of the classification head (1 for binary).
    seed: passed to torch.manual_seed before the heads are created, so the
          head weights are reproducible. Note that this reseeds the global RNG.

    forward() returns (classes, bounding_boxes); both heads end in a Sigmoid,
    so every output lies in [0, 1]. The localization head always has 4 outputs.
    """

    @staticmethod
    def _sigmoid_mlp(layer_sizes):
        """Linear layers with ReLU in between and a Sigmoid after the last one."""
        layers = []
        for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        # the activation after the final Linear is a Sigmoid instead of a ReLU
        layers[-1] = nn.Sigmoid()
        return nn.Sequential(*layers)

    def __init__(self, n_classes, seed=101):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.flatten = nn.Flatten()
        self.features = nn.Sequential(vgg_model)

        # 7x7x512 on the final conv layer for a 224x224 input
        self.n_flattened_neurons = 25088

        # classification head (created first, so it consumes the seeded RNG first)
        self.class_head = self._sigmoid_mlp(
            [self.n_flattened_neurons, 256, 128, 64, n_classes]
        )

        # localization head: 4 normalized box coordinates
        self.localize_head = self._sigmoid_mlp(
            [self.n_flattened_neurons, 128, 64, 32, 4]
        )

    def forward(self, data):
        flat_features = self.flatten(self.features(data))
        return self.class_head(flat_features), self.localize_head(flat_features)

## 3. preprocess dataset

In [6]:
# insights on image and box

mat_file = scipy.io.loadmat("./caltech-101/Annotations/Airplanes_Side_2/annotation_0056.mat")
coords = mat_file["box_coord"].squeeze()
print(coords)

# !! cv returns (h, w) <-> pil returns (w, h)
image = PIL.Image.open("./caltech-101/subset/images/airplanes/image_0056.jpg")
print(image)
w, h = image.size
print("pil: ", w, h)
#image.show()

draw_obj = PIL.ImageDraw.Draw(image)
draw_obj.rectangle([coords[2], coords[0], coords[3], coords[1]], outline="red")
image.show()

"""
rectangle looks like: y1, y2, x1, x2 ->  upper left and bottom right corner

  x--------
 |        |
 |        |
  --------x
  
"""


[ 28 116  71 326]
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=405x140 at 0x1B341390F10>
pil:  405 140


'\nrectangle looks like: y1, y2, x1, x2 ->  upper left and bottom right corner\n\n  x--------\n |        |\n |        |\n  --------x\n  \n'

In [32]:
# image preprocessing

# The size of each image is roughly 300 x 200 pixels. (according to dataset description) 
def preprocess_image(image, target_size=224, return_scale_and_pad=True):
    """
    Resize while keeping aspect ratio and adding padding to match target_size.

    image: PIL image.
    target_size: side length (px) of the square output.
    return_scale_and_pad: useful info for bboxes, if they have to be corrected.

    returns: (image, scale, tuple(padding)) when return_scale_and_pad is truthy,
             otherwise only the image. The image is a tensor produced by
             transforms.ToTensor(); padding is (left, top, right, bottom).
    raises: ValueError if the image has a zero-sized side.
    """
    w, h = image.size
    if w <= 0 or h <= 0:
        raise ValueError("cannot preprocess an empty image of size {}x{}".format(w, h))

    # Scaling factor that maps the longer side onto target_size
    scale = target_size / max(w, h)
    # int() truncates; for extreme aspect ratios the short side would become 0
    # and Resize would fail, so keep at least one pixel.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))

    # Resize while preserving aspect ratio
    resize_transform = transforms.Resize((new_h, new_w), interpolation=transforms.InterpolationMode.BILINEAR)
    image = resize_transform(image)

    # Compute padding values (left, top, right, bottom); an odd remainder puts
    # the extra pixel on the right / bottom.
    pad_w = target_size - new_w
    pad_h = target_size - new_h
    padding = (pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2)

    # Apply padding + transform to tensor
    # transforms.Pad expects left,top,right,bottom IN ORDER
    pad_transform = transforms.Compose([transforms.Pad(padding, fill=255), transforms.ToTensor()])
    image = pad_transform(image)

    if return_scale_and_pad:
        return image, scale, padding
    return image


# the bounding box in the matfile correspond to -> upper left and bottom right corner
# like: y1, y2, x1, x2
# see above the insights
def preprocess_bounding_box(coords, padded_img, scaling_factor, padding, reverse=False):
    """
    Map a bounding box between original-image pixels and the normalized
    coordinates of the resized + padded image.

    coords: reverse=False -> (y1, y2, x1, x2) in original pixels (matfile order)
            reverse=True  -> (x1, y1, x2, y2) normalized to 0..1 on the padded image
    padded_img: the padded image; only its width/height are read. A (C, H, W)
                tensor, an (H, W[, C]) numpy array or a PIL image is accepted.
    scaling_factor: scale that was applied to the original image.
    padding: (left, top, right, bottom) padding that was added after scaling.
    reverse: False maps original -> normalized, True maps normalized -> original.

    returns: float32 tensor (x1, y1, x2, y2), ready for drawing a rectangle.
    """
    if isinstance(padded_img, torch.Tensor):
        padded_height, padded_width = padded_img.shape[-2], padded_img.shape[-1]
    elif isinstance(padded_img, np.ndarray):
        padded_height, padded_width = padded_img.shape[0], padded_img.shape[1]
    else:
        padded_width, padded_height = padded_img.size

    # Padding moves the image content right by the LEFT pad and down by the TOP
    # pad. Both corners of the box move by that same offset; the right/bottom
    # pads only extend the canvas and must not enter the coordinate math.
    pad_w_left, pad_h_top, _pad_w_right, _pad_h_bottom = padding

    if not reverse:
        y1, y2, x1, x2 = (float(c) for c in coords)
        # scale, shift by the padding offset, normalize (0 to 1)
        x1 = (x1 * scaling_factor + pad_w_left) / padded_width
        x2 = (x2 * scaling_factor + pad_w_left) / padded_width
        y1 = (y1 * scaling_factor + pad_h_top) / padded_height
        y2 = (y2 * scaling_factor + pad_h_top) / padded_height
    else:
        # ! coords order is different
        x1, y1, x2, y2 = (float(c) for c in coords)
        # exact inverse of the branch above: de-normalize, unshift, unscale
        x1 = (x1 * padded_width - pad_w_left) / scaling_factor
        x2 = (x2 * padded_width - pad_w_left) / scaling_factor
        y1 = (y1 * padded_height - pad_h_top) / scaling_factor
        y2 = (y2 * padded_height - pad_h_top) / scaling_factor

    # reordered to (x1, y1, x2, y2) to draw rectangle later
    return torch.tensor([x1, y1, x2, y2], dtype=torch.float32)



In [8]:
# check if the new preprocessing works correctly

mat_file = scipy.io.loadmat("./caltech-101/subset/annotations/airplanes/annotation_0033.mat")
coords = mat_file["box_coord"].squeeze()
print(coords)

# !! cv returns (h, w) <-> pil returns (w, h)
image = PIL.Image.open("./caltech-101/subset/images/airplanes/image_0033.jpg")
print(image)
w, h = image.size
print("pil: ", w, h)
#image.show()

#draw_obj = PIL.ImageDraw.Draw(image)
#draw_obj.rectangle([coords[2], coords[0], coords[3], coords[1]], outline="red")
#image.show()


# preprocess then show:
padded_image = preprocess_image(image)
boxes = preprocess_bounding_box(coords, padded_image[0], padded_image[1], padded_image[2])
# reconvert to see the test
transformer = transforms.ToPILImage()
reconverted_img = transformer(padded_image[0])
reconverted_width, reconverted_height = reconverted_img.size

boxes = boxes.numpy()
print(boxes)
x1 = boxes[0] * reconverted_width
y1 = boxes[1] * reconverted_height
x2 = boxes[2] * reconverted_width
y2 = boxes[3] * reconverted_height

print(x1, y1, x2, y2)

draw_obj = PIL.ImageDraw.Draw(reconverted_img)
draw_obj.rectangle([x1, y1, x2, y2], outline="red")
reconverted_img.show()


[ 29 133  49 339]
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=392x159 at 0x1B341410F40>
pil:  392 159
[0.125      0.37308672 0.8647959  0.63839287]
28.0 83.57142543792725 193.71428680419922 143.00000190734863


In [9]:
models = []


## 4. init hyperparameters

In [10]:
batch_size = 16
#learning_rate = 0.001
learning_rate = 0.0001
n_epochs = 8

## 5. create dataset + dataloader

In [11]:
# create custom dataset loader to connect the image + label + bounding box coords
import warnings


class CustomDataset(torch.utils.data.Dataset):
    """
    Connects every image with its class label and its bounding-box annotation.

    root_folder: relative path to a folder that contains
                 "images/<class>/..." and "annotations/<class>/...".
                 Class folders and the files inside them are paired by sorted
                 order, so both trees must have the same number of classes and
                 the same number of files per class.
    transform_img: callable(PIL image) -> (image, scale, padding)
    transform_annot: callable(box_coord, image, scale, padding) -> box
                     (the transforms are applied only when BOTH are given)

    Attributes:
        image_paths, annotation_paths: numpy arrays of file paths
        class_label: numpy array of int labels; the label is the index of the
                     class folder in sorted order, aligned with image_paths.

    Each item is (image, class_label tensor of shape (1,), annotation).
    """

    def __init__(self, root_folder=None, transform_img=None, transform_annot=None):

        self.root_folder = root_folder
        self.transform_img = transform_img
        self.transform_annot = transform_annot
        self.image_paths, self.class_label, self.annotation_paths = self.__get_paths_and_classes()


    @staticmethod
    def _collect_class_files(folder, log_prefix, item_name):
        """Return one sorted list of file paths per class sub-directory, classes in sorted order."""
        files_per_class = []
        # sorted: os.listdir gives no ordering guarantee
        for one_class in sorted(os.listdir(folder)):
            class_path = os.path.join(folder, one_class)
            print("{}_classes path: ".format(log_prefix), class_path)
            if not os.path.isdir(class_path):
                # stray files next to the class folders are not classes
                continue
            class_files = [os.path.join(class_path, name) for name in sorted(os.listdir(class_path))]
            print("{} appended: {}".format(item_name, len(class_files)))
            files_per_class.append(class_files)
        return files_per_class


    def __get_paths_and_classes(self):
        """Scan root_folder and return (image_paths, class_labels, annotation_paths), index-aligned."""
        if not self.root_folder:
            warnings.warn("No root folder specified. Ignore this warning, if you used '.train_test_split()'")
            return None, None, None

        print("CustomDataset initializing...")
        images_per_class = self._collect_class_files(
            os.path.join(self.root_folder, "images"), "img", "image")
        annots_per_class = self._collect_class_files(
            os.path.join(self.root_folder, "annotations"), "annot", "annotation")

        if len(images_per_class) != len(annots_per_class):
            raise ValueError("found {} image classes but {} annotation classes".format(
                len(images_per_class), len(annots_per_class)))

        image_paths = []
        annotation_paths = []
        classes_to_label = []
        # The label is assigned while walking the classes in the SAME order the
        # paths are stored in, so label i always belongs to image_paths[i].
        for class_label, (class_images, class_annots) in enumerate(zip(images_per_class, annots_per_class)):
            if len(class_images) != len(class_annots):
                raise ValueError("class #{} has {} images but {} annotations".format(
                    class_label, len(class_images), len(class_annots)))
            image_paths.extend(class_images)
            annotation_paths.extend(class_annots)
            classes_to_label.extend([class_label] * len(class_images))

        print("CustomDataset initialized \n")
        return (np.array(image_paths),
                np.array(classes_to_label, dtype=np.int64),
                np.array(annotation_paths))


    def __getitem__(self, idx):
        current_img_path = self.image_paths[idx]
        current_annot_path = self.annotation_paths[idx]
        current_class_label = self.class_label[idx]

        # i have found colorless image -> must convert to 3d channel
        # (convert() returns a new, fully loaded image, so the file can be closed)
        with PIL.Image.open(current_img_path) as img_file:
            current_img = img_file.convert("RGB")
        current_annot = scipy.io.loadmat(current_annot_path)
        current_annot = current_annot["box_coord"].squeeze()
        current_class_label = torch.tensor(current_class_label).float().unsqueeze(0)

        # apply transformation
        if self.transform_img and self.transform_annot:
            current_img, current_annot = self.apply_transformation(current_img, current_annot)

        return current_img, current_class_label, current_annot


    def apply_transformation(self, img, annot):
        """Transform the image first; its scale and padding are needed to transform the box."""
        img, scale, padding = self.transform_img(img)
        annot = self.transform_annot(annot, img, scale, padding)
        return img, annot


    def train_test_split(self, test_size):
        """
        manual, basic split. random selection with no respect to class sizes.

        test_size: fraction (0..1) of the samples to move into the test set.

        returns: new CustomDataset obj as test set.
        note: returns with the same transforms as arguments on the original dataset.
              The selected samples are REMOVED from this dataset (in-place).
        raises: ValueError if test_size is outside 0..1.
        """
        n_total = len(self)
        total_test_size = int(n_total * test_size)
        if not 0 <= total_test_size <= n_total:
            raise ValueError("test_size must be a fraction between 0 and 1, got {}".format(test_size))
        print("total test size: ", total_test_size)

        # unique random indexes; the mask keeps both halves in their stored order
        test_indexes = np.random.permutation(n_total)[:total_test_size]
        is_test = np.zeros(n_total, dtype=bool)
        is_test[test_indexes] = True

        image_paths = np.asarray(self.image_paths)
        class_label = np.asarray(self.class_label)
        annotation_paths = np.asarray(self.annotation_paths)

        # the "no root folder" warning is expected for the object created here
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            new_dataset_obj = CustomDataset(root_folder=None,
                                            transform_img=self.transform_img,
                                            transform_annot=self.transform_annot)
        new_dataset_obj.image_paths = image_paths[is_test]
        new_dataset_obj.class_label = class_label[is_test]
        new_dataset_obj.annotation_paths = annotation_paths[is_test]

        self.image_paths = image_paths[~is_test]
        self.class_label = class_label[~is_test]
        self.annotation_paths = annotation_paths[~is_test]

        return new_dataset_obj


    def __len__(self):
        # a dataset created without root folder has no paths yet
        if self.image_paths is None:
            return 0
        return len(self.image_paths)


In [12]:
# test dataset

root_path = "./caltech-101/subset/"

test_c_dataset = CustomDataset(root_path)

print(test_c_dataset.image_paths[830])
print(test_c_dataset.class_label[830])
print(test_c_dataset.annotation_paths[830])
print("\n")
print(test_c_dataset.image_paths[829])
print(test_c_dataset.class_label[829])
print(test_c_dataset.annotation_paths[829])
print("\n")

print("first elem: \n", test_c_dataset[0], "\n", "-----------------------",  "\n")
test_c_dataset[0][0].show()

CustomDataset initializing...
img_classes path:  ./caltech-101/subset/images\airplanes
image appended: 800
img_classes path:  ./caltech-101/subset/images\Motorbikes
image appended: 798
annot_classes path:  ./caltech-101/subset/annotations\airplanes
annotation appended: 800
annot_classes path:  ./caltech-101/subset/annotations\Motorbikes
annotation appended: 798
CustomDataset initialized 

./caltech-101/subset/images\airplanes\image_0033.jpg
1
./caltech-101/subset/annotations\airplanes\annotation_0033.mat


./caltech-101/subset/images\airplanes\image_0032.jpg
1
./caltech-101/subset/annotations\airplanes\annotation_0032.mat


first elem: 
 (<PIL.Image.Image image mode=RGB size=262x161 at 0x1B34144E0E0>, tensor([0.]), array([ 19, 141,  31, 233], dtype=uint8)) 
 ----------------------- 



In [13]:
root_path = "./caltech-101/subset/"

# The two preprocessing functions are passed as plain callables (they are not
# called here). CustomDataset.apply_transformation invokes them per sample and
# feeds the scale and padding returned by transform_img into transform_annot,
# which is why no arguments have to be supplied at this point.
train_set = CustomDataset(root_path, transform_img=preprocess_image, transform_annot=preprocess_bounding_box)
# train_test_split removes the selected samples from train_set and returns them
# as a new dataset, so this cell must not be re-run without recreating train_set.
test_set = train_set.train_test_split(0.1)
loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


CustomDataset initializing...
img_classes path:  ./caltech-101/subset/images\airplanes
image appended: 800
img_classes path:  ./caltech-101/subset/images\Motorbikes
image appended: 798
annot_classes path:  ./caltech-101/subset/annotations\airplanes
annotation appended: 800
annot_classes path:  ./caltech-101/subset/annotations\Motorbikes
annotation appended: 798
CustomDataset initialized 

total test size:  159




In [14]:
# test loader

print("len train set: ", len(train_set))
print("len test set: ", len(test_set))
print("")

print(train_set.image_paths[797])
print("35. elem: \n", train_set[34], "\n", "-----------------------",  "\n")
print(train_set[34][0][0][0])
transform = transforms.ToPILImage()
reconverted_img = transform(train_set[34][0])
reconverted_img.show()



len train set:  1439
len test set:  159

./caltech-101/subset/images\airplanes\image_0087.jpg
35. elem: 
 (tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]]), tensor([0.]), tensor([0.1221, 0.3111, 0.8779, 0.6813])) 
 ----------------------- 

tensor([1., 1., 1., 1.

## 6. training

In [15]:
# 1 for binary classif
brain = Network(1)
optimizer = Adam(brain.parameters(), lr=learning_rate)

In [16]:
# training


# Joint training of both heads: each batch contributes a classification loss
# and a box-regression loss, and a single optimizer step is taken on their sum.
for epoch in range(1, n_epochs+1):
    # reset per epoch: after the loop `losses` holds only the LAST epoch's batches
    losses = []
    for image, label, annotation in loader:
        pred_class, pred_box = brain(image)
        #print(pred_class)
        #print(pred_box)
        #print(label)
        # class_head ends in a Sigmoid, so pred_class is already a probability
        loss_classif = F.binary_cross_entropy(pred_class, label)
        # both pred_box and annotation are normalized (x1, y1, x2, y2) boxes
        loss_box = F.mse_loss(pred_box, annotation)

        # the two losses are summed unweighted
        total_loss = loss_classif + loss_box
        losses.append(total_loss.data)
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    # mean batch loss of this epoch
    print("epoch: ", epoch, "\t loss: ", np.mean(losses))
        

epoch:  1 	 loss:  0.10305125
epoch:  2 	 loss:  0.0054028807
epoch:  3 	 loss:  0.0011817609
epoch:  4 	 loss:  0.0006742767
epoch:  5 	 loss:  0.00044308222
epoch:  6 	 loss:  0.00031977816
epoch:  7 	 loss:  0.0002435609
epoch:  8 	 loss:  0.00019344168


## 7. save

In [17]:
def prep_output_txt():
    """
    Build the human-readable description that is saved next to a checkpoint.

    Reads the notebook globals batch_size, learning_rate, n_epochs and losses.
    `losses` is re-created at the start of every epoch by the training loop, so
    the reported mean covers the last epoch only.

    returns: str with the layer structure and the training hyperparameters.
    """
    # A fresh Network(1) is created only to read the repr of its layers.
    # NOTE: Network.__init__ calls torch.manual_seed, so this reseeds the global RNG.
    structure = Network(1)
    txt = f"""{structure.features}, \n
    flattened neurons: {structure.n_flattened_neurons}, \n 
    classification head: {structure.class_head}, \n
    localization head: {structure.localize_head}, \n
    training batch size: {batch_size}, 
    learning rate: {learning_rate},
    epochs: {n_epochs}
    losses: {np.mean(losses)}"""
    return txt

In [18]:
base_output = "outputs"
save_name = "plane_bike_checkpoint_vgg16_2"
save_file_extension = ".pth"
overwrite = True
full_path = os.path.join(base_output, save_name + save_file_extension)


os.makedirs(base_output, exist_ok=True)
if overwrite or not os.path.exists(full_path):
    torch.save(brain.state_dict(), full_path)
    # The description is written only together with the checkpoint, so the
    # .txt file can never describe a model other than the one saved in the .pth.
    with open(os.path.join(base_output, save_name + ".txt"), "w", encoding="utf-8") as textfile:
        textfile.write(prep_output_txt())
    print("saved as: ", save_name)
else:
    print("save failed. file already exists. to overwrite, set overwrite=True")

saved as:  plane_bike_checkpoint_vgg16_2


## 8. testing + accuracy

In [19]:
# uncomment to load the saved one.

#brain.load_state_dict(torch.load("plane_bike_checkpoint.pth"))

In [None]:
# testing

# At first the data was not split into train and test, and there was no
# separate test folder, so images downloaded from Google were used for testing.
# This cell is kept because the result is interesting.

def _to_model_input(pil_img):
    """Transform for ImageFolder: only the image tensor, without scale and padding."""
    return preprocess_image(pil_img, return_scale_and_pad=False)


# 1. mass pred
test_dataset = ImageFolder("./caltech-101/subset/manual_testset/", transform=_to_model_input)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

print("test dataset: ", test_dataset, "\n")

brain.eval()
predictions = []
with torch.no_grad():
    for img, label in test_loader:
        pred_class, pred_box = brain(img)
        predictions.append({"class true" : int(label), "prediction prob" : float(pred_class)})


# NOTE(review): ImageFolder numbers the classes by sorted folder name. The
# mapping below assumes the airplane folder gets index 1 - confirm against
# test_dataset.class_to_idx.
print("class true \t prediction \t prediction prob")
for elem in predictions:
    if elem["class true"] == 1:
        elem["class true"] = "airplane"
    else:
        elem["class true"] = "motorbike"
    if elem["prediction prob"] >= 0.5:
        elem["prediction"] = "airplane"
    else:
        elem["prediction"] = "motorbike"
    print(elem["class true"], "\t", elem["prediction"], "\t", elem["prediction prob"])

print("")


# 2. single pred
img_path = "./caltech-101/subset/manual_testset/motor/motor1.jpg"
# the network needs 3 channels; downloaded images may be grayscale or RGBA
image = PIL.Image.open(img_path).convert("RGB")
image.show()
image = _to_model_input(image)
image = image.unsqueeze(0)


brain.eval()
with torch.no_grad():
    pred_class, pred_box = brain(image)

print("single pred: ", pred_class)

In [None]:
"""
conclusion:

the accuracy is garbage. images are way different than the training ones.
"""

## 9. testing2 (on the held-out split)

In [20]:
# testing

# Run the trained model over the held-out split, one sample per batch.
test_loader = DataLoader(test_set, batch_size=1, shuffle=False)
predictions = []

brain.eval()
with torch.no_grad():
    for img, label, annotation in test_loader:
        pred_class, pred_box = brain(img)
        predictions.append({"class true" : int(label),
                            "prediction prob" : float(pred_class),
                            "bounding_box" : pred_box})


print("class true \t prediction \t prediction prob \t bounding box")
for elem in predictions:
    # threshold the sigmoid output to get the predicted class
    elem["prediction"] = 1 if elem["prediction prob"] >= 0.5 else 0
    # format unconditionally: a probability of exactly 0.0 must be formatted too
    elem["prediction prob"] = "{:.6f}".format(elem["prediction prob"])
    print(elem["class true"], "\t\t", elem["prediction"], "\t\t", elem["prediction prob"], "\t\t", elem["bounding_box"])

class true 	 prediction 	 prediction prob 	 bounding box
1 		 1 		 0.999991 		 tensor([[0.1248, 0.3619, 0.8698, 0.6428]])
1 		 1 		 0.999887 		 tensor([[0.1507, 0.4131, 0.8519, 0.5950]])
0 		 0 		 0.000006 		 tensor([[0.1309, 0.3022, 0.8731, 0.7191]])
1 		 1 		 0.999997 		 tensor([[0.1185, 0.3664, 0.8834, 0.6310]])
1 		 1 		 0.999992 		 tensor([[0.1301, 0.3702, 0.8688, 0.6332]])
1 		 1 		 0.999961 		 tensor([[0.1235, 0.3582, 0.8742, 0.6489]])
0 		 0 		 0.000001 		 tensor([[0.1268, 0.2781, 0.8657, 0.7408]])
0 		 0 		 0.000001 		 tensor([[0.1398, 0.2929, 0.8631, 0.7366]])
1 		 1 		 0.999987 		 tensor([[0.1244, 0.4154, 0.8801, 0.5948]])
1 		 1 		 0.999721 		 tensor([[0.1548, 0.3794, 0.8365, 0.6198]])
1 		 1 		 0.999993 		 tensor([[0.1371, 0.3610, 0.8592, 0.6463]])
0 		 0 		 0.000012 		 tensor([[0.1709, 0.3166, 0.8354, 0.7073]])
1 		 1 		 0.999999 		 tensor([[0.1595, 0.4231, 0.8399, 0.6077]])
0 		 0 		 0.000003 		 tensor([[0.1405, 0.2930, 0.8660, 0.7016]])
0 		 0 		 0.000182 		 tensor([[0.

In [21]:
# collect the true and the predicted class of every test sample
y_true = [elem["class true"] for elem in predictions]
y_pred = [elem["prediction"] for elem in predictions]
print("sklearn accuracy: ", accuracy_score(y_true, y_pred))

# the same metric by hand: share of positions where both lists agree
np_acc = np.equal(y_true, y_pred).sum() / len(y_true)
print("np accuracy: ", np_acc)

sklearn accuracy:  0.9937106918238994
np accuracy:  0.9937106918238994


In [22]:
# too high accuracy. -> overfitting?

In [33]:
# check bounding box

    
# Show every test image with the predicted box drawn on the ORIGINAL image:
# the prediction is made on the preprocessed image and then mapped back.
brain.eval()
for img_path, annotation_path in zip(test_set.image_paths, test_set.annotation_paths):
    # RGB copy: the network needs 3 channels, and a red outline drawn on a
    # grayscale image would not show up in red
    img_orig = PIL.Image.open(img_path).convert("RGB")
    annot = scipy.io.loadmat(annotation_path)["box_coord"].squeeze()
    # matfile order is y1, y2, x1, x2 -> print as x1, y1, x2, y2
    print("rectangle true: ", [annot[2], annot[0], annot[3], annot[1]])

    img, scale, padding = preprocess_image(img_orig)

    # predict
    with torch.no_grad():
        pred_class, pred_box = brain(img.unsqueeze(0))

    # map the normalized prediction back to pixels of the original image
    reverted_bbox = preprocess_bounding_box(pred_box.squeeze(), img, scale, padding, reverse=True)
    x1, y1, x2, y2 = reverted_bbox.numpy()

    # draw rectangle on img
    print("rectangle pred: ", [x1, y1, x2, y2])
    print("class pred: ", f"{float(pred_class):.4f}")
    draw_obj = PIL.ImageDraw.Draw(img_orig)
    # the network does not guarantee x1 <= x2 / y1 <= y2; recent Pillow versions
    # reject a rectangle whose second corner lies before the first one
    draw_obj.rectangle([min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)], outline="red")
    img_orig.show()
    # wait for buttonpress
    input("")



rectangle true:  [40, 34, 346, 138]
rectangle pred:  [50.055305, 28.773949, 347.00525, 141.4047]
class pred:  1.0000


 


rectangle true:  [52, 27, 348, 96]
rectangle pred:  [59.52725, 25.624315, 336.505, 97.47928]
class pred:  0.9999


 


rectangle true:  [29, 25, 229, 136]
rectangle pred:  [34.157013, 27.611288, 227.88611, 135.24326]
class pred:  0.0000


KeyboardInterrupt: Interrupted by user