In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, ImageFolder
from torchvision import transforms
import random
from sklearn.metrics import confusion_matrix, accuracy_score



# 1. preprocessing

In [2]:
"""
plan:
images and box labels are in separate files. -> we have to connect them -> create a dataset for dataloading
"""

'\nplan:\nimages and box labels are in separate files. -> we have to connect them -> create a dataset for dataloading\n'

In [3]:
import scipy
import PIL

In [4]:
# insights on matfile

mat_file = scipy.io.loadmat("./caltech-101/Annotations/Airplanes_Side_2/annotation_0001.mat")
print(mat_file)

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN, Created on: Tue Dec 14 11:03:29 2004', '__version__': '1.0', '__globals__': [], 'box_coord': array([[ 30, 137,  49, 349]], dtype=uint16), 'obj_contour': array([[  8.54082661,  11.87852823,   1.86542339,   1.56199597,
         31.60131048,  27.65675403,  23.71219758,  18.85735887,
         18.85735887,  31.60131048,  47.68296371,  51.32409274,
         59.51663306,  60.1234879 ,  56.78578629,  78.02570565,
         91.07308468, 178.46018145, 179.97731855, 222.15372984,
        225.79485887, 239.75252016, 265.84727823, 298.92086694,
        300.13457661, 298.3140121 , 265.54385081, 264.63356855,
        270.39868952, 268.88155242, 265.84727823, 264.02671371,
        260.08215726, 255.83417339, 257.6547379 , 261.90272177,
        261.90272177, 160.25453629, 160.25453629, 156.00655242,
        155.39969758, 149.33114919, 142.04889113, 139.31804435,
        139.92489919, 143.26260081, 136.28377016, 128.09122984,
        124.45010081, 12

## 2. build the network

In [5]:
class Network(nn.Module):

    def __init__(self, n_classes, seed=101):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.flatten = nn.Flatten()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 32, kernel_size=3, stride=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 32, kernel_size=3, stride=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.n_flattened_neurons = 10304
        
        # classification head
        self.class_head = nn.Sequential(
            nn.Linear(self.n_flattened_neurons, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, n_classes),
            nn.Sigmoid()
        )

        # localization head
        self.localize_head = nn.Sequential(
            nn.Linear(self.n_flattened_neurons, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 4),
            nn.Sigmoid()
        )


    def forward(self, data):
        signal = self.features(data)
        signal = self.flatten(signal)
        #print(signal.size(1))
        classes = self.class_head(signal)
        bounding_boxes = self.localize_head(signal)
        return classes, bounding_boxes

## 3. preprocess dataset

In [16]:
# insights on image and box

mat_file = scipy.io.loadmat("./caltech-101/Annotations/Airplanes_Side_2/annotation_0056.mat")
coords = mat_file["box_coord"].squeeze()
print(coords)

# !! cv returns (h, w) <-> pil returns (w, h)
image = PIL.Image.open("./caltech-101/subset/images/airplanes/image_0056.jpg")
print(image)
w, h = image.size
print("pil: ", w, h)
#image.show()

draw_obj = PIL.ImageDraw.Draw(image)
draw_obj.rectangle([coords[2], coords[0], coords[3], coords[1]], outline="red")
image.show()

"""
rectangle looks like: y1, y2, x1, x2 ->  upper left and bottom right corner

  x--------
 |        |
 |        |
  --------x
  
"""


[ 28 116  71 326]
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=405x140 at 0x1FDD9CFD0F0>
pil:  405 140


'\nrectangle looks like: y1, y2, x1, x2 ->  upper left and bottom right corner\n\n  x--------\n |        |\n |        |\n  --------x\n  \n'

In [15]:
# image preprocessing

"""
gpt
different dimension problem (aspect ratio): losing the aspect ratio, distorting the image can lead potentially to lose important details.

techniques:
Aspect Ratio Preservation:
    When resizing, it's important to maintain the original aspect ratio to prevent distortion.
Region of Interest (ROI) Cropping:
    For multi-class problems where objects have varying sizes, you might consider identifying and cropping regions of interest (ROIs)
    containing the objects before resizing.
    This helps focus on relevant parts of the image and reduces the impact of resizing on unrelated areas.
Letterboxing or Padding:
    Instead of distorting the image, you can pad the image to fit a desired size without altering its aspect ratio.
    This involves adding extra pixels around the image to achieve the desired dimensions.
Data Augmentation:
    Augmentation techniques such as random cropping, rotation, 
    and scaling during training can help the model become more robust to variations in object sizes and orientations.
"""

"""
if we keep the aspect ratio, the images might have different sizes in pixel.
but the first fully connected layer (after flattening) should be initialized with a fixed number of neurons.
the number of outputs after flattening will be different.

solutions:
Resize Images to a Common Size (aspect ratio + with padding?)

Adaptive Pooling:
    Instead of using fully connected layers, you can replace them with adaptive pooling layers
    (e.g., nn.AdaptiveAvgPool2d or nn.AdaptiveMaxPool2d). 
    Adaptive pooling allows you to specify the output size, and it dynamically adjusts to different input sizes. 
    This way, the network can handle inputs of varying dimensions.
you can use a combination of Resize and CenterCrop to resize the image while preserving its aspect ratio.

"""

"""
! we also have to rescale the bounding box coordinates with respect to aspect ratio, + normalize them (to range[0,1])


# Assuming (x1, y1) and (x2, y2) are the original bounding box coordinates
original_width, original_height = original_image_size
resized_width, resized_height = resized_image_size

# Calculate scaling factors
width_scale = resized_width / original_width
height_scale = resized_height / original_height

# Adjust bounding box coordinates
new_x1 = int(x1 * width_scale)
new_y1 = int(y1 * height_scale)
new_x2 = int(x2 * width_scale)
new_y2 = int(y2 * height_scale)
"""


# The size of each image is roughly 300 x 200 pixels. (according to dataset description)
# /1,5 -> target 200x133 to respect the aspect ratio


def preprocess_img(image):
    # ! transforms.Resize(Height, Width) -> reverse 
    transformer = transforms.Compose([transforms.Resize((133, 200)), transforms.ToTensor()])
    return transformer(image)


# the bounding box in the matfile correspond to -> upper left and bottom right corner
# like: y1, y2, x1, x2
# see above the insights
def preprocess_bounding_box(coords, image):
    y1, y2, x1, x2 = coords
    width, height = image.size
    x1 = x1 / width
    y1 = y1 / height
    x2 = x2 / width
    y2 = y2 / height
    # reorder to draw rectangle later
    coords = np.array((x1, y1, x2, y2))
    coords = torch.from_numpy(coords).float()

    
    return coords
    


In [17]:
# !!!! issue: now we ignored the aspect ratio of images.
# -> works here, but problem with multi class with different aspect ratios

In [18]:
models = []


## 4. init hyperparameters

In [19]:
batch_size = 32
#learning_rate = 0.001
learning_rate = 0.0001
n_epochs = 20

## 5. create dataset + dataloader

In [20]:
# create custom dataset loader to connect the image + label + bounding box coords
import warnings


class CustomDataset(torch.utils.data.Dataset):
    """
    root_folder: relative path
    """

    def __init__(self, root_folder=None, transform_img=None, transform_annot=None):

        self.root_folder = root_folder
        self.transform_img = transform_img
        self.transform_annot = transform_annot
        self.image_paths, self.class_label, self.annotation_paths = self.__get_paths_and_classes()


    def __get_paths_and_classes(self):
        if self.root_folder:
            print("CustomDataset initializing...")
            image_paths = []
            annotation_paths = []
            classes_to_label = []
    
    
            images_folder = os.path.join(self.root_folder, "images")
            class_label = -1
            for one_class in os.listdir(images_folder):
                class_path = os.path.join(images_folder, one_class)
                class_label += 1
                print("img_classes path: ", class_path)
                if os.path.isdir(class_path):
                    n_images = 0
                    for image in os.listdir(class_path):
                        image_path = os.path.join(class_path, image)
                        image_paths.append(image_path)
                        n_images += 1
                        # int
                        classes_to_label.append(class_label)
                        #print(image_path)
                print("image appended: {}".format(n_images))
    
            annotation_folder = os.path.join(self.root_folder, "annotations")
            for one_class in os.listdir(annotation_folder):
                class_path = os.path.join(annotation_folder, one_class)
                print("annot_classes path: ", class_path)
                if os.path.isdir(class_path):
                    n_annotations = 0
                    for annotation in os.listdir(class_path):
                        annotation_path = os.path.join(class_path, annotation)
                        annotation_paths.append(annotation_path)
                        n_annotations += 1
                        #print(annotation_path)
                    print("annotation appended: {}".format(n_annotations))
    
            # sort + np.array: os.listdir badly shuffles the names + annotations
            image_paths.sort()
            annotation_paths.sort()
            image_paths = np.array(image_paths)
            annotation_paths = np.array(annotation_paths)
            print("CustomDataset initialized \n")
            
            return image_paths, classes_to_label, annotation_paths
        else:
            warnings.warn("No root folder specified. Ignore this warning, if you used '.train_test_split()'")
            return None, None, None


    def __getitem__(self, idx):
        current_img_path = self.image_paths[idx]
        current_annot_path = self.annotation_paths[idx]
        current_class_label = self.class_label[idx]

        # i have found colorless image -> must convert to 3d channel
        current_img = PIL.Image.open(current_img_path).convert("RGB")
        current_annot = scipy.io.loadmat(current_annot_path)
        current_annot = current_annot["box_coord"].squeeze()
        current_class_label = torch.tensor(current_class_label).float().unsqueeze(0)

        # must transform annotation first (=bounding box), to get the original image sizes
        # to keep the box accurate
        if self.transform_annot:
            current_annot = self.transform_annot(current_annot, current_img)
        if self.transform_img:
            current_img = self.transform_img(current_img)

        #print(current_img_path)
        return current_img, current_class_label, current_annot



    def train_test_split(self, test_size):
        """
        manual, basic split. random selection with no respect to class sizes.
        
        returns: new CustomDataset obj as test set.
        note: returns with the same transforms as arguments on the original dataset
        
        generate random indexes, put those paths to the test_set, delete them from original paths
        """
        total_test_size = int(len(self.image_paths) * test_size)
        print("total test size: ", total_test_size)
        rand_indexes = np.random.randint(0, len(self.image_paths), size=total_test_size)
        rand_indexes = set(rand_indexes)
        while len(rand_indexes) != total_test_size:
            rand_indexes.add(np.random.randint(0, len(self.image_paths)))
        # transform back to list for numpy to handle
        rand_indexes = list(rand_indexes)
        
        test_image_paths = []
        test_class_label = []
        test_annotation_paths = []
        for one_idx in rand_indexes:
            test_image_paths.append(self.image_paths[one_idx])
            test_class_label.append(self.class_label[one_idx])
            test_annotation_paths.append(self.annotation_paths[one_idx])

        self.image_paths = np.delete(self.image_paths, rand_indexes)
        self.class_label = np.delete(self.class_label, rand_indexes)
        self.annotation_paths = np.delete(self.annotation_paths, rand_indexes)

        """
        OLD
        test_set = []
        # in case of when self.image_paths,... etc are simple lists. not np arrays
        for elem in test_set:
            if elem[0] in self.image_paths:
                self.image_paths.remove(elem[0])
            if elem[1] in self.class_label:
                self.class_label.remove(elem[1])
            if elem[2] in self.annotation_paths:
                self.annotation_paths.remove(elem[2])
        """
        new_dataset_obj = CustomDataset(root_folder=None, transform_img=self.transform_img, transform_annot=self.transform_annot)
        new_dataset_obj.image_paths = test_image_paths
        new_dataset_obj.class_label = test_class_label
        new_dataset_obj.annotation_paths = test_annotation_paths
        
        return new_dataset_obj
        
    
    def __len__(self):
        return len(self.image_paths)


In [21]:
# test dataset

root_path = "./caltech-101/subset/"

test_c_dataset = CustomDataset(root_path)

print(test_c_dataset.image_paths[797])
print(test_c_dataset.class_label[797])
print(test_c_dataset.annotation_paths[797])
print("\n")
print(test_c_dataset.image_paths[798])
print(test_c_dataset.class_label[798])
print(test_c_dataset.annotation_paths[798])
print("\n")

print("first elem: \n", test_c_dataset[0], "\n", "-----------------------",  "\n")
test_c_dataset[0][0].show()

CustomDataset initializing...
img_classes path:  ./caltech-101/subset/images\airplanes
image appended: 800
img_classes path:  ./caltech-101/subset/images\Motorbikes
image appended: 798
annot_classes path:  ./caltech-101/subset/annotations\airplanes
annotation appended: 800
annot_classes path:  ./caltech-101/subset/annotations\Motorbikes
annotation appended: 798
CustomDataset initialized 

./caltech-101/subset/images\Motorbikes\image_0798.jpg
0
./caltech-101/subset/annotations\Motorbikes\annotation_0798.mat


./caltech-101/subset/images\airplanes\image_0001.jpg
0
./caltech-101/subset/annotations\airplanes\annotation_0001.mat


first elem: 
 (<PIL.Image.Image image mode=RGB size=262x161 at 0x1FDD9C70D60>, tensor([0.]), array([ 19, 141,  31, 233], dtype=uint8)) 
 ----------------------- 



In [23]:
root_path = "./caltech-101/subset/"

# !! got no idea how these function passes not results in error...
train_set = CustomDataset(root_path, transform_img=preprocess_img, transform_annot=preprocess_bounding_box)
test_set = train_set.train_test_split(0.1)
loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


CustomDataset initializing...
img_classes path:  ./caltech-101/subset/images\airplanes
image appended: 800
img_classes path:  ./caltech-101/subset/images\Motorbikes
image appended: 798
annot_classes path:  ./caltech-101/subset/annotations\airplanes
annotation appended: 800
annot_classes path:  ./caltech-101/subset/annotations\Motorbikes
annotation appended: 798
CustomDataset initialized 

total test size:  159




In [24]:
# test loader

print("len train set: ", len(train_set))
print("len test set: ", len(test_set))
print("")

print(train_set.image_paths[797])
print("35. elem: \n", train_set[34], "\n", "-----------------------",  "\n")
print(train_set[34][0][0][0])
transform = transforms.ToPILImage()
reconverted_img = transform(train_set[34][0])
reconverted_img.show()



len train set:  1439
len test set:  159

./caltech-101/subset/images\airplanes\image_0078.jpg
35. elem: 
 (tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]]), tensor([0.]), tensor([0.1221, 0.1221, 0.8779, 0.8626])) 
 ----------------------- 

tensor([1., 1., 1., 1.

## 6. training

In [26]:
# 1 for binary classif
brain = Network(1)
optimizer = Adam(brain.parameters(), lr=learning_rate)

In [27]:
# training


for epoch in range(1, n_epochs+1):
    losses = []
    for image, label, annotation in loader:
        pred_class, pred_box = brain(image)
        #print(pred_class)
        #print(pred_box)
        #print(label)
        loss_classif = F.binary_cross_entropy(pred_class, label)
        loss_box = F.mse_loss(pred_box, annotation)

        # total loss?
        total_loss = loss_classif + loss_box
        losses.append(total_loss.data)
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    print("epoch: ", epoch, "\t loss: ", np.mean(losses))
        

epoch:  1 	 loss:  0.22491243
epoch:  2 	 loss:  0.020246012
epoch:  3 	 loss:  0.012180698
epoch:  4 	 loss:  0.011309141
epoch:  5 	 loss:  0.0079008825
epoch:  6 	 loss:  0.0068750475
epoch:  7 	 loss:  0.0056705046
epoch:  8 	 loss:  0.005387346
epoch:  9 	 loss:  0.0048338706
epoch:  10 	 loss:  0.0044836593
epoch:  11 	 loss:  0.0038560603
epoch:  12 	 loss:  0.003512766
epoch:  13 	 loss:  0.002778457
epoch:  14 	 loss:  0.0027950117
epoch:  15 	 loss:  0.0018151009
epoch:  16 	 loss:  0.001189939
epoch:  17 	 loss:  0.0010356811
epoch:  18 	 loss:  0.00084402144
epoch:  19 	 loss:  0.0006722611
epoch:  20 	 loss:  0.0005182153


## 7. save

In [62]:
def prep_output_txt():
    structure = Network(1)
    txt = f"""{structure.features}, \n
    flattened neurons: {structure.n_flattened_neurons}, \n 
    classification head: {structure.class_head}, \n
    localization head: {structure.localize_head}, \n
    training batch size: {batch_size}, 
    learning rate: {learning_rate},
    epochs: {n_epochs}
    losses: {np.mean(losses)}"""
    return txt

In [63]:
base_output = "outputs"
save_name = "plane_bike_checkpoint"
save_file_extension = ".pth"
overwrite = True
full_path = os.path.join(base_output, save_name + save_file_extension)


os.makedirs(base_output, exist_ok=True)
if os.path.exists(full_path):
    if overwrite == True:
        torch.save(brain.state_dict(), full_path)
        print("saved as: ", save_name)
    else:
        print("save failed. file already exists. to overwrite, set overwrite=True")
else:
    torch.save(brain.state_dict(), full_path)
    print("saved as: ", save_name)

with open(base_output + "/" + save_name + ".txt", "w", encoding="utf-8") as textfile:
    textfile.write(prep_output_txt())

saved as:  plane_bike_checkpoint


## 8. testing + accuracy

In [18]:
# uncomment to load the saved one.

#brain.load_state_dict(torch.load("plane_bike_checkpoint.pth"))

In [None]:
# testing

# due to my mistake and lack of separate test folder, first i did not split the data to train-test.
# so, i downloaded images from google to test.
# i leave this here, because the result is interesting

# 1. mass pred
test_dataset = ImageFolder("./caltech-101/subset/manual_testset/", transform=preprocess_img)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

print("test dataset: ", test_dataset, "\n")

brain.eval()
predictions = []
with torch.no_grad():
    for img, label in test_loader:
        pred_class, pred_box = brain(img)
        predictions.append({"class true" : int(label), "prediction prob" : float(pred_class)})


print("class true \t prediction \t prediction prob")
for elem in predictions:
    if elem["class true"] == 1:
        elem["class true"] = "airplane"
    else:
        elem["class true"] = "motorbike"
    if elem["prediction prob"] >= 0.5:
        elem["prediction"] = "airplane"
    else:
        elem["prediction"] = "motorbike"
    print(elem["class true"], "\t", elem["prediction"], "\t", elem["prediction prob"])

print("")


# 2. single pred
img_path = "./caltech-101/subset/manual_testset/motor/motor1.jpg"
image = PIL.Image.open(img_path)
image.show()
image = preprocess_img(image)
image = image.unsqueeze(0)


brain.eval()
with torch.no_grad():
    pred_class, pred_box = brain(image)

print("single pred: ", pred_class)

## 8. testing2 (correctly?)

In [64]:
# testing

test_loader = DataLoader(test_set, batch_size=1, shuffle=False)
predictions = []

brain.eval()
with torch.no_grad():
    for img, label, annotation in test_loader:
        pred_class, pred_box = brain(img)
        predictions.append({"class true" : int(label),
                            "prediction prob" : float(pred_class),
                            "bounding_box" : pred_box})


print("class true \t prediction \t prediction prob \t bounding box")
for elem in predictions:
    """
    if elem["class true"] == 1:
        elem["class true"] = "airplane"
    else:
        elem["class true"] = "motorbike"
    """
    if elem["prediction prob"] >= 0.5:
        elem["prediction"] = 1
    else:
        elem["prediction"] = 0
    if elem["prediction prob"]:
        elem["prediction prob"] = "{:.6f}".format(elem["prediction prob"])
    print(elem["class true"], "\t\t", elem["prediction"], "\t\t", elem["prediction prob"], "\t\t", elem["bounding_box"])

class true 	 prediction 	 prediction prob 	 bounding box
0 		 0 		 0.000000 		 tensor([[0.1438, 0.1699, 0.8667, 0.8671]])
0 		 0 		 0.000000 		 tensor([[0.1646, 0.2258, 0.8488, 0.8314]])
0 		 0 		 0.000000 		 tensor([[0.1430, 0.1568, 0.8709, 0.8781]])
0 		 0 		 0.000062 		 tensor([[0.1594, 0.2561, 0.8435, 0.7940]])
1 		 1 		 0.999996 		 tensor([[0.1197, 0.1566, 0.8944, 0.8434]])
0 		 0 		 0.000000 		 tensor([[0.1311, 0.1131, 0.8893, 0.8882]])
1 		 1 		 0.999794 		 tensor([[0.1358, 0.1961, 0.8627, 0.8074]])
1 		 1 		 0.999945 		 tensor([[0.1403, 0.1919, 0.8524, 0.8103]])
1 		 1 		 0.999812 		 tensor([[0.1187, 0.2103, 0.8794, 0.8115]])
1 		 1 		 0.999265 		 tensor([[0.1321, 0.2012, 0.8623, 0.7927]])
0 		 0 		 0.000000 		 tensor([[0.1500, 0.2086, 0.8701, 0.8371]])
1 		 1 		 0.999993 		 tensor([[0.1227, 0.1725, 0.8777, 0.8260]])
1 		 1 		 0.998223 		 tensor([[0.1657, 0.2804, 0.8325, 0.7865]])
1 		 1 		 0.892955 		 tensor([[0.1584, 0.3071, 0.8585, 0.7984]])
0 		 0 		 0.000000 		 tensor([[0.

In [65]:
y_true = []
y_pred = []

for elem in predictions:
    y_true.append(elem["class true"])
    y_pred.append(elem["prediction"])
print("sklearn accuracy: ", accuracy_score(y_true, y_pred))

np_acc = np.sum(np.equal(y_true, y_pred)) / len(y_true)
print("np accuracy: ", np_acc)

sklearn accuracy:  0.9811320754716981
np accuracy:  0.9811320754716981


In [22]:
# too high accuracy. -> overfitting?

In [66]:
# check bounding box


def check_bounding_box():
    pass
    
for img_path, annotation_path in zip(test_set.image_paths, test_set.annotation_paths):
    #raise Exception("bounding box might be good, but need to figure out the correct rectangle drawing. see above at preprocessing")
    # use cv2 to draw the rectangle
    #cv_img = cv2.imread(img_path)
    
    img_orig = PIL.Image.open(img_path)
    img = PIL.Image.open(img_path).convert("RGB")
    annot = scipy.io.loadmat(annotation_path)
    annot = annot["box_coord"].squeeze()
    print("rectangle true: ", [annot[2], annot[0], annot[3], annot[1]])
    annot = preprocess_bounding_box(annot, img)
    
    # get aspect ratio
    w, h = img.size
    img = preprocess_img(img).unsqueeze(0)

    # predict
    brain.eval()
    with torch.no_grad():
        pred_class, pred_box = brain(img)

    pred_box = pred_box.squeeze()
    #print(pred_box)
    x1 = int(pred_box[0] * w)
    y1 = int(pred_box[1] * h)
    x2 = int(pred_box[2] * w)
    y2 = int(pred_box[3] * h)

    # draw rectangle on img
    print("rectangle pred: ", [x1, y1, x2, y2])
    print("class pred: ", f"{float(pred_class):.4f}")
    draw_obj = PIL.ImageDraw.Draw(img_orig)
    draw_obj.rectangle([x1, y1, x2, y2], outline="red")
    img_orig.show()
    # wait for buttonpress
    input("")



rectangle true:  [30, 15, 231, 132]
rectangle pred:  [37, 25, 227, 130]
class pred:  0.0000


 


rectangle true:  [44, 44, 209, 141]
rectangle pred:  [42, 38, 221, 142]
class pred:  0.0000


 


rectangle true:  [36, 22, 226, 129]
rectangle pred:  [37, 23, 227, 130]
class pred:  0.0000


 


rectangle true:  [35, 41, 223, 137]
rectangle pred:  [41, 45, 220, 142]
class pred:  0.0001


 


rectangle true:  [58, 30, 357, 151]
rectangle pred:  [49, 28, 370, 152]
class pred:  1.0000


 


rectangle true:  [36, 23, 230, 136]
rectangle pred:  [34, 17, 232, 138]
class pred:  0.0000


 


rectangle true:  [50, 32, 348, 145]
rectangle pred:  [54, 34, 345, 142]
class pred:  0.9998


 


rectangle true:  [65, 33, 356, 148]
rectangle pred:  [58, 35, 357, 149]
class pred:  0.9999


KeyboardInterrupt: Interrupted by user