# Per Localization
Datset used: [The Oxford IIIT Pet Dataset](https://www.kaggle.com/devdgohil/the-oxfordiiit-pet-dataset)

### Disclaimer:
**This is just to implement the core concept of basic object localization and is not to be taken serious - this is just experimental and to learn it before going to object detection**


In [1]:
import zipfile
import os
import numpy as np
import csv
import cv2
import glob
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import random
import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
os.mkdir('data') 

with zipfile.ZipFile("drive/MyDrive/AI/Data/ImageData/oxford_iiit.zip","r") as zip_ref:
    zip_ref.extractall("./data")

In [2]:
def pad_image(img, IMG_SIZE):
    image = cv2.copyMakeBorder(img, 0, IMG_SIZE-img.shape[0], 0, IMG_SIZE-img.shape[1], cv2.BORDER_CONSTANT)
    return image

In [3]:
IMG_SIZE = 550
XMLS = "./data/annotations/annotations/xmls"

training_data = []
xml_files = glob.glob("{}/*xml".format(XMLS))
for i, xml_file in enumerate(xml_files):
    tree = ET.parse(xml_file)

    path = os.path.join('./data/images/images', tree.findtext("./filename"))
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)

    if img.shape[0] < IMG_SIZE and img.shape[1] < IMG_SIZE:
        xmin = int(tree.findtext("./object/bndbox/xmin"))
        ymin = int(tree.findtext("./object/bndbox/ymin"))
        xmax = int(tree.findtext("./object/bndbox/xmax"))
        ymax = int(tree.findtext("./object/bndbox/ymax"))

        image = pad_image(img, IMG_SIZE)
        training_data.append([np.array(image), [xmin/IMG_SIZE, ymin/IMG_SIZE, xmax/IMG_SIZE, ymax/IMG_SIZE]])

print('training_data length: ', len(training_data))

training_data length:  3634


In [4]:
X = torch.Tensor([i[0] for i in training_data]).view(-1, IMG_SIZE, IMG_SIZE)
X = X/255.0
y = torch.Tensor([i[1] for i in training_data])

In [5]:
class Net(nn.Module):
    def __init__(self, in_channels, n_classes):
        super(Net, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=8, kernel_size=5),
            nn.ReLU(),
            nn.BatchNorm2d(8),

            nn.Conv2d(in_channels=8, out_channels=8, kernel_size=5),
            nn.ReLU(),
            nn.BatchNorm2d(8),

            nn.MaxPool2d(2),

            nn.Conv2d(in_channels=8, out_channels=16, kernel_size=5),
            nn.ReLU(),
            nn.BatchNorm2d(16),

            nn.MaxPool2d(2),

            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=5),
            nn.ReLU(),
            nn.BatchNorm2d(16),

            nn.MaxPool2d(2),

        )
        self.fc = nn.Sequential(
            nn.Linear(16*64*64, 4096),
            nn.ReLU(),
            nn.Dropout(.5),
            nn.Linear(4096, 1024),
            nn.ReLU(),
            nn.Dropout(.2),
            nn.Linear(1024, n_classes),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        x = self.conv(x)
        x = x.view(-1, 16*64*64)
        x = self.fc(x)
        return x

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(1, 4).to(device)
print(model)

Net(
  (conv): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(8, 8, kernel_size=(5, 5), stride=(1, 1))
    (4): ReLU()
    (5): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(8, 16, kernel_size=(5, 5), stride=(1, 1))
    (8): ReLU()
    (9): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(16, 16, kernel_size=(5, 5), stride=(1, 1))
    (12): ReLU()
    (13): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=65536, out_feat

In [15]:
# to double check dimensions
model.conv(X[:1].view(-1, 1, 550, 550).to(device)).size()

torch.Size([1, 16, 64, 64])

In [7]:
BATCH_SIZE = 4
EPOCHS = 10
model.train()
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_function = nn.MSELoss()

def train(model):
    for epoch in range(EPOCHS):
        for i in tqdm.tqdm(range(0, len(X), BATCH_SIZE)):
            batch_X = X[i:i+BATCH_SIZE].view(-1, 1, 550, 550).to(device)
            batch_y = y[i:i+BATCH_SIZE].to(device)

            model.zero_grad()

            outputs = model(batch_X)
            loss = loss_function(outputs, batch_y)
            loss.backward()
            optimizer.step()    

        print(f"\nEpoch: {epoch}. Loss: {loss}")


train(model)

100%|██████████| 909/909 [00:30<00:00, 29.67it/s]
  0%|          | 4/909 [00:00<00:24, 37.05it/s]


Epoch: 0. Loss: 0.005115551874041557


100%|██████████| 909/909 [00:30<00:00, 29.71it/s]
  0%|          | 4/909 [00:00<00:24, 37.57it/s]


Epoch: 1. Loss: 0.00565384142100811


100%|██████████| 909/909 [00:30<00:00, 29.68it/s]
  0%|          | 4/909 [00:00<00:24, 37.51it/s]


Epoch: 2. Loss: 0.0017478576628491282


100%|██████████| 909/909 [00:30<00:00, 29.74it/s]
  0%|          | 4/909 [00:00<00:24, 37.59it/s]


Epoch: 3. Loss: 0.00245093647390604


100%|██████████| 909/909 [00:30<00:00, 29.72it/s]
  0%|          | 4/909 [00:00<00:24, 37.33it/s]


Epoch: 4. Loss: 0.001998061779886484


100%|██████████| 909/909 [00:30<00:00, 29.71it/s]
  0%|          | 4/909 [00:00<00:24, 37.64it/s]


Epoch: 5. Loss: 0.0021302467212080956


100%|██████████| 909/909 [00:30<00:00, 29.72it/s]
  0%|          | 4/909 [00:00<00:24, 37.52it/s]


Epoch: 6. Loss: 0.0025986104737967253


100%|██████████| 909/909 [00:30<00:00, 29.71it/s]
  0%|          | 4/909 [00:00<00:24, 37.28it/s]


Epoch: 7. Loss: 0.0031612785533070564


100%|██████████| 909/909 [00:30<00:00, 29.70it/s]
  0%|          | 4/909 [00:00<00:24, 37.63it/s]


Epoch: 8. Loss: 0.0011636980343610048


100%|██████████| 909/909 [00:30<00:00, 29.67it/s]


Epoch: 9. Loss: 0.0014444717671722174





In [None]:
e = 1

label = model(X[e].view(-1, 1, 550, 550).to(device))

xmin = label[0][0].item()
ymin = label[0][1].item()
xmax = label[0][2].item()
ymax = label[0][3].item()

img = training_data[e][0]
bnd_img = cv2.rectangle(img, (int(xmin*IMG_SIZE), int(ymin*IMG_SIZE)), (int(xmax*IMG_SIZE), int(ymax*IMG_SIZE)), (255, 0, 0), 2)
plt.imshow(bnd_img, cmap='gray')