In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as utils
from PIL import Image
from os import listdir
from os.path import isfile, join
import numpy as np
# Import resnet18
import torchvision.models as models
# Local library
import getimagenetclasses as ic
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import sys,os


In [14]:
class DatasetCV(utils.Dataset):
    """Labelled image dataset built from a synset list and a directory of annotation files.

    Args:
        synset_words_dir: path of the synset words file, handed to
            ``ic.parsesynsetwords``.
        nm_dir: directory holding one annotation file per image; every regular
            file in it is handed to ``ic.parseclasslabel``.
        transform: callable applied to the loaded RGB PIL image (e.g. ``one``,
            ``noBilinear``, ``noNormal``, ``five``, ``five_2``, ``ten``).
            ``None`` selects the hand-written resize / normalise / centre-crop
            pipeline.
        image_dir: directory containing the ``.JPEG`` files. Defaults to the
            location that used to be hard-coded in ``__getitem__``.
        max_index: annotation files whose numeric suffix is larger than this
            are skipped (only the first 2500 images are available).

    Each item is a two-element list ``[image, label]`` where ``label`` is a
    0-dim integer tensor. With ``transform=None`` the image tensor has shape
    ``(1, 3, 224, 224)`` (it keeps a leading singleton axis).
    """

    # values taken online
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)
    CROP_SIZE = 224

    def __init__(self, synset_words_dir, nm_dir, transform=None,
                 image_dir="imagenet_first2500/imagespart/", max_index=2500):
        # Use the constructor arguments; the previous version silently read the
        # notebook globals `filen` / `nm` instead, so it only worked when those
        # happened to be defined already.
        indicestosynsets, synsetstoindices, synsetstoclassdescr = ic.parsesynsetwords(synset_words_dir)
        self.its = indicestosynsets
        self.sti = synsetstoindices
        self.std = synsetstoclassdescr
        self.listOfNm = sorted([f for f in listdir(nm_dir) if isfile(join(nm_dir, f))])
        self.image_dir = image_dir

        data_list = []
        for name in self.listOfNm:
            # name[18:-4] strips a fixed-width prefix and the 4-character
            # extension, leaving the image number.
            # NOTE(review): assumes names shaped like ILSVRC2012_val_XXXXXXXX.xml - confirm.
            if int(name[18:-4]) > max_index:
                continue
            filename = name[:-4] + ".JPEG"
            # Parse the annotation once: element 0 is the class index,
            # element 1 the synset key used to look up the description.
            parsed = ic.parseclasslabel(join(nm_dir, name), self.sti)
            data_list.append([[filename], [parsed[0]], [self.std[parsed[1]]]])
        self.data = np.array(data_list)
        self.len = len(self.data)
        self.transform = transform

    def __len__(self):
        """Number of samples per epoch to be returned."""
        return self.len

    @staticmethod
    def _short_side_size(width, height, target=224):
        """Return ``(new_width, new_height)`` with the shorter side scaled to ``target``."""
        # `<=` so that square images are resized as well; the previous strict
        # `<` / `>` comparisons left them at their original size.
        if width <= height:
            return target, int(target * height / width)
        return int(target * width / height), target

    @classmethod
    def _normalize_and_center_crop(cls, pixels):
        """Turn an ``H x W x 3`` uint8 array into a normalised ``(1, 3, 224, 224)`` tensor."""
        size = cls.CROP_SIZE
        # Scale to 0-1 and move channels first, adding a leading axis: (1, C, H, W).
        chw = (pixels.astype('float32') / 255.0).transpose(2, 0, 1)[np.newaxis]
        # Centre-crop window; same rounding as the original manual crop.
        top = int(round(chw.shape[2] / 2)) - size // 2
        left = int(round(chw.shape[3] / 2)) - size // 2
        chw = chw[:, :, top:top + size, left:left + size]
        # Subtract the per-channel mean and divide by the per-channel std;
        # reshaped to (1, 3, 1, 1) so they broadcast over height and width.
        mean = np.array(cls.IMAGENET_MEAN, dtype='float32').reshape(1, 3, 1, 1)
        std = np.array(cls.IMAGENET_STD, dtype='float32').reshape(1, 3, 1, 1)
        return torch.from_numpy((chw - mean) / std).float()

    def __getitem__(self, index):
        """
           [1] Loads the image based on the filename.
           [2] Returns the crop of the index-th image together with its label.
        """
        record = self.data[index]
        label = torch.tensor(int(record[1][0]), dtype=torch.int64)
        # Close the file handle as soon as the pixels have been decoded.
        with Image.open(join(self.image_dir, record[0][0])) as handle:
            image = handle.convert('RGB')

        if self.transform is not None:
            return [self.transform(image), label]

        # Only the single centre crop is done manually.
        width, height = image.size
        new_size = self._short_side_size(width, height, self.CROP_SIZE)
        if new_size != (width, height):
            image = image.resize(new_size, Image.BILINEAR)
        return [self._normalize_and_center_crop(np.asarray(image)), label]
# All the transform.
# ImageNet channel statistics shared by every normalising pipeline below.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]


def _crops_to_tensor(crops):
    """Convert every PIL crop to a tensor and stack them along a new leading axis."""
    return torch.stack([transforms.ToTensor()(crop) for crop in crops])


def _normalize_crops(crops):
    """Normalise every crop tensor with the ImageNet statistics and re-stack them."""
    return torch.stack([transforms.Normalize(mean = _IMAGENET_MEAN, std = _IMAGENET_STD)(crop) for crop in crops])


def _multi_crop(resize_to, cropper):
    """Resize, cut several crops, then tensorise and normalise each crop."""
    return transforms.Compose([
        transforms.Resize(resize_to),
        cropper,
        transforms.Lambda(_crops_to_tensor),
        transforms.Lambda(_normalize_crops),
    ])


# Single centre crop with the library defaults.
one = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean = _IMAGENET_MEAN, std = _IMAGENET_STD),
])
# remove bilinear: interpolation=0 replaces the default PIL.Image.BILINEAR filter
noBilinear = transforms.Compose([
    transforms.Resize(224, interpolation=0),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean = _IMAGENET_MEAN, std = _IMAGENET_STD),
])
# Dont normalise
noNormal = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
# five crop
five = _multi_crop(280, transforms.FiveCrop(224))
# Ten Crop
ten = _multi_crop(330, transforms.TenCrop(330))
# five crop on an image resized to exactly 330x330; every 330x330 crop then
# spans the whole resized image.
five_2 = _multi_crop((330, 330), transforms.FiveCrop(330))

In [8]:
# Synset words file; passed as the first argument when building DatasetCV.
filen='imagenet_first2500/synset_words.txt'
# Directory of per-image annotation files; passed as DatasetCV's nm_dir argument.
# The trailing slash matters wherever this is joined by plain string concatenation.
nm ='imagenet_first2500/val/'


In [9]:
"""
Call your two dataset classes dl1[k] and dl2[k] with the same image
index k and check that their outputs are the same up to floating-point
error.
- Compare the difference when removing PIL.Image.BILINEAR in the
resize(). It is quite trashing the numbers, isn't it?
"""


'\ncall your two dataset classes dl1[k] and dl2[k] with the same image\nindex k and compare that their output is the same up to \noating point\nproblems.\n{ compare the difference when removing PIL.Image.BILINEAR in the\nresize(). Its quite trashing the numbers, isnt it ??\n'

In [13]:
# # Part one
# Three views of the same data: hand-written pipeline, library pipeline,
# and the library pipeline with interpolation=0 instead of bilinear.
d11 = DatasetCV(filen, nm, transform=None)
d12 = DatasetCV(filen, nm, transform=one)
d13 = DatasetCV(filen, nm, transform=noBilinear)
# An all-zero difference shows the manual crop matches the library methods.
# The manual image keeps a leading singleton axis, so the subtraction broadcasts.
print("The different of d11[k] and d12[k] is:", d12[1][0] - d11[1][0])
"""
Problem 1.2
Since interpolation technique based on surrounding pixels is used to produce much smoother scaling. 
If the image is not done with Bilinear,  When resizing the image, the values are not smooth and the image may be 
badly represented.
Bilinear interpolation uses a distance-average to estimate with closer cells being given higher weights.
It uses 4 nearest neighbors to generate an output surface. Cubic uses 16.
"""
# Signed sum of the element-wise difference between the bilinear and non-bilinear images.
print("The different of d12[k] and non  bilinear is:", torch.sum(d12[1][0] - d13[1][0]))


The different of d11[k] and d12[k] is: tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]])
The different of d12[k] and non  bilinear is: tensor(334.7711)


In [373]:
# def five_predict(model, image, b_size):
#     image.squeee(dim=0)
# #     print(image.shape,"predict")
#     n_crops, c, h, w = image.shape
#     preds = model(image.view(-1, c, h, w))
# #     print(preds.shape)
#     preds = preds.view(b_size, n_crops, -1).mean(dim=1)
#     preds = preds.argmax(dim=1).numpy()
# #     print(preds.item(0))
#     return preds


In [33]:
def evalulate(model,dataloader,stop_no=500,verbose=True):
    """Measure top-1 accuracy of `model` on single-crop batches.

    Args:
        model: network mapping an image batch to per-class scores of shape
            (batch, classes).
        dataloader: iterable of batches whose element 0 is the image tensor
            and element 1 the integer labels (one per image).
        stop_no: number of batches to evaluate before stopping. With a batch
            size of 1 this is the number of images.
        verbose: print the running number of correct predictions whenever it
            increases (the original behaviour).

    Returns:
        Fraction of evaluated images that were classified correctly, or 0.0
        when the loader yields nothing.
    """
    model.eval()
    model.to(device)
    correct = 0
    seen = 0
    print("Start evaluation")
    with torch.no_grad():
        for batches_done, batch in enumerate(dataloader, start=1):
            image = batch[0].to(device)
            # Flatten so labels line up with the per-sample predictions
            # regardless of batch size.
            label = batch[1].to(device).reshape(-1)
            # argmax over the class axis; without `dim` the index would be
            # taken over the flattened scores and only be valid for batch size 1.
            preds = torch.argmax(model(image), dim=1)
            hits = int((preds == label).sum().item())
            if hits:
                correct += hits
                if verbose:
                    print("correctly predicted",correct)
            seen += label.numel()
            if batches_done == stop_no:
                break
    # Divide by what was really evaluated: the loader may run out before
    # `stop_no` batches, which previously deflated the reported accuracy.
    accuracy = correct / seen if seen else 0.0
    print("Accuracy = ", accuracy)
    return accuracy

def evaluate_multi(model,dataloader,batch_size,stop_no=250,verbose=True):
    """Measure top-1 accuracy when every image is represented by several crops.

    The class scores of all crops of an image are averaged before taking the
    argmax.

    Args:
        model: network mapping (N, c, h, w) images to (N, classes) scores.
        dataloader: iterable of batches whose element 0 has shape
            (batch, crops, c, h, w) and element 1 holds one integer label per
            image.
        batch_size: kept for backward compatibility; the number of images is
            now counted from the batches themselves.
        stop_no: number of batches to evaluate before stopping.
        verbose: print predictions and labels of every batch (the original
            behaviour).

    Returns:
        Fraction of evaluated images that were classified correctly, or 0.0
        when the loader yields nothing.
    """
    model.eval()
    model.to(device)
    correct = 0
    seen = 0
    print("Start evaluation")
    with torch.no_grad():
        for batches_done, batch in enumerate(dataloader, start=1):
            image = batch[0].to(device)
            label = batch[1].to(device).reshape(-1)
            n_images, crops, c, h, w = image.size()
            # Fold the crop axis into the batch axis for a single forward pass.
            results = model(image.reshape(-1, c, h, w))
            average = results.view(n_images, crops, -1).mean(dim=1) # avg over crops
            preds = torch.argmax(average,dim=1)
            correct += int((preds == label).sum().item())
            seen += n_images
            if verbose:
                print("predicts:",preds.cpu().numpy(),"labels",label.cpu().numpy())
            if batches_done == stop_no:
                break

    # Use the real number of evaluated images: `stop_no*batch_size` overstates
    # it when the loader is short or its last batch is smaller.
    accuracy = correct / seen if seen else 0.0
    print("Accuracy = ", accuracy)
    return accuracy

In [9]:
# Using resnet18
# Problem 1 part 2: compare performance with and without mean/std normalisation.
batch_size = 1
resnet18 = models.resnet18(pretrained=True)

# Same images twice: once without normalisation, once with the full pipeline.
dataInstance1 = DatasetCV(filen, nm, transform=noNormal)
dataInstance2 = DatasetCV(filen, nm, transform=one)

# print(dataInstance.__getitem__(1)[1])
dataloader = utils.DataLoader(dataInstance1, batch_size=batch_size,
                              shuffle=False, num_workers=0)
dataloader2 = utils.DataLoader(dataInstance2, batch_size=batch_size,
                               shuffle=False, num_workers=0)


In [438]:
# Evaluate resnet18 with 250 as stop_no on both loaders:
# dataloader2 wraps the normalised pipeline, dataloader the un-normalised one.
accuracyNormal = evalulate(resnet18,dataloader2,250)
accuracyNN = evalulate(resnet18,dataloader,250)

Start evaluation
correctly predicted 1
correctly predicted 2
correctly predicted 3
correctly predicted 4
correctly predicted 5
correctly predicted 6
correctly predicted 7
correctly predicted 8
correctly predicted 9
correctly predicted 10
correctly predicted 11
correctly predicted 12
correctly predicted 13
correctly predicted 14
correctly predicted 15
correctly predicted 16
correctly predicted 17
correctly predicted 18
correctly predicted 19
correctly predicted 20
correctly predicted 21
correctly predicted 22
correctly predicted 23
correctly predicted 24
correctly predicted 25
correctly predicted 26
correctly predicted 27
correctly predicted 28
correctly predicted 29
correctly predicted 30
correctly predicted 31
correctly predicted 32
correctly predicted 33
correctly predicted 34
correctly predicted 35
correctly predicted 36
correctly predicted 37
correctly predicted 38
correctly predicted 39
correctly predicted 40
correctly predicted 41
correctly predicted 42
correctly predicted 43
cor

In [439]:
# Accuracy without mean subtraction / std normalisation is expected to be lower.
print("The accuracy for without minus mean, no normalisation is :",accuracyNN)
print("The accuracy with normalisation is : ", accuracyNormal)
# Recorded run: 0.464 without normalisation versus 0.688 with normalisation.

The accuracy for without minus mean,no normalisation is : 0.464
The accuracy with normalisation is :  0.688


In [19]:
# Multi-crop evaluation: five crops versus ten crops, five images per batch.
batch_size = 5
dataInstance3 = DatasetCV(filen, nm, transform=five)
dataInstance4 = DatasetCV(filen, nm, transform=ten)
# dataInstance3.__getitem__(1)[0].shape

dataloader3 = utils.DataLoader(dataInstance3, batch_size=batch_size,
                               shuffle=False, num_workers=0)
dataloader4 = utils.DataLoader(dataInstance4, batch_size=batch_size,
                               shuffle=False, num_workers=0)
accuracyFive = evaluate_multi(resnet18, dataloader3, batch_size, 250)
accuracyTen = evaluate_multi(resnet18, dataloader4, batch_size, 250)

Start evaluation
predicts: [ 65 795 230 809 520] labels [ 65 970 230 809 516]
predicts: [ 60 334 934 674 903] labels [ 57 334 415 674 332]
predicts: [109 286 370 757 595] labels [109 286 370 757 595]
predicts: [147   1  23 478 517] labels [147 108  23 478 517]
predicts: [334 173 948 727  23] labels [334 173 948 727  23]
predicts: [846 270 162  55 858] labels [846 270 167  55 858]
predicts: [324 573 360 981 586] labels [324 573 150 981 586]
predicts: [887  26 398 777  74] labels [887  32 398 777  74]
predicts: [431 756 129 198 256] labels [516 756 129 198 256]
predicts: [505 565 162 717 391] labels [725 565 167 717 394]
predicts: [ 92  29 844 591 359] labels [ 92  29 844 591 358]
predicts: [468 154 994 872 588] labels [468 259 994 872 588]
predicts: [474 183   5  46 842] labels [474 183 107  46 842]
predicts: [390 101 887 870 929] labels [390 101 887 870 841]
predicts: [145 149  21 476  80] labels [467 149  21 476  80]
predicts: [424 159 275 175 461] labels [424 159 275 175 461]
predict

predicts: [955 306 425 190 370] labels [955 306 425 190 370]
predicts: [187 971 534 396 657] labels [187 971 534 397 657]
predicts: [422 718 116 707 994] labels [840 718 116 836 994]
predicts: [419 454 214 285 641] labels [419 764 214 285 641]
predicts: [951 827  13 829 765] labels [951 882  13 829 624]
predicts: [216 665 521 268 468] labels [216 665 521 268 468]
predicts: [563 728 191 449 194] labels [418 728 356 449 194]
predicts: [362 952 924 249 524] labels [362 948 924 249 524]
predicts: [992 571 283 608 144] labels [992 571 283 608 129]
predicts: [486 859 498  21 467] labels [486 859 498  21 467]
predicts: [591 924 703  97 398] labels [591 924 556  97 898]
predicts: [586  10 202  67 649] labels [586  10 202  67 501]
predicts: [141 603 727 101 995] labels [141 603 727 101 995]
predicts: [278 965 238 424 489] labels [278 964 240 423 634]
predicts: [533 424 451 569 732] labels [533 424 451 555 732]
predicts: [514 803 310 551 753] labels [514 803 300 551 753]
predicts: [411 315 963 1

predicts: [959 638 646 480 645] labels [959 638 646 664 645]
predicts: [449 483 852 397 311] labels [718 483 852 392 311]
predicts: [457 352  23 934 283] labels [457 352  22 934 283]
predicts: [802 742 276 234 751] labels [802 553 276 236 751]
predicts: [106 592 328 967 251] labels [343 528 328 969 558]
predicts: [163 328 771 726 977] labels [163 328 771 726 977]
predicts: [475 266 686 782 977] labels [875 265 686 590 975]
predicts: [620 637  39 115 998] labels [620 637  39 115 937]
predicts: [274 277 763 579 646] labels [272 277 763 789 646]
predicts: [234 894 647 504 937] labels [213 493 647 504 937]
predicts: [687 781 666 583 158] labels [687 781 666 583 158]
predicts: [825 212 659 257 436] labels [825 212 659 257 436]
predicts: [199 140 248 339 230] labels [196 140 248 339 230]
predicts: [361 909 935 445 656] labels [361 544 935 638 627]
predicts: [289 867 280 103 772] labels [289 867 272 103 584]
predicts: [180 448 449 771 118] labels [180 703 449 771 118]
predicts: [396 922  16 5

predicts: [281 891 956 187 367] labels [282 891 956 201 267]
predicts: [441 200 508 424 907] labels [969 200 673 423 907]
predicts: [ 57  27 982 863 322] labels [ 57  27 459 863 322]
predicts: [934 663 851 687 475] labels [934 663 424 687 837]
predicts: [483 645 124 306 930] labels [958 645 120 306 930]
predicts: [467 580 524 205 137] labels [121 694 524 205 137]
predicts: [849 620 377 586 916] labels [849 681 380 586 916]
predicts: [478 182 829 715 590] labels [478 182 874 715 590]
predicts: [111  19 162 915 730] labels [111  19 161 915 730]
predicts: [424 822 771 699 614] labels [678 822 818 699 601]
predicts: [518 233 921 624 679] labels [673 233 501 624 679]
predicts: [667 581 665 604 633] labels [400 581 665 903 622]
predicts: [611 800 899 669  81] labels [585 800 899 669  81]
predicts: [746 866 935 668 295] labels [746 595 935 668 295]
predicts: [893 266 628 987 367] labels [893 266 628 987 367]
predicts: [294 727  12 435 192] labels [294 727  12 876 186]
predicts: [589  70 129 4

In [20]:
# Recorded run: five crop (0.7352) > ten crop (0.72) > single crop (0.688).
# NOTE(review): the single-crop figure comes from 250 batches of 1 image, the
# multi-crop figures from 250 batches of 5 images, so the comparison is not
# over the same set of images.
print(accuracyFive)
print(accuracyTen)

0.7352
0.72


In [None]:
"""For which datasets is mirroring a bad augmentation idea?"""

In [None]:
"""
Datasets whose labels depend on orientation along an axis. For instance, a road sign
that points left points right once it is flipped. For recognising a general object
class, such as "traffic sign", mirroring is fine, but if the model has to learn directions
it is a bad idea, because flipping changes the underlying meaning/representation.
"""

In [36]:
#[3]Problem 3: Densenet seems like a good net to net some results
dataInstance5 = DatasetCV(filen, nm,transform=five_2)
dataloader5 = utils.DataLoader(dataInstance5, 5,
                        shuffle=False, num_workers=0)
# dn = models.densenet161(pretrained=True)
Dense_result = evaluate_multi(dn,dataloader5,5 ,250)
resnet_result  = evaluate_multi(resnet18,dataloader5,5,250)

Start evaluation
predicts: [ 65 795 230 969 516] labels [ 65 970 230 809 516]
predicts: [ 60 334 852 674 153] labels [ 57 334 415 674 332]
predicts: [109 286 370 757 595] labels [109 286 370 757 595]
predicts: [147 327  23 478 517] labels [147 108  23 478 517]
predicts: [334 208 948 727  23] labels [334 173 948 727  23]
predicts: [846 270 166  60 708] labels [846 270 167  55 858]
predicts: [324 573 150 981 586] labels [324 573 150 981 586]
predicts: [887  26 398 597  74] labels [887  32 398 777  74]
predicts: [431 756 129 198 256] labels [516 756 129 198 256]
predicts: [505 565 162 717 467] labels [725 565 167 717 394]
predicts: [ 92  29 844 591 359] labels [ 92  29 844 591 358]
predicts: [468 154 994 840 588] labels [468 259 994 872 588]
predicts: [474 221 107  46 842] labels [474 183 107  46 842]
predicts: [390 101 887 870 841] labels [390 101 887 870 841]
predicts: [467 149  21 476  80] labels [467 149  21 476  80]
predicts: [424 159 275 175 461] labels [424 159 275 175 461]
predict

predicts: [733 681 684  78 668] labels [733 681 512  78 668]
predicts: [699 746  39 828 330] labels [699 746  46 618 330]
predicts: [615 756  62 116 127] labels [615 427  62 116 127]
predicts: [955 306 425 190 370] labels [955 306 425 190 370]
predicts: [187 971 534 396 657] labels [187 971 534 397 657]
predicts: [840 718 116 837 994] labels [840 718 116 836 994]
predicts: [419 860 214 285 641] labels [419 764 214 285 641]
predicts: [951 882  13 829 884] labels [951 882  13 829 624]
predicts: [216 671 521 268 468] labels [216 665 521 268 468]
predicts: [418 728 357 449 194] labels [418 728 356 449 194]
predicts: [362 948 924 248 524] labels [362 948 924 249 524]
predicts: [992 571 283 608 129] labels [992 571 283 608 129]
predicts: [486 859 498  21 467] labels [486 859 498  21 467]
predicts: [591 924 703  97 455] labels [591 924 556  97 898]
predicts: [586  10 202  68 649] labels [586  10 202  67 501]
predicts: [141 603 727 101 995] labels [141 603 727 101 995]
predicts: [278 964 240 4

predicts: [473 159 872 878 201] labels [473 159 872 878 201]
predicts: [578  70 486 632 428] labels [906  70 486 632 608]
predicts: [122 954 227 447 253] labels [122 720 227 686 173]
predicts: [959 638 646 480 645] labels [959 638 646 664 645]
predicts: [449 483 852 397 311] labels [718 483 852 392 311]
predicts: [457 352  23 934 283] labels [457 352  22 934 283]
predicts: [802 742 276 234 751] labels [802 553 276 236 751]
predicts: [106 528 328 969 251] labels [343 528 328 969 558]
predicts: [163 328 771 726 977] labels [163 328 771 726 977]
predicts: [475 266 686 590 977] labels [875 265 686 590 975]
predicts: [620 637  39 115  55] labels [620 637  39 115 937]
predicts: [274 277 763 579 646] labels [272 277 763 789 646]
predicts: [234 894 647 504 937] labels [213 493 647 504 937]
predicts: [687 781 666 583 158] labels [687 781 666 583 158]
predicts: [825 212 659 257 468] labels [825 212 659 257 436]
predicts: [199 140 248 339 231] labels [196 140 248 339 230]
predicts: [361 909 935 4

predicts: [819 664 839 299 501] labels [601 526 839 299 578]
predicts: [112 921 632 867 227] labels [112 960 632 867 273]
predicts: [ 61 427 367 924 880] labels [ 61 427 367 924 413]
predicts: [327 855 654 131 874] labels [ 34 773 654 131 874]
predicts: [281 891 956 187 367] labels [282 891 956 201 267]
predicts: [441 226 508 424 907] labels [969 200 673 423 907]
predicts: [ 57  27 982 863 322] labels [ 57  27 459 863 322]
predicts: [934 663 851 687 475] labels [934 663 424 687 837]
predicts: [483 645 396 306 930] labels [958 645 120 306 930]
predicts: [467 580 524 205 137] labels [121 694 524 205 137]
predicts: [849 620 380 586 916] labels [849 681 380 586 916]
predicts: [478 182 829 715 487] labels [478 182 874 715 590]
predicts: [111  19 162 915 428] labels [111  19 161 915 730]
predicts: [424 822 840 699 887] labels [678 822 818 699 601]
predicts: [607 233 921 624 679] labels [673 233 501 624 679]
predicts: [400 581 665 903 633] labels [400 581 665 903 622]
predicts: [889 800 899 6

In [37]:
# Accuracies returned by evaluate_multi for resnet18 and the densenet model on dataloader5.
print("resnet:",resnet_result,"densenet:",Dense_result)

resnet: 0.7016 densenet: 0.7952
