In [1]:
# dataset
from torchvision import datasets

import joblib
import numpy as np

# visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

# preprocessing
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# model
import torch
from torch import nn
import torchvision.models as models

# dimensionality reduction
from sklearn.decomposition import PCA

# 1. Load Data & Preprocessing

**Dataset**: <a href="https://www.cs.toronto.edu/~kriz/cifar.html" target="_blank">CIFAR-10</a>

**Classes**: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck

The following transformations are used:
* **resize**
* **center crop**
* **normalization**: from [0, 255] to [0, 1]
* **standardization**: by subtracting the mean and dividing by the standard deviation of ImageNet

In [2]:
def load_data(data_dir):
    """ Build the CIFAR-10 train and test datasets with ImageNet-style preprocessing.

    Every image goes through the same pipeline:
        * resize to 224,
        * center crop to 224x224,
        * normalization: from [0, 255] to [0, 1] (conversion to a tensor),
        * standardization: subtract the ImageNet mean and divide by the ImageNet std.

    Args:
        data_dir:
            directory where the data is downloaded to / read from, as a string.

    Returns:
        train and test dataset, as pytorch dataset objects.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]

    preprocessing = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])

    # Same construction for both splits; only the `train` flag differs.
    trainset, testset = [
        datasets.CIFAR10(root=data_dir,
                         train=is_train,
                         download=True,
                         transform=preprocessing)
        for is_train in (True, False)
    ]

    return trainset, testset

In [3]:
# Download CIFAR-10 into ./cifar10 (if not already there) and build both splits.
trainset, testset = load_data(data_dir='cifar10')

# Printing a dataset shows its size, split and the transform pipeline attached to it.
print(f'\nTraining data:\n--------------\n{trainset}')
print(f'Test data:\n--------------\n{testset}')

Files already downloaded and verified
Files already downloaded and verified

Training data:
--------------
Dataset CIFAR10
    Number of datapoints: 50000
    Root location: cifar10
    Split: Train
    StandardTransform
Transform: Compose(
               Resize(size=224, interpolation=bilinear, max_size=None, antialias=None)
               CenterCrop(size=(224, 224))
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )
Test data:
--------------
Dataset CIFAR10
    Number of datapoints: 10000
    Root location: cifar10
    Split: Test
    StandardTransform
Transform: Compose(
               Resize(size=224, interpolation=bilinear, max_size=None, antialias=None)
               CenterCrop(size=(224, 224))
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )


In [4]:
batch_size = 64

# Both loaders are only used for inference (feature extraction) in this notebook,
# so the order is kept fixed: row i of the extracted features / predictions then
# corresponds to image i of the dataset, and the result is the same on every run.
train_dataloader = DataLoader(trainset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

# 2. VGG-16

We'll use VGG-16. Since the CIFAR-10 dataset has fewer classes than ImageNet, we'll keep all the layers **frozen**, except for the last one, which will be **unfrozen** and replaced by a layer with the right number of classes.

We've already trained a VGG-16 in a previous Jupyter notebook. The best model configurations are:
* **learning rate** (lr): 0.001
* **weight decay** (l2): 0.001

The results of our trained model are: 
* Best **epoch**: 61
* Best **train**: **accuracy**: 89.4%, **avg loss**: 0.317020
* Best **test**: **accuracy**: 85.2%, **avg loss**: 0.429216

The purpose of VGG-16 is to extract a visual embedding for each image. <br> 
We define a **visual embedding** as the features of an image, found by a deep-learning model, at the last fully connected layer (prior to a loss layer). <br>
In the case of VGG-16, we extract the visual embeddings from the output of `classifier[5]`, i.e. the activations of the **second fully connected layer** (the last hidden layer before the classification layer). Thus, the **embeddings vector is of size 4096**. To achieve this, we register a **hook** at this layer which stores the embeddings each time the model processes a batch of data. <br>
There are many ways to train an embedding layer of a visual deep-learning model. The most common methods make use of the **triplet loss** and the **contrastive loss**. However, for simplicity, we have reduced the problem to a classification problem and we make use of the **cross entropy loss**.

## 2.1 Define model

In [5]:
def set_parameter_requires_grad(model, feature_extracting):
    """ Freeze every parameter of the model when it is used as a feature extractor.

    In feature-extraction mode gradients are only wanted for a layer that is
    added afterwards, so all parameters that exist at call time get
    .requires_grad = False. When not feature extracting, the model is left untouched.

    Args:
        model:
            deep learning model, as pytorch object.
        feature_extracting:
            whether or not we're feature extracting, as boolean.
    """
    if not feature_extracting:
        # Fine-tuning the whole model: nothing to freeze.
        return

    for weights in model.parameters():
        weights.requires_grad = False

In [6]:
def initialize_model(num_labels, feature_extracting, pretrained=True):
    """ Create a VGG-16 whose final classifier layer matches our number of classes.

    The ImageNet-pretrained VGG-16 ends with an output layer of size 1000 (one node per class).
    That layer is replaced by a new linear layer with the same number of inputs
    and `num_labels` outputs.

    Args:
        num_labels:
            number of labels in our dataset, as integer.
        feature_extracting:
            flag for feature extracting (when False, we finetune the whole model,
            when True we only update the replaced layer params), as boolean.
        pretrained:
            whether or not we want the pretrained version of VGG-16, as boolean.

    Returns:
        VGG-16 model, as pytorch object
    """
    vgg = models.vgg16(pretrained=pretrained)

    # Freeze before swapping the head, so the new layer created below
    # keeps its default requires_grad=True.
    set_parameter_requires_grad(vgg, feature_extracting)

    head_index = 6
    old_head = vgg.classifier[head_index]
    vgg.classifier[head_index] = nn.Linear(old_head.in_features, num_labels)

    return vgg

## 2.2 Load model

In [7]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f'Using {device} device')

Using cuda device


In [8]:
# Settings passed to initialize_model for the VGG-16 used in this notebook.
num_labels = 10  # size of the replaced output layer
feature_extracting = True  # freeze every parameter except the replaced last layer
pretrained = True

# Build the model and move it to the selected device.
model = initialize_model(num_labels=num_labels, 
                         feature_extracting=feature_extracting, 
                         pretrained=pretrained).to(device)

print(f'Model architecture:\n{model}')

Model architecture:
VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, p

In [9]:
# Map the saved tensors onto the device chosen for this session instead of a
# hard-coded "cuda:0", so the weights also load on a CPU-only machine.
model.load_state_dict(torch.load('../saved-model/vgg16-weights.pth', map_location=device))

<All keys matched successfully>

## 2.3 Feature Extraction -- Register Hook

In [10]:
# Buffer filled by the forward hook: one numpy array of layer outputs per processed batch.
features = []

def get_features():
    """ Create a forward hook that stores the output of the hooked layer.

    Returns:
        a hook function with the (module, input, output) signature expected by
        register_forward_hook; on each call it appends the layer output, detached
        and copied to the CPU as a numpy array, to the notebook-level `features` list.
    """
    def hook(model, input, output):
        # `features` is resolved in the notebook namespace at call time, so
        # re-assigning `features = []` in a later cell is picked up here as well.
        features.append(output.detach().cpu().numpy())
    return hook

# Re-running this cell must not stack a second hook on the layer (each batch
# would then be recorded twice), so drop the handle left by a previous run first.
previous_handle = globals().get('hook_handle')
if previous_handle is not None:
    previous_handle.remove()

hook_handle = model.classifier[5].register_forward_hook(get_features())

<torch.utils.hooks.RemovableHandle at 0x17ce9000b08>

## 2.4 Feature Extraction -- Predict

In [11]:
def predict(dataloader, model, device):
    """ Predict class labels with a deep-learning model.

    The model is put in evaluation mode and run without gradient tracking
    over every batch of the dataloader.

    Args:
        dataloader:
            iterable yielding (inputs, targets) batches, e.g. a pytorch DataLoader object;
            the targets are ignored.
        model:
            deep learning model, as pytorch object.
        device:
            device the input batches are moved to before the forward pass (cpu, gpu),
            as torch.device or string.

    Returns:
         predictions, as a list of integers, in the order the dataloader yields the samples.
    """
    pred_concat = []

    model.eval()  # put on evaluation mode
    with torch.no_grad():
        for X, _ in dataloader:
            X = X.to(device)

            # predict class scores for the batch
            pred = model(X)

            # Index of the highest score per sample; converting the whole batch at
            # once avoids one device-to-host transfer per element.
            pred_concat.extend(pred.argmax(1).tolist())

    return pred_concat

In [12]:
# Start from an empty buffer so that re-running this cell does not stack a
# second copy of the embeddings on top of the ones already collected.
features = []

train_pred = predict(train_dataloader, model, device)

# The hook stored one array per batch; stack them into a single (n_images, n_features) matrix.
train_features = np.vstack(features)
print(f'Number of train images: {train_features.shape[0]}\nNumber of features per train image: {train_features.shape[1]}')

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Number of train images: 50000
Number of features per train image: 4096


In [13]:
# Reset the features list filled by the forward hook, so the next pass
# over a dataloader does not append to the embeddings collected so far.
features = []

In [14]:
# Start from an empty buffer so this cell gives the same result on its own,
# whether or not it is re-run or the buffer was reset beforehand.
features = []

test_pred = predict(test_dataloader, model, device)

# The hook stored one array per batch; stack them into a single (n_images, n_features) matrix.
test_features = np.vstack(features)
print(f'Number of test images: {test_features.shape[0]}\nNumber of features per test image: {test_features.shape[1]}')

Number of test images: 10000
Number of features per test image: 4096


# 3. Principal Component Analysis

The idea in this project is to index the visual embeddings of images in Elasticsearch, so that we can retrieve similar images, with respect to a query image.

However, **Elasticsearch can only handle vectors of up to 2048 dimensions**. So, we need to reduce the dimensions of the visual embeddings found by VGG-16, which are of size 4096.

To achieve this, we'll use **dimensionality reduction** techniques. Specifically, we'll make use of **Principal Component Analysis (PCA)** algorithm to reduce the dimensions of the visual embeddings to a **vector of size 2000**.

## 3.1 Define PCA

In [15]:
n_components = 2000

# With the default svd_solver='auto', scikit-learn may choose a randomized solver
# for this configuration; fixing the seed keeps the fitted projection (and the
# saved pca.joblib) reproducible across runs.
pca = PCA(n_components=n_components, random_state=42)

## 3.2 Train PCA

In [16]:
# Learn the projection from the train embeddings only.
pca.fit(train_features)

PCA(n_components=2000)

## 3.3 Transform with PCA

In [17]:
# Project the train embeddings onto the fitted principal components.
train_reduced_ftrs = pca.transform(train_features)

print(f'Number of reduced features per train image: {train_reduced_ftrs.shape[1]}')

Number of reduced features per train image: 2000


In [18]:
# Apply the projection fitted on the train embeddings to the test embeddings.
test_reduced_ftrs = pca.transform(test_features)

print(f'Number of reduced features per test image: {test_reduced_ftrs.shape[1]}')

Number of reduced features per test image: 2000


## 3.4 Save 

In [20]:
# Persist the fitted PCA object to disk next to the saved model weights.
joblib.dump(pca, '../saved-model/pca.joblib')

['../saved-model/pca.joblib']