<a href="https://colab.research.google.com/github/umeshrawat/AI_Math_Vedas/blob/master/CV_3_Assignment_Question.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Library

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader, Dataset
from torchvision.models import vgg16
from torchvision.datasets import ImageFolder
import numpy as np
import random
import torchvision.transforms as transforms
import torch.nn.functional as F

# Read Data From Google Drive

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Drive folder that holds the assignment assets.
assets_dir = '/content/drive/MyDrive/CV-3/MCQs and Assignment/assets/'

Mounted at /content/drive


In this assignment, you are tasked with finding the top-k similar images for every image in the test set of the AT&T dataset, using a Siamese Network with a VGG backbone. Instead of the Contrastive Loss Function learnt in class, you will need to use the Triplet Loss Function. You need to use Cosine Similarity to find the Top-K similar images for each image and display them.

Siamese Networks have not been covered in class yet, but feel free to read up on them and on how they can be used for Image Similarity.



## Unzip the dataset

In [None]:
import os
import zipfile

# Path to the zip file you want to unzip
zip_file_path = os.path.join(assets_dir, "AT&T.zip")

# Target folder where you want to extract the contents
target_folder = "/content/dataset"

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(target_folder)

# Check that the archive produced the expected top-level folder.
# `os` is imported in this cell because the check below needs it.
assert os.path.isdir(os.path.join(target_folder, 'AT&T')), "The unzipped folder cannot be found"

In [None]:
# Train / test image folders inside the extracted AT&T dataset.
training_dir = "/content/dataset/AT&T/train/"
testing_dir = "/content/dataset/AT&T/test/"

## Siamese Network with VGG backbone

**Task**: The below class defines a VGG backbone for the Siamese Network where you will need to replace the last layer from the VGG model and then add a few dense layers to it along with regularization using Dropout and Batch Normalization.

In [None]:
class SiameseNetwork(nn.Module):
    """
    Siamese embedding network with a VGG-16 backbone.

    The same weights are applied to every image of a triplet, so the anchor, the positive and the
    negative are all mapped into one shared embedding space.
    """

    # Length of the flattened backbone output. torchvision's VGG-16 ends its convolutional stack with
    # 512 channels and follows it with an adaptive average pool that emits a 7x7 map.
    BACKBONE_FEATURES = 512 * 7 * 7

    def __init__(self, embedding_dim=128, hidden_dim=512, dropout=0.5):
        """
        Build the backbone and the embedding head.

        The network consists of a VGG-16 model with its last child module removed, followed by
        Batch Normalization and fully connected layers (with Dropout) that produce the embedding.

        Parameters:
            embedding_dim (int): Size of the embedding returned for each image.
            hidden_dim (int): Width of the hidden dense layer of the head.
            dropout (float): Dropout probability applied after the hidden dense layer.

        """
        super(SiameseNetwork, self).__init__()
        vgg = vgg16(pretrained=True)
        # Get the list of layers in the VGG-16 model.
        layers = list(vgg.children())
        # Remove the last layer from the list of layers.
        layers = layers[:-1]

        # Create a new model from the list of layers.
        self.backbone = torch.nn.Sequential(*layers)  # This is the VGG backbone

        # Normalise the raw backbone features before the dense head.
        self.bn1 = nn.BatchNorm1d(self.BACKBONE_FEATURES)
        # Dense head: project the backbone features down to the embedding size.
        self.fc1 = nn.Sequential(
            nn.Linear(self.BACKBONE_FEATURES, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embedding_dim),
        )
        # Normalise the final embedding.
        self.bn2 = nn.BatchNorm1d(embedding_dim)

    def forward_on_single_image(self, x):
        """
        Perform forward pass for a batch of input images.

        Parameters:
            x (torch.Tensor): The input image tensor; the first dimension is the batch dimension.

        Returns:
            torch.Tensor: The output feature tensor obtained after passing through the network layers.

        """
        x = self.backbone(x)
        # Flatten everything except the batch dimension.
        x = torch.flatten(x, 1)
        x = self.bn1(x)
        x = self.fc1(x)
        x = self.bn2(x)
        return x

    def forward(self, input1, input2, input3):
        """
        Perform forward pass for a triplet of input images.

        Parameters:
            input1 (torch.Tensor): The first input image tensor (the anchor during training).
            input2 (torch.Tensor): The second input image tensor (the positive during training).
            input3 (torch.Tensor): The third input image tensor (the negative during training).

        Returns:
            tuple: A tuple containing the three output feature tensors, in the same order as the inputs,
                   obtained after passing each input image through the shared network.

        """
        output1 = self.forward_on_single_image(input1)
        output2 = self.forward_on_single_image(input2)
        output3 = self.forward_on_single_image(input3)
        return output1, output2, output3


## Siamese Dataset
**Task**: You need to create a dataset from the images where it should contain three faces, in which 2 are similar and the 3rd is dissimilar thus helping train the Siamese Network to discern the differences between the 2 images that are different and the similarity between the 2 images which are similar.

In [None]:
# Siamese Network Dataset
class SiameseNetworkDataset(Dataset):
    """
    Custom dataset that serves (anchor, positive, negative) image triplets.

    For every sample of the wrapped dataset, which acts as the anchor, one image with the same label
    (the positive) and one image with a different label (the negative) are drawn at random on each
    access.

    Parameters:
        dataset (Dataset): The original (PyTorch) dataset containing images and their labels.

    """
    def __init__(self, dataset):
        """
        Initialize the Siamese Network dataset.

        Parameters:
            dataset (Dataset): The original (PyTorch) dataset containing images and their labels.
                It must support ``len()`` and integer indexing that returns ``(image, label)``.

        """
        self.dataset = dataset

        # Prefer a ready-made label list (torchvision's ImageFolder exposes one as `targets`) so the
        # images do not have to be loaded just to read their labels.
        targets = getattr(dataset, "targets", None)
        if targets is None:
            targets = [int(dataset[i][1]) for i in range(len(dataset))]
        self.labels = torch.as_tensor(targets)

        # Map every class label to the indices of its samples, for fast positive/negative sampling.
        self.indices_by_label = {}
        for sample_index, label in enumerate(self.labels.tolist()):
            self.indices_by_label.setdefault(label, []).append(sample_index)

    def __getitem__(self, index):
        """
        Get a single sample from the Siamese Network dataset.

        Parameters:
            index (int): The index of the sample to retrieve.

        Returns:
            tuple: A tuple containing three images. The first element is the anchor image (img1), the second element is
                   the image which has the same label as the first image i.e. the positive image and the third element
                   is the image which has a different label than the first image i.e. the negative image.

        Raises:
            ValueError: If the wrapped dataset contains no sample with a label different from the anchor's.

        """
        anchor, _ = self.dataset[index]
        anchor_label = int(self.labels[index])

        # Positive: another sample of the same class. Fall back to the anchor itself only when it is
        # the sole member of its class.
        positive_pool = [i for i in self.indices_by_label[anchor_label] if i != index]
        positive_index = random.choice(positive_pool) if positive_pool else index

        # Negative: pick a different class at random, then a random sample of that class.
        other_labels = [label for label in self.indices_by_label if label != anchor_label]
        if not other_labels:
            raise ValueError("A negative sample requires at least two distinct labels in the dataset")
        negative_index = random.choice(self.indices_by_label[random.choice(other_labels)])

        positive, _ = self.dataset[positive_index]
        negative, _ = self.dataset[negative_index]
        return anchor, positive, negative

    def __len__(self):
        """
        Get the total number of samples in the dataset.

        Returns:
            int: The total number of samples in the dataset.

        """
        return len(self.dataset)


**Task**: Write the forward function in the TripletLoss class that takes the three images (anchor, positive, negative) from the dataset as input and returns the calculated loss.

### Setting the Transformations

In [None]:
# Setting the transformations
# Every image is resized to 100x100 and then converted to a tensor.
transform=transforms.Compose([transforms.Resize((100,100)),
                              transforms.ToTensor()
                            ])

### Dataset

Datasets are the collections of your training, validation, and test data. They consist of input samples and their corresponding target labels (for supervised learning). In PyTorch, datasets are typically created using custom classes inheriting from `torch.utils.data.Dataset`. You load your data into this class, allowing easy access during training.

In [None]:
# Build the train / test image datasets from their folders, applying the shared transform.
train_dataset = ImageFolder(training_dir, transform=transform)
test_dataset = ImageFolder(testing_dir, transform=transform)

# Report how many images each split contains.
print(len(train_dataset), len(test_dataset))

# Create Siamese datasets
# Each one wraps an image dataset so that indexing it yields (anchor, positive, negative).
train_siamese_dataset = SiameseNetworkDataset(train_dataset)
test_siamese_dataset = SiameseNetworkDataset(test_dataset)

370 30


### Dataloaders

Data Loaders wrap your dataset and provide functionalities for iterating through batches of data during training. They handle shuffling, batching, and parallel data loading, optimizing the data pipeline.

In [None]:
# Create data loaders
batch_size = 64
# The test loader serves one triplet per batch; the evaluation cells below rely on this.
test_batch_size = 1

# NOTE(review): num_workers=8 may exceed the CPU count of the runtime — confirm it suits the machine.
train_loader = DataLoader(train_siamese_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
test_loader = DataLoader(test_siamese_dataset, batch_size=test_batch_size, shuffle=False)

print("Train Dataset for Siamese Network created.", len(train_siamese_dataset))
print("Test Dataset for Siamese Network created.", len(test_siamese_dataset))

Train Dataset for Siamese Network created. 370
Test Dataset for Siamese Network created. 30




### Loss Function

Loss functions measure the difference between the predicted output and the actual target values. Common loss functions include Cross-Entropy Loss for classification tasks and Mean Squared Error for regression tasks.

### Defining the Triplet Loss Function

The Triplet Loss function is a key component in training Siamese Networks and related architectures for learning similarity and dissimilarity among data points. Similar to the Contrastive Loss, the Triplet Loss aims to shape the embedding space in a way that similar samples are closer together, while dissimilar samples are pushed farther apart.

In the context of the Triplet Loss, each training sample is referred to as an "anchor." For each anchor, we identify a "positive" sample (similar to the anchor) and a "negative" sample (dissimilar to the anchor). The goal of the Triplet Loss is to ensure that the distance between the anchor and the positive sample is smaller than the distance between the anchor and the negative sample by a certain margin.

Mathematically, for an anchor sample \(A\), a positive sample \(P\), and a negative sample \(N\), the Triplet Loss is computed as:

$$L(A, P, N) = \max\left( \|f(A) - f(P)\|^2 - \|f(A) - f(N)\|^2 + \text{margin},\ 0 \right)$$

Here, \(f(\cdot)\) represents the embedding function learned by the neural network, and \(\| \cdot \|\) denotes the Euclidean norm, so \(\|f(A) - f(P)\|\) is the Euclidean distance between the embedded vectors. The margin is a hyperparameter that specifies the minimum desired separation between positive and negative samples.

In summary, the Triplet Loss guides the network to learn embeddings in such a way that the positive sample is pulled closer to the anchor while the negative sample is pushed away. This encourages the network to map similar samples together in the embedding space and dissimilar samples apart, resulting in meaningful representations that capture the inherent structure of the data's similarity relationships.

In [None]:
class TripletLoss(nn.Module):
    """
    Triplet Loss function for training Siamese Networks with triplet sampling.

    The Triplet Loss encourages the model to minimize the distance between the anchor and the positive example
    while maximizing the distance between the anchor and the negative example. This helps in learning a suitable
    embedding space where similar examples are closer and dissimilar examples are farther apart.

    Parameters:
        margin (float): The margin value that defines the desired separation between positive and negative pairs.

    """
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin


    def forward(self, anchor, positive, negative):
        """
        Compute the Triplet Loss based on the anchor, positive, and negative examples.

        For each triplet the loss is max(||a - p||^2 - ||a - n||^2 + margin, 0), using squared
        Euclidean distances over the last dimension; the result is averaged over all triplets.

        Parameters:
            anchor (torch.Tensor): Embeddings of the anchor examples.
            positive (torch.Tensor): Embeddings of the positive examples.
            negative (torch.Tensor): Embeddings of the negative examples.

        Returns:
            torch.Tensor: Computed Triplet Loss (a scalar tensor).

        """
        positive_distance = (anchor - positive).pow(2).sum(dim=-1)
        negative_distance = (anchor - negative).pow(2).sum(dim=-1)
        # Triplets that already satisfy the margin contribute zero loss.
        per_triplet = torch.clamp(positive_distance - negative_distance + self.margin, min=0)
        return per_triplet.mean()

In [None]:
# Define the loss function
# Uses the TripletLoss default margin of 1.0.
triplet_loss = TripletLoss()

In [None]:
# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create an instance of the Siamese Network with VGG16 backbone
# and move its parameters to the selected device.
model = SiameseNetwork().to(device)

### Optimizer

Optimizers are algorithms that adjust the model's parameters during training to minimize the loss function. Common optimizers include SGD (Stochastic Gradient Descent), Adam, and RMSprop.

In [None]:
# Adam over all model parameters with a learning rate of 0.0005.
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [None]:
def train_batch(epoch, model, optimizer, loss_history):
    """
    Train the model for one pass over ``train_loader`` and record the mean batch loss.

    Relies on the module-level ``train_loader``, ``device`` and ``triplet_loss``.

    Parameters:
        epoch (int): Epoch number, used only for the progress message.
        model (nn.Module): Network that maps (anchor, positive, negative) to three embeddings.
        optimizer (torch.optim.Optimizer): Optimizer that updates the model's parameters.
        loss_history (list): The mean batch loss of this pass is appended to it (NaN if the
            loader yielded no batch).

    """
    print("epoch ", epoch)
    model.train()
    train_loss = 0.0
    num_batches = 0

    for anchor, positive, negative in train_loader:
        anchor = anchor.to(device)  # Move the anchor image to the device
        positive = positive.to(device)  # Move the positive image to the device
        negative = negative.to(device)  # Move the negative image to the device

        optimizer.zero_grad()  # Zero the gradients to prevent accumulation from previous iterations
        output_anchor, output_pos, output_neg = model(anchor, positive, negative)  # Forward pass: get the output feature vectors for all three images
        loss = triplet_loss(output_anchor, output_pos, output_neg)  # Calculate the Triplet Loss
        loss.backward()  # Backward pass: compute gradients of the loss with respect to model parameters
        optimizer.step()  # Update the model's parameters using the computed gradients

        train_loss += loss.item()
        num_batches += 1

    # Record the same quantity that is printed (the mean over batches), so that the training and
    # validation histories stay comparable even though the two loaders yield different batch counts.
    mean_loss = train_loss / num_batches if num_batches else float('nan')
    print('Train Loss: %.3f' % mean_loss)
    loss_history.append(mean_loss)


In [None]:
def validate_batch(epoch, model, loss_history):
    """
    Evaluate the model on ``test_loader`` without gradient tracking and record the mean batch loss.

    Relies on the module-level ``test_loader``, ``device`` and ``triplet_loss``.

    Parameters:
        epoch (int): Epoch number; accepted for symmetry with ``train_batch`` and not used.
        model (nn.Module): Network that maps (anchor, positive, negative) to three embeddings.
        loss_history (list): The mean batch loss of this pass is appended to it (NaN if the
            loader yielded no batch).

    """
    model.eval()
    test_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for anchor, positive, negative in test_loader:
            anchor = anchor.to(device)  # Move the anchor image to the device
            positive = positive.to(device)  # Move the positive image to the device
            negative = negative.to(device)  # Move the negative image to the device

            output_anchor, output_pos, output_neg = model(anchor, positive, negative)  # Forward pass: get the output feature vectors for all three images
            loss = triplet_loss(output_anchor, output_pos, output_neg)  # Calculate the Triplet Loss

            test_loss += loss.item()
            num_batches += 1

    # Record the same quantity that is printed (the mean over batches), so that the training and
    # validation histories stay comparable even though the two loaders yield different batch counts.
    mean_loss = test_loss / num_batches if num_batches else float('nan')
    print('Val Loss: %.3f' % mean_loss)
    loss_history.append(mean_loss)

### Training the Model

In [None]:
# Initialize lists to track training progress
train_loss_history = [] # List to store the loss value during training
val_loss_history = [] # List to store the loss value during validation

# Training loop
num_epochs = 100

# Each epoch runs one training pass followed by one validation pass;
# each call appends one entry to its history list.
for epoch in range(num_epochs):
    train_batch(epoch, model, optimizer, train_loss_history)
    validate_batch(epoch, model, val_loss_history)

### Plotting the loss

In [None]:
import matplotlib.pyplot as plt

# Plot the loss values using Matplotlib (optional)
# The x axis is the 1-based epoch number; both histories are expected to hold one entry per epoch.
epochs = list(range(1, len(train_loss_history) + 1))
plt.figure(figsize=(8, 6))
plt.plot(epochs, train_loss_history, label='Train Loss')
plt.plot(epochs, val_loss_history, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

### Evaluating the Model

The Euclidean distance is a commonly used metric to quantify the similarity between two feature vectors in image analysis. It calculates the straight-line distance between the points in the feature space. Smaller distances indicate higher similarity, while larger distances indicate dissimilarity. The Euclidean distance metric is intuitive and straightforward to compute, making it widely used in various applications such as image retrieval and clustering.

In [None]:
import torchvision
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt

In [None]:
# Evaluation
# Qualitative check: show the first test image next to each of nine other images, annotated with
# the Euclidean distance between their embeddings.
model.eval()

with torch.no_grad():
    # Note: Each batch contains only 1 image for test loader as set above
    for i, batch in enumerate(test_loader):

        # setting x0 to the first image which will be used to compare against other 9 images
        if i == 0:
          x0,_,_ = batch
          image1 = x0
          x0 = x0.to(device)
          continue

        # Limiting to 10 images
        if i == 10:
          break

        # NOTE(review): this takes the second element of the triplet (the positive slot) rather than
        # the anchor of batch i — confirm that this is intended.
        _,x1,_ = batch
        image2 = x1
        x1 = x1.to(device)
        # Stack the two images along the first (batch) dimension for visualization
        concatenated = torch.cat((image1,image2),0)
        # Forward pass through the Siamese Network to get the output feature vectors for both images
        output1 = model.forward_on_single_image(x0)
        output2 = model.forward_on_single_image(x1)

        # detach the tensor and move it to the CPU
        x1 = x1.detach().cpu()

        # Calculate the Euclidean distance between the output feature vectors
        euclidean_distance = F.pairwise_distance(output1, output2)
        concatenated_img = torchvision.utils.make_grid(concatenated)
        concatenated_img_text = 'Dissimilarity: {:.2f}'.format(euclidean_distance.item())

        # Convert from channels-first to channels-last layout for imshow.
        npimg = concatenated_img.numpy()
        plt.axis("off")
        if concatenated_img_text:
            plt.text(75, 8, concatenated_img_text, style='italic',fontweight='bold',
                bbox={'facecolor':'white', 'alpha':0.8, 'pad':10})
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    x0 = x0.detach().cpu()

Cosine Similarity is a widely employed metric for assessing the similarity between two feature vectors in image analysis and other domains. It quantifies the cosine of the angle between two vectors in the feature space. Higher cosine similarity values indicate greater alignment in direction and orientation, implying higher similarity between the vectors.

Compared to Euclidean distance, which measures the straight-line distance between points, Cosine Similarity focuses on the angle between vectors, disregarding their magnitudes. This means that even if the vectors have different lengths, they can still exhibit high cosine similarity if they are pointing in the same general direction.

Mathematically, the Cosine Similarity between two vectors \(A\) and \(B\) is computed as:

$$\text{Cosine Similarity}(A, B) = \frac{A \cdot B}{\|A\| \, \|B\|}$$

Where \(A . B\) represents the dot product of the two vectors, and \(\|A\|\) and \(\|B\|\) represent the magnitudes (or lengths) of the respective vectors.

Cosine Similarity values range between -1 and 1. A Cosine Similarity of 1 indicates that the vectors are perfectly aligned and have the same orientation. A Cosine Similarity of -1 indicates that the vectors are perfectly aligned but have opposite orientations. A Cosine Similarity of 0 implies that the vectors are orthogonal, indicating no similarity in direction.

Cosine Similarity finds use in diverse applications like text analysis, recommendation systems, and image retrieval. Its ability to measure the direction of similarity while ignoring magnitude variations makes it particularly suited for scenarios where magnitude differences are not as relevant as the alignment of features.

**Task**: Write the code to compute the cosine similarity between a random image and any 10 images.

In [None]:
# Using Cosine Similarity
# Qualitative check: show the first test image next to each of nine other images, annotated with
# the cosine similarity between their embeddings.
model.eval()

with torch.no_grad():
    # Note: Each batch contains only 1 image for test loader as set above
    for i, batch in enumerate(test_loader):
        # The first image is the reference that the following 9 images are compared against.
        if i == 0:
            x0, _, _ = batch
            image1 = x0
            x0 = x0.to(device)
            # The reference does not change inside the loop, so embed it only once.
            output1 = model.forward_on_single_image(x0)
            continue

        # Limiting to 10 images
        if i == 10:
            break

        _, x1, _ = batch
        image2 = x1
        x1 = x1.to(device)

        # Stack the two images along the first (batch) dimension for visualization
        concatenated = torch.cat((image1, image2), 0)

        # Forward pass through the Siamese Network to get the feature vector of the compared image
        output2 = model.forward_on_single_image(x1)

        # detach the tensor and move it to the CPU
        x1 = x1.detach().cpu()

        # Calculate the Cosine Similarity between the output feature vectors
        # (one value per row; each batch holds a single image, so .item() below is valid).
        cosine_similarity = F.cosine_similarity(output1, output2)
        concatenated_img = torchvision.utils.make_grid(concatenated)
        concatenated_img_text = 'Similarity: {:.8f}'.format(cosine_similarity.item())

        # Convert from channels-first to channels-last layout for imshow.
        npimg = concatenated_img.numpy()
        plt.axis("off")
        if concatenated_img_text:
            plt.text(75, 8, concatenated_img_text, style='italic', fontweight='bold',
                     bbox={'facecolor': 'white', 'alpha': 0.8, 'pad': 10})
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    x0 = x0.detach().cpu()

### Finding Top-5 Images

**Task**: Write the code to find the top-k images for all images in the test folder using cosine_similarity.

In [None]:
# Find and display the top-5 similar images for each image in the test folder using Cosine Similarity
top_k = 5
model.eval()

# Embed every test image exactly once, in dataset order, so that row i of `embeddings`
# corresponds to test_dataset[i].
embedding_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
with torch.no_grad():
    embeddings = torch.cat([
        model.forward_on_single_image(images.to(device)).cpu()
        for images, _ in embedding_loader
    ])

# Cosine similarity of all pairs at once: L2-normalise the rows, then take the matrix product.
normalised = F.normalize(embeddings, dim=1)
similarity = normalised @ normalised.T
# An image must not be returned as its own nearest neighbour.
similarity.fill_diagonal_(float('-inf'))

# Never ask for more neighbours than there are other images.
num_neighbours = min(top_k, len(test_dataset) - 1)
top_scores, top_indices = similarity.topk(num_neighbours, dim=1)

for query_index in range(len(test_dataset)):
    # squeeze=False keeps `axes` two-dimensional even when there is a single column.
    fig, axes = plt.subplots(1, num_neighbours + 1, figsize=(2.5 * (num_neighbours + 1), 3), squeeze=False)
    axes = axes[0]

    query_image, _ = test_dataset[query_index]
    # Convert from channels-first to channels-last layout for imshow.
    axes[0].imshow(query_image.permute(1, 2, 0).numpy())
    axes[0].set_title('Query')
    axes[0].axis('off')

    for ax, score, match_index in zip(axes[1:], top_scores[query_index], top_indices[query_index]):
        match_image, _ = test_dataset[int(match_index)]
        ax.imshow(match_image.permute(1, 2, 0).numpy())
        ax.set_title('Similarity: {:.4f}'.format(score.item()))
        ax.axis('off')

    plt.show()