In [42]:
import torch
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset
import random
import numpy as np

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def extract_features(loader, model):
    features = []
    labels = []
    with torch.no_grad():
        for data, target in loader:
            output = model(data)
            output = output.view(data.size(0), -1)  # Flatten the output
            features.append(output)
            labels.append(target)
    return torch.cat(features), torch.cat(labels)

from torchvision import models
model = models.vgg16(pretrained=True)
model.classifier = torch.nn.Identity()  




In [45]:
np.random.seed(281)
dataset = CIFAR10(root='./data', train=True, transform=transform, download=True)
indices = np.random.permutation(len(dataset)) #with no permutation
graphDB_indices = indices[:50] #50 images as Graph DB
graphDB_subset = Subset(dataset, graphDB_indices)
graphDB_loader = DataLoader(graphDB_subset, batch_size=16, shuffle=True)
query_indices = indices[50:55] #5 images for query
query_subset = Subset(dataset, query_indices)
query_loader = DataLoader(query_subset, batch_size=16, shuffle=True)

graphDB_features, graphDB_labels = extract_features(graphDB_loader, model)
query_featuers, query_labels = extract_features(query_loader, model)

Files already downloaded and verified


In [56]:
#LSH med random projection 
def random_projection_lsh(data:list[list[float]], n_hash:int=20, seed = 1) -> list[int]:
    """"    
    data : a single image as a matrix of floats
    n_hash : size of the hash
    returns: list of binary values
    """
    np.random.seed(seed)
    n_dimensions = data.shape[1]
    random_vectors = np.random.randn(n_hash, n_dimensions)
    projections = np.dot(data, random_vectors.T)
    hash_codes = (projections > 0).astype(int)
    result_tensor = torch.tensor(hash_codes) #numpy arrays to 2 dimensional tensors
    return result_tensor

In [60]:
hamming_codes = random_projection_lsh(graphDB_features)
query_codes = random_projection_lsh(query_featuers)

In [62]:
query_codes

tensor([[1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1],
        [1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1],
        [1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1],
        [1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1]],
       dtype=torch.int32)

In [102]:
import torch.nn as nn
import torch.optim as optim

def hamming_distance(b1, b2):
    if len(b1) != len(b2):
        raise ValueError("Input not in same length")
    return sum(b1 != b2 for b1,b2 in zip(b1,b2)) #zip create parelle interator like (a,1), (b,2) etc.

def calculate_similarity(query_codes, hamming_codes):
    '''
    Hamming codes: Graph database binary codes
    query codes: binary code for the query picture
    '''
    # Initialize the similarity matrix with zeros
    similarity_matrix = torch.zeros(query_codes.size(0), hamming_codes.size(0))  # shape (num_queries, num_codes)

    for i, query in enumerate(query_codes):  # Iterate over each query
        for j, code in enumerate(hamming_codes):  # Iterate over each Hamming code
            distance = hamming_distance(query, code)  # Calculate the Hamming distance
            similarity_matrix[i, j] = distance  # Fill the similarity matrix

    return similarity_matrix  # Return the similarity matrix

def mean_average_precision(similarity_matrix, query_labels, graphDB_labels): 
    #input: similairty matrix: sim matrix for object i and j
    #query labels, 
    #hamming labels are training labels
    average_precision = []

    query_labels = torch.tensor(query_labels)
    graphDB_labels = torch.tensor(graphDB_labels)

    for i in range(0, 4): #interate thourgh number of queries
        similarities = similarity_matrix[i] #get simlarity for the current query
        sorted_indices = similarities.argsort(descending=True) #for each similarity 
        sorted_labels = graphDB_labels[sorted_indices]

        relevant_indices = (sorted_labels == query_labels[i]).nonzero(as_tuple=True)[0] #store kth indexes in sorted G' where True occurs 

        if len(relevant_indices) > 0:
            precision_at_k = torch.arange(1, len(relevant_indices)+1).float()/(relevant_indices+1) #incrementally compute mean of relevant indices, for example if you have 1, 5, 7, relevant indices, then you get 1 + 0.0...+2/6 +...3/8   
            #torch arrange is Returns a 1-D tensor of size end - start/step, with values from the interval [start, end) taken with common difference step beginning from start.
            average_precision.append(precision_at_k.mean().item())
        else:
            average_precision.append(0.0) 

    return torch.tensor(average_precision).mean().item() #average over all mean average for each query stored in average_precision


In [64]:
sim_matrix = calculate_similarity(query_codes = query_codes, hamming_codes = hamming_codes) #each contains hamming distances for between graphDB and query image

In [103]:
mean_average_precision(sim_matrix, query_labels, graphDB_labels)

  query_labels = torch.tensor(query_labels)
  graphDB_labels = torch.tensor(graphDB_labels)


0.10090954601764679

In [75]:
sim_matrix

tensor([[10.,  7.,  7.,  8.,  7., 10.,  6.,  6.,  6., 10., 11., 11., 12.,  7.,
          4., 10.,  8.,  7., 10.,  5.,  5.,  8.,  6.,  9.,  6.,  5.,  6.,  8.,
          9.,  9.,  9.,  9.,  8., 10.,  8.,  7., 10.,  3.,  7.,  5.,  9., 11.,
          9.,  4.,  9.,  7.,  6.,  6.,  8.,  9.],
        [10.,  7.,  7.,  8.,  9.,  8.,  6.,  6.,  6.,  8., 11., 11., 12.,  9.,
          8.,  8.,  8.,  5.,  6.,  9.,  3., 10.,  8.,  7., 10.,  7.,  6.,  6.,
          7.,  7.,  9.,  7.,  8.,  8., 10.,  9., 10.,  7.,  7.,  7.,  9., 11.,
         13.,  6.,  9.,  7.,  4.,  8.,  8.,  7.],
        [ 9., 10.,  6.,  7.,  6.,  9.,  7.,  7.,  9.,  9., 10., 10., 13., 12.,
          9., 13., 11.,  4.,  7.,  6.,  4.,  9.,  7.,  8.,  9.,  4.,  7.,  5.,
         10.,  8., 12.,  6.,  5., 13.,  9.,  8., 11.,  6., 10.,  6., 10., 12.,
         10.,  5.,  8., 12.,  7.,  7.,  7., 10.],
        [13., 12., 10., 11., 12., 11.,  7.,  9.,  9.,  9., 12., 12., 11.,  8.,
          7., 11.,  9.,  8.,  5., 14.,  8., 11.,  9.,  6., 1

In [70]:
torch.arange(1,10+1)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [39]:
#First make a subset for train test validation split, then a subset for query database
#use indices tracking to implement to ensure no replacement
np.random.seed(281)
dataset = CIFAR10(root='./data', train=True, transform=transform, download=True)
indices = np.random.permutation(len(dataset)) #with no permutation

training_indices = indices[:180]
#validation_indices = indices[180:240]
#test_indices = indices[240:300]
query_indices = indices[300:330]
#graphDB_indices = indices[360:460]

train_subset = Subset(dataset, training_indices)
#validation_subset = Subset(dataset, validation_indices)
#test_subset = Subset(dataset, test_indices)
query_subset = Subset(dataset, query_indices)
#graphDB_subset = Subset(dataset, graphDB_indices)

train_loader = DataLoader(train_subset, batch_size=16, shuffle=True)
#valid_loader = DataLoader(validation_subset, batch_size=16, shuffle=True)
#test_loader = DataLoader(test_subset, batch_size=16, shuffle=True)
query_loader = DataLoader(query_subset, batch_size=16, shuffle=True)
#graphDB_loader = DataLoader(graphDB_subset, batch_size=16, shuffle=True)

train_features, train_labels = extract_features(train_loader, model)
#valid_featuers, valid_labels = extract_features(valid_loader, model)
#test_featuers, test_labels = extract_features(test_loader, model)
query_featuers, query_labels = extract_features(query_loader, model)
#DB_featuers, DB_labels = extract_features(graphDB_loader, model)

Files already downloaded and verified


In [None]:
class self_defined_CNN(nn.Module): #kan bruges senere
    def __init__(self, output_dim): #output_dim: length of binary codes generated
        super(HashingNet, self).__init__() 
        self.conv1 = nn.Conv2d(3, 32, 5, stride=1) ##3 convolutaion pooling layers with 32, 32, and 64 5X5 filters 
        self.pool1 = nn.MaxPool2d(3, stride=2) #2D max pool filter 3X3 stride 2

        self.conv2 = nn.Conv2d(32, 32, 5, stride=1)
        self.pool2 = nn.AdaptiveAvgPool2d(3, stride=2)

        self.conv3 = nn.Conv2d(32, 64, 5, stride=1)
        self.pool3 = nn.AdaptiveAvgPool2d(3, stride=2)

        self.fc1 = nn.Linear(64 * 3 * 3, 500) #
        self.fc2 = nn.Linear(500, output_dim) #number of binary codes you'd to generated
    
    def forward(self, x):

        x = self.pool1(torch.relu(self.conv1(x)))  #apply ReLU on reuslts of a convulutional layer
        x = self.pool2(torch.relu(self.conv2(x)))
        x = self.pool3(torch.relu(self.conv3(x)))

        x = x.view(x.size(0), -1) #flatten the tensor to feed to fully connected layer
        x = torch.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        return 

In [None]:
#LSH med random projection 
def random_projection_lsh(data:list[list[float]], n_hash:int=64, seed = 1) -> list[int]:
    """"    
    data : a single image as a matrix of floats

    n_hash : size of the hash
    
    returns: list of binary values
    """
    np.random.seed(seed)
    n_dimensions = data.shape[1]
    random_vectors = np.random.randn(n_hash, n_dimensions)
    projections = np.dot(data, random_vectors.T)
    hash_codes = (projections > 0).astype(int)
    return hash_codes 


In [7]:
import torch.nn as nn
import torch.optim as optim

def hamming_distance(b1, b2):
    if len(b1) != len(b2):
        raise ValueError("Input not in same length")
    return sum(b1 != b2 for b1,b2 in zip(b1,b2)) #zip create parelle interator like (a,1), (b,2) etc.

def calculate_similarity(query_codes, hamming_codes):
    '''
    Hamming codes: Graph database binary codes
    query codes: binary code for the query picture
    '''
    similarity_matrix = []  #similarity matrix generated
    for query in query_codes: #for each vector in feature 
        distances = []
        for code in hamming_codes:
            distance = hamming_distance(query, code) #the hamming distance between feature and hamming space
            distances.append(distance) #distance added to similarility matrice
        similarity_matrix.append(distances)
    return torch.stack(similarity_matrix) #each sim vector will be concanated to the new matrice 50X50

def mean_average_precision(similarity_matrix, query_labels): 
    #input: similairty matrix: sim matrix for object i and j
    #query labels, 
    #hamming labels are training labels
    average_precision = []
    for i, similarities in enumerate(similarity_matrix):
        sorted_indices = similarities.argsort(descending=True) #for each similarity 
        sorted_labels = query_labels[sorted_indices]
        relevant_indices = (sorted_labels == query_labels[i]).nonzero(as_tuple=True)[0] #find the indices where sorted labels match the query labels for caluclate TP TN etc.
        

        precision_at_k = torch.arange(1, len(relevant_indices) + 1).float() / (relevant_indices + 1)
        average_precision.append(precision_at_k.mean().item())
    return torch.tensor(average_precision).mean().item()

