## Imports

In [1]:
import os
import sys
import pathlib

import cv2
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

import torchvision
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt 
import albumentations as albu

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important;}</style>"))

# Fix to be able to import python modules inside a notebook
os.chdir('..')

# Useful extensions
# %load_ext watermark
# %watermark -v -n -m -p numpy,torch,albumentations,photosynthesis_metrics

# %load_ext autoreload
# %autoreload 2

# Nice plot formating
%matplotlib inline

In [3]:
!ls

configs  Makefile    preprocess.py  requirements.txt  train.py
data	 notebooks   __pycache__    src
logs	 predict.py  README.md	    tmp.txt


# Diffusion

In [138]:
import faiss
import numpy as np

import os
import time
import argparse
import pickle

from sklearn.preprocessing import normalize

import os
import time
import yaml
import argparse
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
# import pytorch_tools as pt
from loguru import logger

from src.datasets import get_val_dataloader, get_test_dataloader
from src.callbacks import cmc_score_count, map_at_k
from src.models import Model

In [222]:
# Test kNN and compare results with cdist search
config_path = pathlib.Path("logs/genet_normal_384_hard_arcface80_15")

# Read the validation dataframe saved next to the run's logs
df_val = pd.read_csv(config_path / "train_val.csv")
# NOTE(review): `eval` executes whatever expression is stored in the CSV cell.
# Only use this on files you produced yourself; if the column holds plain list
# literals, `ast.literal_eval` would be a safer parser — confirm the stored format first.
val_embeddings = np.array(list(map(eval, df_val["embeddings"].values)))
# The `np.bool` alias was removed in NumPy 1.24; the builtin `bool` is the same dtype.
query_mask = df_val["is_query"].values.astype(bool)
val_labels = df_val["label"].values

# Shape (n_embeddings, embedding_dim)
query_embeddings, gallery_embeddings = val_embeddings[query_mask], val_embeddings[~query_mask]
query_labels, gallery_labels = val_labels[query_mask], val_labels[~query_mask]
logger.info(f"Validation query size - {len(query_embeddings)}, gallery size - {len(gallery_embeddings)}")
# The split arrays above are copies (boolean indexing), so the full matrix can be released
del val_embeddings

2020-09-22 20:26:04.127 | INFO     | __main__:<module>:13 - Validation query size - 1134, gallery size - 15522


In [162]:
# Compare result of kNN and cdist
# NOTE(review): `kNN`, `map_at_k_2` and `conformity_matrix` are not defined in the
# visible part of this notebook — make sure their cells run before this one.
knn = kNN(embeddings=gallery_embeddings, distance='cosine')
# Rank the whole gallery for every query; derive the size from the data instead of
# hard-coding the gallery length of one particular run.
distances, ids = knn.search(query_embeddings, topk=len(gallery_embeddings))
distances = torch.tensor(distances)

map10 = map_at_k_2(torch.tensor(ids), conformity_matrix, topk=10)
mapR = map_at_k_2(torch.tensor(ids), conformity_matrix, topk=None)

logger.info(
    f"Val: mAP@10 {map10:0.5f}, mAP@R {mapR:0.5f}")

100%|██████████| 2/2 [00:00<00:00, 33.46it/s]
2020-09-22 19:45:17.243 | INFO     | __main__:<module>:10 - Val: mAP@10 0.95448, mAP@R 0.94316


In [1]:
" diffusion module "

import os
import numpy as np
import scipy.sparse as sparse
import scipy.sparse.linalg as linalg
from tqdm import tqdm
from sklearn import preprocessing

from loguru import logger

import multiprocessing
import functools

# Module-level state read by `get_single_score`. `Diffusion.get_offline_results`
# assigns these through a `global` statement before the per-item solves start.
trunc_ids = None
trunc_init = None
lap_alpha = None


def get_single_score(idx, truncation_size=5000):
    """Solve the truncated diffusion system for one item.

    Reads the module-level `trunc_ids`, `trunc_init` and `lap_alpha`, restricts
    `lap_alpha` to the rows and columns listed in `trunc_ids[idx]`, and solves
    `trunc_lap @ scores = trunc_init` with conjugate gradient.

    Args:
        idx: Row of `trunc_ids` selecting which neighbourhood to solve for.
        truncation_size: Unused by the function body; kept so existing callers
            that pass it (e.g. through `functools.partial`) keep working.

    Returns:
        The solution vector returned by `scipy.sparse.linalg.cg`. The solver's
        convergence flag is discarded.
    """
    ids = trunc_ids[idx]
    trunc_lap = lap_alpha[ids][:, ids]
    try:
        # SciPy >= 1.12 names the relative tolerance `rtol`; `tol` was removed in 1.14.
        scores, _ = linalg.cg(trunc_lap, trunc_init, rtol=1e-6, maxiter=50)
    except TypeError:
        # Older SciPy releases only know the `tol` keyword.
        scores, _ = linalg.cg(trunc_lap, trunc_init, tol=1e-6, maxiter=50)
    return scores


class Diffusion:
    """Performs diffusion over a kNN graph built from a set of embeddings.

    Args:
        embeddings: Query and gallery embeddings concatenated, one row per item.
        distance: Distance metric forwarded to `kNN`, one of {'cosine', 'euclidean'}.
        gamma: Exponent applied to the clipped neighbour weights in `affinity`.

    Reference:
        Efficient Image Retrieval via Decoupling Diffusion ...
        https://arxiv.org/pdf/1811.10907.pdf

    """
    def __init__(self, embeddings, distance='cosine', gamma=3):
        self.embeddings = embeddings
        self.N = len(embeddings)
        self.knn = kNN(self.embeddings, distance=distance)

        self.gamma = gamma

    def get_offline_results(self, truncation_size=1000, kd=50):
        """Compute diffusion results for every stored embedding.

        Side effect: overwrites the module-level `trunc_ids`, `trunc_init` and
        `lap_alpha`, which `get_single_score` reads.

        Args:
            truncation_size: Number of neighbours retrieved per embedding; also the
                size of each truncated linear system that is solved.
            kd: Number of nearest neighbours (leading columns of the search result)
                used to build the Laplacian.

        Returns:
            Sparse CSR matrix of shape (N, N), dtype float32. Row `i` holds the
            scores of item `i` at the column indices `trunc_ids[i]`.
        """
        global trunc_ids, trunc_init, lap_alpha

        logger.info("Searching neighbours")
        distances, ids = self.knn.search(self.embeddings, truncation_size)
        # We measure cosine similarity, but for diffusion the metric should increase for
        # dissimilar images, thus we invert it. Later it is clipped to be non-negative.
        # NOTE(review): `affinity` uses these values as edge weights; whether
        # `kNN.search` returns similarities or distances is not visible here — confirm.
        distances = 1 - distances
        trunc_ids = ids
        logger.info("Computing laplacian")
        lap_alpha = self.laplacian(distances[:, :kd], ids[:, :kd])
        # Right-hand side shared by every truncated system: one-hot on position 0
        trunc_init = np.zeros(truncation_size)
        trunc_init[0] = 1

        logger.info('[offline] 2) gallery-side diffusion')

        # Slow, improve later with multiprocessing
        new_func = functools.partial(get_single_score, truncation_size=truncation_size)
        results = [new_func(i) for i in tqdm(range(self.N))]
        all_scores = np.concatenate(results)

        logger.info('[offline] 3) merge offline results')
        rows = np.repeat(np.arange(self.N), truncation_size)
        offline = sparse.csr_matrix((all_scores, (rows, trunc_ids.reshape(-1))),
                                    shape=(self.N, self.N),
                                    dtype=np.float32)
        return offline

    def laplacian(self, distances, ids, alpha=0.99):
        """Compute the matrix `I - alpha * D^(-1/2) @ A @ D^(-1/2)`.

        `A` is the matrix returned by `affinity(distances, ids)` and `D` is the
        diagonal matrix of its row sums (plus 1e-12 to avoid division by zero).

        Args:
            distances: Per-item neighbour weights, forwarded to `affinity`.
            ids: Per-item neighbour indices, forwarded to `affinity`.
            alpha: Weight of the normalised affinity term.

        Returns:
            lap_alpha: Sparse matrix of shape (num, num), `num = affinity.shape[0]`.
        """
        affinity = self.affinity(distances, ids)

        num = affinity.shape[0]
        # Row sums of the affinity matrix; the epsilon keeps isolated rows finite
        degrees = affinity @ np.ones(num) + 1e-12
        # mat: degree matrix ^ (-1/2)
        mat = sparse.dia_matrix(
            (degrees ** (-0.5), [0]), shape=(num, num), dtype=np.float32)
        stochastic = mat @ affinity @ mat
        sparse_eye = sparse.dia_matrix(
            (np.ones(num), [0]), shape=(num, num), dtype=np.float32)
        lap_alpha = sparse_eye - alpha * stochastic
        return lap_alpha

    def affinity(self, distances, ids, gamma=3):
        """Create the affinity matrix of the mutual kNN graph.

        An entry (i, j) is set only when j is among i's neighbours and i is among
        j's neighbours. Column 0 of every row is always excluded.

        Args:
            distances: Array with one row of neighbour weights per item. It is not
                modified; negative values are clipped to zero on a copy.
            ids: Array of the same layout holding the neighbours' indices.
            gamma: Ignored; the exponent actually applied is `self.gamma`. Kept for
                backward compatibility of the signature.

        Returns:
            affinity: Sparse CSC matrix of shape (num, num), dtype float32, where
                `num = distances.shape[0]`.
        """
        num = distances.shape[0]

        # Weights should be non-negative. Clip on a fresh array so that the caller's
        # `distances` is left untouched, then sharpen with the instance exponent.
        weights = np.maximum(distances, 0) ** self.gamma
        # vec_ids: feature vectors' ids
        # mut_ids: mutual (reciprocal) nearest neighbors' ids
        # mut_sims: weights between feature vectors and their mutual nearest neighbors
        vec_ids, mut_ids, mut_sims = [], [], []
        for i in range(num):
            # check reciprocity: i is in j's kNN and j is in i's kNN when i != j
            ismutual = np.isin(ids[ids[i]], i).any(axis=1)
            # NOTE(review): assumes the first neighbour of every item is the item itself — confirm
            ismutual[0] = False
            if ismutual.any():
                vec_ids.append(i * np.ones(ismutual.sum(), dtype=int))
                mut_ids.append(ids[i, ismutual])
                mut_sims.append(weights[i, ismutual])
        if not vec_ids:
            # No reciprocal pair at all: `np.concatenate` would raise on empty lists,
            # so return an all-zero affinity matrix instead.
            return sparse.csc_matrix((num, num), dtype=np.float32)
        vec_ids, mut_ids, mut_sims = map(np.concatenate, [vec_ids, mut_ids, mut_sims])
        affinity = sparse.csc_matrix((mut_sims, (vec_ids, mut_ids)),
                                     shape=(num, num), dtype=np.float32)
        return affinity

In [224]:
# Diffusion hyper-parameters.
# Inputs expected from earlier cells: query_embeddings, gallery_embeddings = ...
#   kd    - how many nearest neighbours to search for each image
#           (rename -> max_neighbours?)
#   kq    - not referenced by the cells shown below
#   gamma - exponent for the neighbour weights
kd, kq, gamma = 50, 10, 3
# 5000? authors say it improves results for bigger datasets
truncation_size = 1000

In [228]:

# Search
# Number of query rows; they occupy the first rows of the stacked matrix below
len_query = query_embeddings.shape[0]


# Diffuse over queries and gallery together (queries first, then gallery).
# `gamma` comes from the hyper-parameter cell instead of repeating the literal here.
diffusion = Diffusion(
    embeddings = np.vstack([query_embeddings, gallery_embeddings]), 
    distance='cosine',
    gamma=gamma)

offline = diffusion.get_offline_results(truncation_size, kd=kd)

# Alternative kept for reference: diffusion over the gallery only
# diffusion = Diffusion(
#     embeddings = gallery_embeddings, 
#     distance='cosine',
#     gamma=3)

# offline = diffusion.get_offline_results(truncation_size, kd=kd)

100%|██████████| 2/2 [00:00<00:00, 43.04it/s]
2020-09-22 20:28:55.352 | INFO     | __main__:get_offline_results:62 - Searching neighbours
2020-09-22 20:29:01.376 | INFO     | __main__:get_offline_results:65 - Computing laplacian
2020-09-22 20:29:02.074 | INFO     | __main__:get_offline_results:70 - [offline] 2) gallery-side diffusion
100%|██████████| 16656/16656 [00:34<00:00, 477.56it/s]
2020-09-22 20:29:37.043 | INFO     | __main__:get_offline_results:76 - [offline] 3) merge offline results
