# Distance Metric Learning

## Utilities and Imports

In [1]:
from platform import python_version
print('Python Version: {}'.format( python_version() ) )

# Utils
from tqdm import tqdm_notebook # Progress bar

import numpy as np
import matplotlib.pyplot as plt

import json
from   scipy.io import loadmat

# sklearn
from sklearn.decomposition import PCA
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.neighbors     import NearestNeighbors


Python Version: 3.6.7


In [2]:
# Constants
RANDOM_STATE = 42

## Data Manipulation

### Load Data

In [6]:
with open( "PR_data/feature_data.json", "r" ) as file:
    features = json.load( file )
    
data = np.asarray( features )

print( 'Data shape: {}'.format( data.shape ) )

# Load matfile
mat = loadmat( 'PR_data/cuhk03_new_protocol_config_labeled.mat' )

# Load labels
labels = mat[ 'labels' ].flatten()

# Load camId
camIds = mat[ 'camId' ].flatten()

# Load indexes
train_idxs   = mat[ 'train_idx' ].flatten()
query_idxs    = mat[ 'query_idx' ].flatten()
gallery_idxs = mat[ 'gallery_idx' ].flatten()

# Load training indexes
print( "Loading Training indexes : {}".format( train_idxs.shape ) )
print( "Loading Query indexes : {}".format( query_idxs.shape ) )
print( "Loading Gallery indexes : {}".format( gallery_idxs.shape ) )

Data shape: (14096, 2048)
Loading Training indexes : (7368,)
Loading Query indexes : (1400,)
Loading Gallery indexes : (5328,)


### Create Train/Query/Gallery Sets

In [94]:
# Create Train Set
train_set   = []
train_label = []

for i in train_idxs:
    train_set.append( data[ i - 1 ] )
    train_label.append( labels[ i - 1 ] )
    
train_set   = np.asarray( train_set )
train_label = np.asarray( train_label )

print( 'Train Set : {}'.format( train_set.shape ) )
print( 'Train Label : {}'.format( train_label.shape ) )


# Create Query Set
query_set   = []
query_label = []
query_camId = []

for i in query_idxs:
    query_set.append( data[ i - 1] )
    query_label.append( labels[ i - 1 ] )
    query_camId.append( camIds[ i - 1 ] )
    
query_set   = np.asarray( query_set )
query_label = np.asarray( query_label )
query_camId = np.asarray( query_camId )

print( '\nQuery Set : {}'.format( query_set.shape ) )
print( 'Query Label : {}'.format( query_label.shape ) )
print( 'Query CamId : {}'.format( query_camId.shape ) )


# Create Gallery Set
gallery_set   = []
gallery_label = []
gallery_camId = []

for i in gallery_idxs:
    gallery_set.append( data[ i - 1] )
    gallery_label.append( labels[ i - 1 ] )
    gallery_camId.append( camIds[ i - 1 ] )
    
gallery_set   = np.asarray( gallery_set )
gallery_label = np.asarray( gallery_label )
gallery_camId = np.asarray( gallery_camId )

print( '\nGallery Set : {}'.format( gallery_set.shape ) )
print( 'Gallery Label : {}'.format( gallery_label.shape ) )
print( 'Gallery CamId : {}'.format( gallery_camId.shape ) )

Train Set : (7368, 2048)
Train Label : (7368,)

Query Set : (1400, 2048)
Query Label : (1400,)
Query CamId : (1400,)

Gallery Set : (5328, 2048)
Gallery Label : (5328,)
Gallery CamId : (5328,)


### Validation

* Pick 100 random identites from the training set
* Remove all data with those 100 identities from training set and put in validation set

There exists an idea in Computer Vision that you use validation set only to specify the number of iterations that is optimal for your design and then you include your validation set into train set and perform final training (without validation set) for this fixed amount of iterations. 

In [134]:
from sklearn.utils import shuffle

train_unique_labels = np.unique( train_label )

# Select 100 Random Identities
shuffled_validation_labels = shuffle( train_unique_labels, random_state = RANDOM_STATE )[ : 100 ] 
print( 'Selecting *{}* Random Identities For Validation Set'.format( shuffled_validation_labels.shape[ 0 ] ) )

train_validate = np.zeros( ( 1, 2049 ) )
train_set_validate_removed = np.vstack( ( train_set.T, train_label ) ).T

for identity in tqdm_notebook( shuffled_validation_labels ):
    
    # Go through data and remove rows with that identity
    validation = train_set_validate_removed[ np.where( train_set_validate_removed[ :, -1 ] == identity ) ]
    
    train_validate = np.vstack( ( train_validate, validation ) )
    
    train_set_validate_removed = train_set_validate_removed[ np.where( train_set_validate_removed[ :, -1 ] != identity )]
    
print( 'Training Set with Validation removed: {}'.format( train_set_validate_removed.shape ) )
print( 'Validation Set: {} ( Has an extra row due to np.zeros )'.format( train_validate.shape ) )

Selecting *100* Random Identities For Validation Set


HBox(children=(IntProgress(value=0), HTML(value='')))

Training Set with Validation removed: (6399, 2049)
Validation Set: (970, 2049) ( Has an extra row due to np.zeros )


In [135]:
cv_train_set   = train_set_validate_removed.T[ : -1 ].T
cv_train_label = train_set_validate_removed.T[ -1 : ].T

cv_validation_set = train_validate.T[ : -1 ].T[ 1: ]
cv_validation_label = train_validate.T[ -1 : ].T[ 1 : ]

print( 'CV Train Set: {} \nTrain Labels: {}'.format( cv_train_set.shape, cv_train_label.shape ) )
print( 'CV Validation Set: {} \nValidation Labels: {}'.format( cv_validation_set.shape, cv_validation_label.shape ) )

CV Train Set: (6399, 2048) 
Train Labels: (6399, 1)
CV Validation Set: (969, 2048) 
Validation Labels: (969, 1)


## Mahalanobis

### Covariance Matrix

In [204]:
S = np.dot( train_set, train_set.T )

S.shape

(7368, 7368)

In [205]:
w, v = np.linalg.eig( S )

In [208]:
G = np.dot( np.diag( np.sqrt( w ) ), v.T )

In [209]:
G.shape

(7368, 7368)

In [219]:
S2 = np.linalg.inv( np.dot( train_set.T, train_set ) )

w2, v2 = np.linalg.eig( S2 )

In [220]:
G2 = np.dot( np.diag( np.sqrt( w2 ) ), v2.T )

In [221]:
qs = np.dot( G2.T, query_set.T )
gs = np.dot( G2.T, gallery_set.T )

print( qs.shape )
print( gs.shape )

(2048, 1400)
(2048, 5328)


In [224]:
# Query Augmented


query_augmented = np.vstack( ( qs, query_camId, query_label ) )
query_augmented = query_augmented.T

# Gallery Augmented


gallery_augmented = np.vstack( ( gs, gallery_camId, gallery_label ) )
gallery_augmented = gallery_augmented.T

print( 'Query Augmented: {}'.format( query_augmented.shape ) )
print( 'Gallery Augmented: {}'.format( gallery_augmented.shape ) )

Query Augmented: (1400, 2050)
Gallery Augmented: (5328, 2050)


In [225]:
knn_n_neighbors = 20
knn_metric = 'euclidean'

KNN = NearestNeighbors( n_neighbors = knn_n_neighbors, metric = knn_metric )
KNN.fit( gallery_augmented[ :, : -2 ], gallery_augmented[ :, -1 : ] )

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=20, p=2, radius=1.0)

In [226]:
query_rank_list = []

# for i in range( 2,3 ):
for i in tqdm_notebook( range( query_augmented.shape[ 0 ] ) ):

    
    query_label = query_augmented[ i, -1 ].astype( int )

    # Test query point
    X_test = query_augmented[ i ][ : -2 ].reshape( 1, -1 ) # Remove last 2 columns ( camId and label )
    
    distances, indices = KNN.kneighbors( X_test ) # Neighbours are ordered closest to furthest
    
    # Compare
    distances = distances.flatten()
    indices   = indices.flatten()
    
    removed_indices = []
    
    # Remove indices with same camId and Row
    for ind in indices:
        if( ~( gallery_augmented[ ind, -1 ] == query_label and 
           gallery_augmented[ ind, -2 ] == query_augmented[ i, -2 ].astype( int ) ) ):
            
            removed_indices.append( ind )
    
    removed_indices = np.asarray( removed_indices )
            
    
    rank_list = [ gallery_augmented[ ind, -1 ].astype( int ) == query_label for ind in removed_indices[ : 10 ] ]
    query_rank_list.append( rank_list )
    
query_rank_list = np.asarray( query_rank_list )

HBox(children=(IntProgress(value=0, max=1400), HTML(value='')))

In [227]:
rankAt1  = query_rank_list.T[ 0 ]
rankAt5  = query_rank_list.T[ : 5 ].T
rankAt10 = query_rank_list.T[ : 10 ].T

cmc1  = rankAt1
cmc5  = np.sum( rankAt5, axis = 1 ) > 0 
cmc10 = np.sum( rankAt10, axis = 1 ) > 0

print( 'rank@1: {}%'.format( np.sum( cmc1 ) / cmc1.shape[ 0 ] * 100 ) )
print( 'rank@5: {}%'.format( np.sum( cmc5 ) / cmc5.shape[ 0 ] * 100 ) )
print( 'rank@10: {}%'.format( np.sum( cmc10 ) / cmc10.shape[ 0 ] * 100 ) )

rank@1: 46.57142857142857%
rank@5: 65.64285714285715%
rank@10: 74.35714285714286%


___

In [18]:
from sklearn.decomposition import PCA

PCA = PCA( n_components = 110, whiten = True )
PCA.fit( train_set )

PCA(copy=True, iterated_power='auto', n_components=110, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)

In [19]:
pca_train_set = PCA.transform( train_set )
pca_gallery_set = PCA.transform( gallery_set )
pca_query_set = PCA.transform( query_set )

In [421]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit( pca_train_set, train_label )

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [422]:
lda_train_set = lda.transform( pca_train_set )
lda_gallery_set = lda.transform( pca_gallery_set )
lda_query_set = lda.transform( pca_query_set )

____

## LMNN

In [424]:
from metric_learn import LMNN

lmnn = LMNN( k = 5, min_iter = 20, max_iter = 200, learn_rate = 1e-6, verbose = True  )
lmnn.fit( lda_train_set, train_label )



2 2680791.7849133373 -327865.8640466016 32 1.0099999999999999e-06
3 2373417.7525565587 -307374.0323567786 22 1.0200999999999998e-06
4 2082377.8480223005 -291039.9045342582 15 1.0303009999999997e-06
5 1808155.1803297447 -274222.6676925558 12 1.0406040099999998e-06
6 1551203.988358576 -256951.19197116862 12 1.0510100500999999e-06
7 1312076.9391843034 -239127.0491742727 11 1.061520150601e-06
8 1091364.1519528725 -220712.78723143088 12 1.07213535210701e-06
9 889664.4418852046 -201699.71006766788 11 1.08285670562808e-06
10 707596.0949832664 -182068.3469019382 11 1.0936852726843608e-06
11 545787.6503094791 -161808.44467378734 10 1.1046221254112045e-06
12 404885.95367982704 -140901.69662965205 10 1.1156683466653166e-06
13 285551.8507404328 -119334.10293939424 10 1.1268250301319698e-06
14 188463.36441767536 -97088.48632275744 8 1.1380932804332895e-06
15 114314.39407013908 -74148.97034753628 10 1.1494742132376223e-06
16 63812.82782729218 -50501.5662428469 10 1.1609689553699985e-06
17 37689.1605

python_LMNN(convergence_tol=0.001, k=5, learn_rate=1e-06, max_iter=200,
      min_iter=20, regularization=0.5, use_pca=True, verbose=True)

In [10]:
def square_euclidean( x1, x2 ):
  
    diff = np.subtract( x1, x2 )
    return np.sum( np.dot( diff.T, diff ) )

In [426]:
# Query Augmented
qs = lmnn.transform( lda_query_set ).T

query_augmented = np.vstack( ( qs, query_camId, query_label ) )
query_augmented = query_augmented.T

# Gallery Augmented
gs = lmnn.transform( lda_gallery_set ).T

gallery_augmented = np.vstack( ( gs, gallery_camId, gallery_label ) )
gallery_augmented = gallery_augmented.T

print( 'Query Augmented: {}'.format( query_augmented.shape ) )
print( 'Gallery Augmented: {}'.format( gallery_augmented.shape ) )

Query Augmented: (1400, 112)
Gallery Augmented: (5328, 112)


In [427]:
knn_n_neighbors = 20
knn_metric = 'euclidean'

KNN = NearestNeighbors( algorithm = 'ball_tree',
                       n_neighbors = knn_n_neighbors,
                       metric = square_euclidean )
                       
KNN.fit( gallery_augmented[ :, : -2 ], gallery_augmented[ :, -1 : ] )

NearestNeighbors(algorithm='ball_tree', leaf_size=30,
         metric=<function square_euclidean at 0x7f9176b897b8>,
         metric_params=None, n_jobs=None, n_neighbors=20, p=2, radius=1.0)

In [428]:
query_rank_list = []

# for i in range( 2,3 ):
for i in tqdm_notebook( range( query_augmented.shape[ 0 ] ) ):

    
    query_label = query_augmented[ i, -1 ].astype( int )

    # Test query point
    X_test = query_augmented[ i ][ : -2 ].reshape( 1, -1 ) # Remove last 2 columns ( camId and label )
    
    distances, indices = KNN.kneighbors( X_test ) # Neighbours are ordered closest to furthest
    
    # Compare
    distances = distances.flatten()
    indices   = indices.flatten()
    
    removed_indices = []
    
    # Remove indices with same camId and Row
    for ind in indices:
        if( ~( gallery_augmented[ ind, -1 ] == query_label and 
           gallery_augmented[ ind, -2 ] == query_augmented[ i, -2 ].astype( int ) ) ):
            
            removed_indices.append( ind )
    
    removed_indices = np.asarray( removed_indices )
            
    
    rank_list = [ gallery_augmented[ ind, -1 ].astype( int ) == query_label for ind in removed_indices[ : 10 ] ]
    query_rank_list.append( rank_list )
    
query_rank_list = np.asarray( query_rank_list )

HBox(children=(IntProgress(value=0, max=1400), HTML(value='')))

In [429]:
rankAt1  = query_rank_list.T[ 0 ]
rankAt5  = query_rank_list.T[ : 5 ].T
rankAt10 = query_rank_list.T[ : 10 ].T

cmc1  = rankAt1
cmc5  = np.sum( rankAt5, axis = 1 ) > 0 
cmc10 = np.sum( rankAt10, axis = 1 ) > 0

print( 'rank@1: {}%'.format( np.sum( cmc1 ) / cmc1.shape[ 0 ] * 100 ) )
print( 'rank@5: {}%'.format( np.sum( cmc5 ) / cmc5.shape[ 0 ] * 100 ) )
print( 'rank@10: {}%'.format( np.sum( cmc10 ) / cmc10.shape[ 0 ] * 100 ) )

rank@1: 39.0%
rank@5: 58.857142857142854%
rank@10: 66.64285714285715%


TRY:

* Kernel PCA with non linear kernels ( RBF? )
* Stacked with LMNN ( since we now in higher dimension )

In [91]:
from metric_learn import NCA

nca = NCA( max_iter = 100, verbose = True )
nca.fit( train_set, train_label )

[NCA]
[NCA]  Iteration      Objective Value    Time(s)
[NCA] ------------------------------------------
[NCA]          0         7.366959e+03      20.74
[NCA]          1         7.367001e+03      20.57
[NCA]          2         7.367999e+03      21.05
[NCA]          3         7.368000e+03      20.63
[NCA]          4         7.368000e+03      21.33
[NCA] Training took   108.03s.


NCA(learning_rate='deprecated', max_iter=100, num_dims=None, tol=None,
  verbose=True)

In [95]:
nca_train_set = nca.transform( train_set )
nca_gallery_set = nca.transform( gallery_set )
nca_query_set = nca.transform( query_set )

In [96]:
# Query Augmented
qs = nca_query_set.T

query_augmented = np.vstack( ( qs, query_camId, query_label ) )
query_augmented = query_augmented.T

# Gallery Augmented
gs = nca_gallery_set.T

gallery_augmented = np.vstack( ( gs, gallery_camId, gallery_label ) )
gallery_augmented = gallery_augmented.T

print( 'Query Augmented: {}'.format( query_augmented.shape ) )
print( 'Gallery Augmented: {}'.format( gallery_augmented.shape ) )

Query Augmented: (1400, 2050)
Gallery Augmented: (5328, 2050)


In [97]:
knn_n_neighbors = 20
knn_metric = 'euclidean'

KNN = NearestNeighbors( algorithm = 'ball_tree',
                       n_neighbors = knn_n_neighbors,
                       metric = knn_metric )
                       
KNN.fit( gallery_augmented[ :, : -2 ], gallery_augmented[ :, -1 : ] )

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=20, p=2, radius=1.0)

In [98]:
query_rank_list = []

# for i in range( 2,3 ):
for i in tqdm_notebook( range( query_augmented.shape[ 0 ] ) ):

    
    query_label = query_augmented[ i, -1 ].astype( int )

    # Test query point
    X_test = query_augmented[ i ][ : -2 ].reshape( 1, -1 ) # Remove last 2 columns ( camId and label )
    
    distances, indices = KNN.kneighbors( X_test ) # Neighbours are ordered closest to furthest
    
    # Compare
    distances = distances.flatten()
    indices   = indices.flatten()
    
    removed_indices = []
    
    # Remove indices with same camId and Row
    for ind in indices:
        if( ~( gallery_augmented[ ind, -1 ] == query_label and 
           gallery_augmented[ ind, -2 ] == query_augmented[ i, -2 ].astype( int ) ) ):
            
            removed_indices.append( ind )
    
    removed_indices = np.asarray( removed_indices )
            
    
    rank_list = [ gallery_augmented[ ind, -1 ].astype( int ) == query_label for ind in removed_indices[ : 10 ] ]
    query_rank_list.append( rank_list )
    
query_rank_list = np.asarray( query_rank_list )

HBox(children=(IntProgress(value=0, max=1400), HTML(value='')))




In [99]:
rankAt1  = query_rank_list.T[ 0 ]
rankAt5  = query_rank_list.T[ : 5 ].T
rankAt10 = query_rank_list.T[ : 10 ].T

cmc1  = rankAt1
cmc5  = np.sum( rankAt5, axis = 1 ) > 0 
cmc10 = np.sum( rankAt10, axis = 1 ) > 0

print( 'rank@1: {}%'.format( np.sum( cmc1 ) / cmc1.shape[ 0 ] * 100 ) )
print( 'rank@5: {}%'.format( np.sum( cmc5 ) / cmc5.shape[ 0 ] * 100 ) )
print( 'rank@10: {}%'.format( np.sum( cmc10 ) / cmc10.shape[ 0 ] * 100 ) )

rank@1: 44.642857142857146%
rank@5: 65.92857142857143%
rank@10: 73.85714285714286%


___

## Kernel PCA (Cosine)

In [76]:
from sklearn.decomposition import KernelPCA

KPCA = KernelPCA( n_components = 1000, kernel = 'cosine' )
KPCA.fit( train_set )

KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='cosine',
     kernel_params=None, max_iter=None, n_components=1000, n_jobs=None,
     random_state=None, remove_zero_eig=False, tol=0)

In [77]:
kpca_train_set = KPCA.transform( train_set )
kpca_gallery_set = KPCA.transform( gallery_set )
kpca_query_set = KPCA.transform( query_set )



In [78]:
# Query Augmented
qs = kpca_query_set.T

query_augmented = np.vstack( ( qs, query_camId, query_label ) )
query_augmented = query_augmented.T

# Gallery Augmented
gs = kpca_gallery_set.T

gallery_augmented = np.vstack( ( gs, gallery_camId, gallery_label ) )
gallery_augmented = gallery_augmented.T

print( 'Query Augmented: {}'.format( query_augmented.shape ) )
print( 'Gallery Augmented: {}'.format( gallery_augmented.shape ) )

Query Augmented: (1400, 1002)
Gallery Augmented: (5328, 1002)


In [79]:
knn_n_neighbors = 20
knn_metric = 'euclidean'

KNN = NearestNeighbors( algorithm = 'ball_tree',
                       n_neighbors = knn_n_neighbors,
                       metric = knn_metric )
                       
KNN.fit( gallery_augmented[ :, : -2 ], gallery_augmented[ :, -1 : ] )

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=20, p=2, radius=1.0)

In [80]:
query_rank_list = []

# for i in range( 2,3 ):
for i in tqdm_notebook( range( query_augmented.shape[ 0 ] ) ):

    
    query_label = query_augmented[ i, -1 ].astype( int )

    # Test query point
    X_test = query_augmented[ i ][ : -2 ].reshape( 1, -1 ) # Remove last 2 columns ( camId and label )
    
    distances, indices = KNN.kneighbors( X_test ) # Neighbours are ordered closest to furthest
    
    # Compare
    distances = distances.flatten()
    indices   = indices.flatten()
    
    removed_indices = []
    
    # Remove indices with same camId and Row
    for ind in indices:
        if( ~( gallery_augmented[ ind, -1 ] == query_label and 
           gallery_augmented[ ind, -2 ] == query_augmented[ i, -2 ].astype( int ) ) ):
            
            removed_indices.append( ind )
    
    removed_indices = np.asarray( removed_indices )
            
    
    rank_list = [ gallery_augmented[ ind, -1 ].astype( int ) == query_label for ind in removed_indices[ : 10 ] ]
    query_rank_list.append( rank_list )
    
query_rank_list = np.asarray( query_rank_list )

HBox(children=(IntProgress(value=0, max=1400), HTML(value='')))




In [81]:
rankAt1  = query_rank_list.T[ 0 ]
rankAt5  = query_rank_list.T[ : 5 ].T
rankAt10 = query_rank_list.T[ : 10 ].T

cmc1  = rankAt1
cmc5  = np.sum( rankAt5, axis = 1 ) > 0 
cmc10 = np.sum( rankAt10, axis = 1 ) > 0

print( 'rank@1: {}%'.format( np.sum( cmc1 ) / cmc1.shape[ 0 ] * 100 ) )
print( 'rank@5: {}%'.format( np.sum( cmc5 ) / cmc5.shape[ 0 ] * 100 ) )
print( 'rank@10: {}%'.format( np.sum( cmc10 ) / cmc10.shape[ 0 ] * 100 ) )

rank@1: 47.35714285714286%
rank@5: 67.14285714285714%
rank@10: 75.07142857142857%


In [82]:
from metric_learn import NCA

nca = NCA( max_iter = 100, verbose = True )
nca.fit( kpca_train_set, train_label )

[NCA]
[NCA]  Iteration      Objective Value    Time(s)
[NCA] ------------------------------------------
[NCA]          0         7.364199e+03      11.53
[NCA]          1         7.366398e+03      11.41
[NCA]          2         7.367468e+03      11.36
[NCA]          3         7.367879e+03      11.37
[NCA]          4         7.367897e+03      11.38
[NCA]          5         7.367952e+03      11.37
[NCA]          6         7.367973e+03      11.37
[NCA]          7         7.367987e+03      11.31
[NCA]          8         7.367993e+03      11.33
[NCA]          9         7.367997e+03      11.27
[NCA]         10         7.367998e+03      11.34
[NCA]         11         7.367999e+03      11.32
[NCA]         12         7.368000e+03      11.26
[NCA]         13         7.368000e+03      11.30
[NCA]         14         7.368000e+03      11.31
[NCA]         15         7.368000e+03      11.37
[NCA] Training took   185.01s.


NCA(learning_rate='deprecated', max_iter=100, num_dims=None, tol=None,
  verbose=True)

In [83]:
nca_train_set = nca.transform( kpca_train_set )
nca_gallery_set = nca.transform( kpca_gallery_set )
nca_query_set = nca.transform( kpca_query_set )

In [85]:
# Query Augmented
qs = nca_query_set.T

query_augmented = np.vstack( ( qs, query_camId, query_label ) )
query_augmented = query_augmented.T

# Gallery Augmented
gs = nca_gallery_set.T

gallery_augmented = np.vstack( ( gs, gallery_camId, gallery_label ) )
gallery_augmented = gallery_augmented.T

print( 'Query Augmented: {}'.format( query_augmented.shape ) )
print( 'Gallery Augmented: {}'.format( gallery_augmented.shape ) )

Query Augmented: (1400, 1002)
Gallery Augmented: (5328, 1002)


In [86]:
knn_n_neighbors = 20
knn_metric = 'euclidean'

KNN = NearestNeighbors( algorithm = 'ball_tree',
                       n_neighbors = knn_n_neighbors,
                       metric = knn_metric )
                       
KNN.fit( gallery_augmented[ :, : -2 ], gallery_augmented[ :, -1 : ] )

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=20, p=2, radius=1.0)

In [87]:
query_rank_list = []

# for i in range( 2,3 ):
for i in tqdm_notebook( range( query_augmented.shape[ 0 ] ) ):

    
    query_label = query_augmented[ i, -1 ].astype( int )

    # Test query point
    X_test = query_augmented[ i ][ : -2 ].reshape( 1, -1 ) # Remove last 2 columns ( camId and label )
    
    distances, indices = KNN.kneighbors( X_test ) # Neighbours are ordered closest to furthest
    
    # Compare
    distances = distances.flatten()
    indices   = indices.flatten()
    
    removed_indices = []
    
    # Remove indices with same camId and Row
    for ind in indices:
        if( ~( gallery_augmented[ ind, -1 ] == query_label and 
           gallery_augmented[ ind, -2 ] == query_augmented[ i, -2 ].astype( int ) ) ):
            
            removed_indices.append( ind )
    
    removed_indices = np.asarray( removed_indices )
            
    
    rank_list = [ gallery_augmented[ ind, -1 ].astype( int ) == query_label for ind in removed_indices[ : 10 ] ]
    query_rank_list.append( rank_list )
    
query_rank_list = np.asarray( query_rank_list )

HBox(children=(IntProgress(value=0, max=1400), HTML(value='')))




In [88]:
rankAt1  = query_rank_list.T[ 0 ]
rankAt5  = query_rank_list.T[ : 5 ].T
rankAt10 = query_rank_list.T[ : 10 ].T

cmc1  = rankAt1
cmc5  = np.sum( rankAt5, axis = 1 ) > 0 
cmc10 = np.sum( rankAt10, axis = 1 ) > 0

print( 'rank@1: {}%'.format( np.sum( cmc1 ) / cmc1.shape[ 0 ] * 100 ) )
print( 'rank@5: {}%'.format( np.sum( cmc5 ) / cmc5.shape[ 0 ] * 100 ) )
print( 'rank@10: {}%'.format( np.sum( cmc10 ) / cmc10.shape[ 0 ] * 100 ) )

rank@1: 43.642857142857146%
rank@5: 63.714285714285715%
rank@10: 71.5%


| Method | rank@1 | rank@5 | rank@10 |
| --- | --- | --- | --- |
Baseline | 47.0% | 66.85714285714286% | 74.92857142857143%
KPCA cosine ( 110 components )| 47.42857142857143% | 67.0% | 74.85714285714286%
KPCA cosine 400 | 47.785714285714285% | 67.14285714285714% | 75.0%
KPCA cosine 1000 | 47.35714285714286% | 67.14285714285714% | 75.07142857142857%
NCA | 44.642857142857146% | 65.92857142857143% | 73.85714285714286%
KPCA cosine ( 100 ) + NCA | 45.42857142857143 | 66.21428571428571 | 75.35714285714286
kpca cosine 400 + NCA | 46.42857142857143% | 67.28571428571428% | 74.57142857142857%
kpca cosine 1000 + NCA | 43.642857142857146% | 63.714285714285715% | 71.5%