In [1]:
import pandas as pd
import numpy as np
from mnist import MNIST
from tqdm import tqdm
from scipy.sparse.linalg import eigsh
from pretrainedModel import pretrainedModel
from tensorflow import keras
from PIL import Image
from sklearn.preprocessing import StandardScaler

In [2]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

In [3]:
scaler = StandardScaler()
x_test = scaler.fit_transform(x_test.reshape(x_test.shape[-0], -1)).reshape(x_test.shape)

In [4]:
model = pretrainedModel()
images =  [Image.fromarray(image).convert('RGB') for image in x_test]

In [5]:
image_features = [list(model(image)) for image in tqdm(images)]

  3%|▎         | 295/10000 [00:23<12:53, 12.55it/s]


KeyboardInterrupt: 

In [10]:
def cos_sim(a,b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [11]:
sim_mat = np.zeros([10000,10000])
for i in tqdm(range(10000)):
    for j in range(i):
        sim_mat[i, j] = cos_sim(image_features[i], image_features[j])

  0%|          | 35/10000 [00:24<3:31:53,  1.28s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x13050f740>>
Traceback (most recent call last):
  File "/Users/kristiansjorslevnielsen/Library/Python/3.12/lib/python/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
  0%|          | 43/10000 [00:36<4:25:31,  1.60s/it]

In [11]:
sim_mat = sim_mat + sim_mat.T

In [8]:
def test_tal(sim_mat, bits, threshold):    
    dim = sim_mat.shape[0]
    #threshold = np.min(np.max(sim_mat, axis=1))

    diag = np.sum((sim_mat > threshold), axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [9]:
def test_tbl(sim_mat, bits, threshold):    
    dim = sim_mat.shape[0]
    #threshold = np.min(np.max(sim_mat, axis=1))

    sim_mat = np.where(sim_mat>=threshold,1,0)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = sim_mat[i].sum()
    L = D- sim_mat

    eigenvalues, eigenvectors_bin = eigsh(L, k=bits, which="SM")
    return eigenvectors_bin

In [10]:
def test_tbl_summed_diag(sim_mat, bits):    
    dim = sim_mat.shape[0]

    diag = np.sum(sim_mat, axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [13]:
hash_codes = test_tbl(sim_mat, 32, threshold = np.min(np.max(sim_mat, axis=1)))

In [141]:
def map(eigenvectors_bin, labels, k):
    labels_dic = {i:0 for i in range(10)}
    labels_size_dic = {i:0 for i in range(10)}
    for i, eigenvector in enumerate(eigenvectors_bin):
        tp = 0
        fp = 0
        for j, eigenvector2 in enumerate(eigenvectors_bin):
            if np.abs(eigenvector - eigenvector2).sum() <= k:
                if labels[i] == labels[j]:
                    tp += 1
                else:
                    fp += 1
        labels_dic[labels[i]] += tp/(tp+fp)
        labels_size_dic[labels[i]] += 1
    ap_dic = {i:labels_dic[i]/labels_size_dic[i] for i in range(10)}
    map_value = np.array(list(ap_dic.values())).mean()
    return map_value

In [153]:
min_threshold = int(np.round(np.min(np.max(sim_mat, axis=1))*100))
min_threshold

58

In [158]:
results_df = pd.DataFrame(columns=["algorithm", "threshold", "hamming_distance", "bits", "map"])
c = 0
for bits in [32, 64, 128]:
    for k in range(1,4):
        for threshold in tqdm(range(min_threshold,95,2)):
            hashes_tbl = test_tbl(sim_mat,bits,threshold/100)
            hashes_tal = test_tal(sim_mat,bits,threshold/100)   
            map_tbl = map(hashes_tbl, labels, k)
            map_tal = map(hashes_tal, labels, k)
            results_df.loc[c] = ["tbl", threshold/100, k, bits, map_tbl]
            results_df.loc[c] = ["tal", threshold/100, k, bits, map_tal]
            c +=1
        hashes_tbl_sd = test_tbl(sim_mat,bits,threshold/100)
        map_tbl_sd = map(hashes_tbl, labels, k)
        results_df.loc[c] = ["tbl_sd", pd.NA, k, bits, map_tbl_sd]
        c+=1
        results_df.to_csv("spectral_hashing_test_results.csv", index=False)
    

  0%|          | 0/19 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
def spectral_tal(images, bits):
    dim = images.shape[0]
    sim_mat = np.zeros([dim,dim])
    for i in tqdm(range(dim)):
        for j in range(i):
            sim_mat[i, j] = cos_sim(images[i], images[j])
    sim_mat = sim_mat + sim_mat.T
    
    threshold = np.min(np.max(sim_mat, axis=1))

    diag = np.sum((sim_mat > threshold), axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [None]:
def spectral_tbl(images, bits):
    dim = images.shape[0]
    sim_mat = np.zeros([dim,dim])
    for i in tqdm(range(dim)):
        for j in range(i):
            sim_mat[i, j] = cos_sim(images[i], images[j])
    sim_mat = sim_mat + sim_mat.T
    
    threshold = np.min(np.max(sim_mat, axis=1))

    sim_mat = np.where(sim_mat>=threshold,1,0)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = sim_mat[i].sum()
    L = D- sim_mat

    eigenvalues, eigenvectors_bin = eigsh(L, k=bits, which="SM")
    return eigenvectors_bin

In [None]:
def spectral_tbl_summed_diag(images, bits):
    dim = images.shape[0]
    sim_mat = np.zeros([dim,dim])
    for i in tqdm(range(dim)):
        for j in range(i):
            sim_mat[i, j] = cos_sim(images[i], images[j])
    sim_mat = sim_mat + sim_mat.T
    
    threshold = np.min(np.max(sim_mat, axis=1))

    diag = np.sum(sim_mat, axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [116]:
threshold = np.min(np.max(sim_mat, axis=1))

In [117]:
diag = np.sum((sim_mat > threshold), axis=1)
D = np.zeros([10000,10000])
for i in range(10000):
    D[i,i] = diag[i]

L = D- sim_mat

In [118]:
diag2 = np.sum(sim_mat, axis=1)
D2 = np.zeros([10000,10000])
for i in range(10000):
    D2[i,i] = diag2[i]

L2 = D2- sim_mat

In [119]:
eigenvalues, eigenvectors = eigsh(L, k=32, which="SM")

In [120]:
eigenvalues2, eigenvectors2 = eigsh(L2, k=32, which="SM")

In [121]:
threshold1 = np.median(eigenvectors)
threshold2 = np.median(eigenvectors2)

In [122]:
eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
eigenvectors_bin2 = np.where(eigenvectors2 > threshold2, 1, 0)

In [123]:
eigenvectors_bin

array([[1, 0, 1, ..., 1, 0, 1],
       [0, 1, 1, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 1, ..., 0, 1, 0]])

In [124]:
print(len(np.unique(eigenvectors_bin,axis = 0)))
print(len(np.unique(eigenvectors_bin2,axis = 0)))

6911
3211


In [129]:
def map(eigenvectors_bin, labels, k):
    labels_dic = {i:0 for i in range(10)}
    labels_size_dic = {i:0 for i in range(10)}
    for i, eigenvector in enumerate(tqdm(eigenvectors_bin)):
        tp = 0
        fp = 0
        for j, eigenvector2 in enumerate(eigenvectors_bin):
            if np.abs(eigenvector - eigenvector2).sum() <= k:
                if labels[i] == labels[j]:
                    tp += 1
                else:
                    fp += 1
        labels_dic[labels[i]] += tp/(tp+fp)
        labels_size_dic[labels[i]] += 1
    ap_dic = {i:labels_dic[i]/labels_size_dic[i] for i in range(10)}
    map_value = np.array(list(ap_dic.values())).mean()
    return map_value

In [130]:
map(eigenvectors_bin, labels, 2)

100%|██████████| 10000/10000 [04:08<00:00, 40.17it/s]


0.7537130294700397

In [131]:
map(eigenvectors_bin2, labels, 2)

100%|██████████| 10000/10000 [04:03<00:00, 41.12it/s]


0.37877593377796936