In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.sparse.linalg import eigsh
from pretrainedModel import pretrainedModel
from tensorflow import keras
from PIL import Image
from sklearn.preprocessing import StandardScaler
import torch
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestNeighbors

In [2]:
# In this file, we will do spectral hashing

# Preprocessing

In [4]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()


In [11]:
#scaler = StandardScaler()
#x_train = scaler.fit_transform(x_train.reshape(x_train.shape[-0], -1)).reshape(x_train.shape)
#x_test = scaler.fit_transform(x_test.reshape(x_test.shape[-0], -1)).reshape(x_test.shape)

In [3]:
training_features = x_train.reshape(x_train.shape[-0], -1)
test_features = x_test.reshape(x_test.shape[-0], -1)

In [4]:
# Instantiate the model
model = pretrainedModel()
print("model loaded")
# Process the images and move the output to the CPU for use in a list
train_images = [Image.fromarray(image).convert('RGB') for image in x_train]
test_images = [Image.fromarray(image).convert('RGB') for image in x_test]
print("images done")
# Ensure the model is in evaluation mode
model.eval()

# Disable gradient calculations for faster inference
with torch.no_grad():
    training_features = np.array([np.array(model(image).cpu()) for image in tqdm(train_images)])

with torch.no_grad():
    test_features = np.array([np.array(model(image).cpu()) for image in tqdm(test_images)])

model loaded
images done


100%|██████████| 60000/60000 [07:52<00:00, 126.99it/s]
100%|██████████| 10000/10000 [01:18<00:00, 127.42it/s]


In [5]:
np.save(r'C:\Users\Test\Desktop\p7\Spectral\features\training_features_cnn.npy', training_features)
np.save(r'C:\Users\Test\Desktop\p7\Spectral\features\test_features_cnn.npy', test_features)

In [6]:
training_features = np.load(r'C:\Users\Test\Desktop\p7\Spectral\features\training_features_cnn.npy')
test_features = np.load(r'C:\Users\Test\Desktop\p7\Spectral\features\test_features_cnn.npy')

### Standardization and PCA

In [5]:
scaler = StandardScaler()
training_features = scaler.fit_transform(training_features)  # Standardize the data
test_features = scaler.transform(test_features)

In [6]:
pca = PCA(n_components=100)  # Set the number of components to keep
training_features = pca.fit_transform(training_features)  # Fit PCA on the standardized data and transform
test_features = pca.transform(test_features)

### Similarity matrix

In [21]:
def cos_sim(a,b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [7]:
nbrs = NearestNeighbors(n_neighbors=50).fit(training_features)

# Find the nearest neighbors
distances, indices = nbrs.kneighbors(training_features)

# Create an adjacency matrix
n_samples = training_features.shape[0]
adjacency_matrix = np.zeros((n_samples, n_samples))

# Populate the adjacency matrix
for i, neighbors in enumerate(indices):
    for neighbor in neighbors:
        adjacency_matrix[i, neighbor] = 1
        adjacency_matrix[neighbor, i] = 1  # Ensure symmetry for an undirected graph



In [22]:
training_features = np.array(training_features)

# Initialize the similarity matrix on the GPU
sim_mat = np.zeros((training_features.shape[0], training_features.shape[0]))

# Compute the cosine similarity using the GPU
for i in tqdm(range(training_features.shape[0])):
    for j in range(i):
        sim_mat[i, j] = cos_sim(training_features[i], training_features[j])

sim_mat = sim_mat + sim_mat.T
np.save(r'C:\Users\Test\Desktop\p7\Spectral\sim_mat\similarity_matrix.npy', sim_mat)
#sim_mat = np.load(r'C:\Users\Test\Desktop\p7\Spectral\sim_mat\similarity_matrix.npy')



100%|██████████| 60000/60000 [2:15:22<00:00,  7.39it/s]  


### Spectral Hashing

In [8]:
#threshold = np.min(np.max(sim_mat, axis=0))
#sim_mat = np.where(sim_mat>=threshold,1,0)
dim=adjacency_matrix.shape[0]
D = np.zeros([dim,dim])
for i in range(dim):
    D[i,i] = adjacency_matrix[i].sum()
L = D- adjacency_matrix

In [9]:
eigenvalues, eigenvectors = eigsh(L, k=32, which="SM") # overvej max_iter, tolerance?
np.save(r'C:\Users\Test\Desktop\p7\Spectral\eigenvectors\eigenvectors_knn.npy', eigenvectors)

In [12]:
eigenvectors = np.load(r'C:\Users\Test\Desktop\p7\Spectral\eigenvectors\eigenvectors_knn.npy')

In [2]:
threshold1 = 0
eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)

NameError: name 'np' is not defined

CLASSIFIER

In [11]:
clf = MLPClassifier().fit(training_features, eigenvectors_bin)



In [12]:
test_hashes = clf.predict(test_features)

In [51]:
print(test_hashes.shape)
print(y_test.shape)
print(test_hashes[0].shape)

(10000, 32)
(10000,)
(32,)


In [13]:
def mean_average_precision(test_hashes, labels):
    aps = []
    for i, test_hash in enumerate(tqdm(test_hashes)):
        label = labels[i]
        distances = np.abs(test_hashes - test_hashes[i]).sum(axis=1)
        tp = np.where(labels==label, 1, 0)
        hash_df = pd.DataFrame({"distances":distances, "tp":tp}).reset_index()
        hash_df = hash_df.drop(index=i)
        hash_df = hash_df.sort_values(["distances", "index"]).reset_index(drop=True)
        hash_df = hash_df.drop(["index", "distances"], axis=1).reset_index()
        hash_df = hash_df[hash_df["tp"]==1]
        hash_df["tp"] = hash_df["tp"].cumsum()
        hash_df["index"] = hash_df["index"] +1 
        precision = np.array(hash_df["tp"]) / np.array(hash_df["index"])
        ap = precision.mean()
        aps.append(ap)
    
    return np.array(aps).mean()


In [36]:
eigenvectors_bin.mean()

0.5096208333333333

In [14]:
aps = mean_average_precision(test_hashes, y_test)

100%|██████████| 10000/10000 [00:31<00:00, 318.81it/s]


In [1]:
aps
# JEG PREDICTED ET L (BOZOKODE )

NameError: name 'aps' is not defined

## Extra Kristians hyggekode

In [8]:
def test_tal(sim_mat, bits, threshold):    
    dim = sim_mat.shape[0]
    #threshold = np.min(np.max(sim_mat, axis=1))

    diag = np.sum((sim_mat > threshold), axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [9]:
def test_tbl(sim_mat, bits, threshold):    
    dim = sim_mat.shape[0]
    #threshold = np.min(np.max(sim_mat, axis=1))

    sim_mat = np.where(sim_mat>=threshold,1,0)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = sim_mat[i].sum()
    L = D- sim_mat

    eigenvalues, eigenvectors_bin = eigsh(L, k=bits, which="SM")
    return eigenvectors_bin

In [10]:
def test_tbl_summed_diag(sim_mat, bits):    
    dim = sim_mat.shape[0]

    diag = np.sum(sim_mat, axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [13]:
hash_codes = test_tbl(sim_mat, 32, threshold = np.min(np.max(sim_mat, axis=1)))

In [141]:
def map(eigenvectors_bin, labels, k):
    labels_dic = {i:0 for i in range(10)}
    labels_size_dic = {i:0 for i in range(10)}
    for i, eigenvector in enumerate(eigenvectors_bin):
        tp = 0
        fp = 0
        for j, eigenvector2 in enumerate(eigenvectors_bin):
            if np.abs(eigenvector - eigenvector2).sum() <= k:
                if labels[i] == labels[j]:
                    tp += 1
                else:
                    fp += 1
        labels_dic[labels[i]] += tp/(tp+fp)
        labels_size_dic[labels[i]] += 1
    ap_dic = {i:labels_dic[i]/labels_size_dic[i] for i in range(10)}
    map_value = np.array(list(ap_dic.values())).mean()
    return map_value

In [153]:
min_threshold = int(np.round(np.min(np.max(sim_mat, axis=1))*100))
min_threshold

58

In [158]:
results_df = pd.DataFrame(columns=["algorithm", "threshold", "hamming_distance", "bits", "map"])
c = 0
for bits in [32, 64, 128]:
    for k in range(1,4):
        for threshold in tqdm(range(min_threshold,95,2)):
            hashes_tbl = test_tbl(sim_mat,bits,threshold/100)
            hashes_tal = test_tal(sim_mat,bits,threshold/100)   
            map_tbl = map(hashes_tbl, labels, k)
            map_tal = map(hashes_tal, labels, k)
            results_df.loc[c] = ["tbl", threshold/100, k, bits, map_tbl]
            results_df.loc[c] = ["tal", threshold/100, k, bits, map_tal]
            c +=1
        hashes_tbl_sd = test_tbl(sim_mat,bits,threshold/100)
        map_tbl_sd = map(hashes_tbl, labels, k)
        results_df.loc[c] = ["tbl_sd", pd.NA, k, bits, map_tbl_sd]
        c+=1
        results_df.to_csv("spectral_hashing_test_results.csv", index=False)
    

  0%|          | 0/19 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
def spectral_tal(images, bits):
    dim = images.shape[0]
    sim_mat = np.zeros([dim,dim])
    for i in tqdm(range(dim)):
        for j in range(i):
            sim_mat[i, j] = cos_sim(images[i], images[j])
    sim_mat = sim_mat + sim_mat.T
    
    threshold = np.min(np.max(sim_mat, axis=1))

    diag = np.sum((sim_mat > threshold), axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [None]:
def spectral_tbl(images, bits):
    dim = images.shape[0]
    sim_mat = np.zeros([dim,dim])
    for i in tqdm(range(dim)):
        for j in range(i):
            sim_mat[i, j] = cos_sim(images[i], images[j])
    sim_mat = sim_mat + sim_mat.T
    
    threshold = np.min(np.max(sim_mat, axis=1))

    sim_mat = np.where(sim_mat>=threshold,1,0)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = sim_mat[i].sum()
    L = D- sim_mat

    eigenvalues, eigenvectors_bin = eigsh(L, k=bits, which="SM")
    return eigenvectors_bin

In [None]:
def spectral_tbl_summed_diag(images, bits):
    dim = images.shape[0]
    sim_mat = np.zeros([dim,dim])
    for i in tqdm(range(dim)):
        for j in range(i):
            sim_mat[i, j] = cos_sim(images[i], images[j])
    sim_mat = sim_mat + sim_mat.T
    
    threshold = np.min(np.max(sim_mat, axis=1))

    diag = np.sum(sim_mat, axis=1)
    D = np.zeros([dim,dim])
    for i in range(dim):
        D[i,i] = diag[i]
    L = D- sim_mat

    eigenvalues, eigenvectors = eigsh(L, k=bits, which="SM")
    threshold1 = np.median(eigenvectors)
    eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
    return eigenvectors_bin

In [116]:
threshold = np.min(np.max(sim_mat, axis=1))

In [117]:
diag = np.sum((sim_mat > threshold), axis=1)
D = np.zeros([10000,10000])
for i in range(10000):
    D[i,i] = diag[i]

L = D- sim_mat

In [118]:
diag2 = np.sum(sim_mat, axis=1)
D2 = np.zeros([10000,10000])
for i in range(10000):
    D2[i,i] = diag2[i]

L2 = D2- sim_mat

In [119]:
eigenvalues, eigenvectors = eigsh(L, k=32, which="SM")

In [120]:
eigenvalues2, eigenvectors2 = eigsh(L2, k=32, which="SM")

In [121]:
threshold1 = np.median(eigenvectors)
threshold2 = np.median(eigenvectors2)

In [122]:
eigenvectors_bin = np.where(eigenvectors > threshold1, 1, 0)
eigenvectors_bin2 = np.where(eigenvectors2 > threshold2, 1, 0)

In [123]:
eigenvectors_bin

array([[1, 0, 1, ..., 1, 0, 1],
       [0, 1, 1, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 1, ..., 0, 1, 0]])

In [124]:
print(len(np.unique(eigenvectors_bin,axis = 0)))
print(len(np.unique(eigenvectors_bin2,axis = 0)))

6911
3211


In [129]:
def map(eigenvectors_bin, labels, k):
    labels_dic = {i:0 for i in range(10)}
    labels_size_dic = {i:0 for i in range(10)}
    for i, eigenvector in enumerate(tqdm(eigenvectors_bin)):
        tp = 0
        fp = 0
        for j, eigenvector2 in enumerate(eigenvectors_bin):
            if np.abs(eigenvector - eigenvector2).sum() <= k:
                if labels[i] == labels[j]:
                    tp += 1
                else:
                    fp += 1
        labels_dic[labels[i]] += tp/(tp+fp)
        labels_size_dic[labels[i]] += 1
    ap_dic = {i:labels_dic[i]/labels_size_dic[i] for i in range(10)}
    map_value = np.array(list(ap_dic.values())).mean()
    return map_value

In [130]:
map(eigenvectors_bin, labels, 2)

100%|██████████| 10000/10000 [04:08<00:00, 40.17it/s]


0.7537130294700397

In [131]:
map(eigenvectors_bin2, labels, 2)

100%|██████████| 10000/10000 [04:03<00:00, 41.12it/s]


0.37877593377796936