# Exact nearest-neighbour calculation for a set of randomly chosen images

In [None]:
import os
import numpy as np
from PIL import Image
from io import BytesIO
import requests
from scipy.spatial.distance import jaccard
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from sklearn.cluster import KMeans
from tqdm import tqdm
from scipy.spatial.distance import cosine, cdist

# load features

In [None]:
# Directory whose entries are read as feature-vector files; the entry names
# double as image ids in the cells below.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
feature_vector_dir = "/Users/pimh/Desktop/feature_vectors/"

In [None]:
# Every directory entry is treated as an image id. os.listdir returns names in
# arbitrary order and includes hidden files, if the directory contains any.
# The commented-out line is an alternative that draws 25,000 ids at random
# (np.random.choice samples with replacement unless replace=False is given).
# feature_vector_ids = np.random.choice(os.listdir(feature_vector_dir), 25_000)
feature_vector_ids = os.listdir(feature_vector_dir)

In [None]:
# Full path of each feature-vector file, kept in the same order as the ids so
# that positions in both lists line up.
feature_vector_paths = [
    os.path.join(feature_vector_dir, file_name)
    for file_name in feature_vector_ids
]

In [None]:
# Each file is read as a flat run of float32 values. The path is handed to
# numpy directly so the data is read as binary, instead of going through a
# file handle opened in text mode.
# np.stack needs every vector to have the same length; the result has one row
# per entry of feature_vector_paths, in the same order.
feature_vectors = np.stack(
    [np.fromfile(path, dtype=np.float32) for path in feature_vector_paths]
)

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# stacked array for a quick visual check.
feature_vectors

# images

In [None]:
def get_image(query_id, timeout=30):
    """Download an image from the Wellcome Collection IIIF server.

    The URL requests the full image region with the size parameter ``,300``.

    Args:
        query_id: Identifier substituted into the IIIF image URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The PIL image decoded from the response body.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    base_url = (
        "https://iiif.wellcomecollection.org/image/{}.jpg/full/,300/0/default.jpg"
    )
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(base_url.format(query_id), timeout=timeout)
    # Surface a failed request as an HTTP error here, rather than as an opaque
    # decoding error when PIL is handed the body of an error page.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


def stack_images(images):
    """Join several images side by side into a single PIL image.

    Each image is converted to a numpy array and the arrays are concatenated
    along axis 1, so the inputs must agree in shape on every other axis.
    """
    pixel_arrays = []
    for picture in images:
        pixel_arrays.append(np.array(picture))

    side_by_side = np.concatenate(pixel_arrays, axis=1)
    return Image.fromarray(side_by_side)


def images_from_ids(ids, n=10):
    """Fetch the images for the first ``n`` ids and return them as one strip.

    Args:
        ids: Sequence of image ids; only the leading ``n`` are downloaded.
        n: Number of images to include.

    Returns:
        The single image produced by ``stack_images`` from the fetched images.
    """
    fetched = []
    for image_id in ids[:n]:
        fetched.append(get_image(image_id))
    return stack_images(fetched)

In [None]:
def nearest_neighbours(query_id):
    """Rank every known id by cosine distance to the query's feature vector.

    Reads the module-level ``feature_vector_ids`` and ``feature_vectors``.

    Args:
        query_id: An id present in ``feature_vector_ids``.

    Returns:
        All ids from ``feature_vector_ids``, ordered from smallest to largest
        cosine distance to the query vector.
    """
    # Locate the row belonging to the query id (first match wins).
    all_ids = np.array(feature_vector_ids)
    matching_rows = np.where(all_ids == query_id)[0]
    row = matching_rows[0]

    # cdist expects 2-D inputs, hence the reshape to a single-row matrix.
    query_vector = feature_vectors[row].reshape(1, -1)
    cosine_distances = cdist(query_vector, feature_vectors, "cosine")

    # The distance matrix has exactly one row; take its ascending ordering.
    ranking = np.argsort(cosine_distances)[0]
    return [feature_vector_ids[position] for position in ranking]

In [None]:
# Pick one id at random to use as an example query.
query_id = np.random.choice(feature_vector_ids)

print(query_id)
# Last expression of the cell: the downloaded image becomes the cell output.
get_image(query_id)

In [None]:
# Rank all ids by cosine distance to the example query, then show the images
# for the first ten of them (the default n of images_from_ids) as one strip.
neighbour_ids = nearest_neighbours(query_id)
images_from_ids(neighbour_ids)

In [None]:
# Compute the exact neighbour ranking for 1000 distinct, randomly chosen ids.
query_ids = np.random.choice(feature_vector_ids, 1000, replace=False)
exact_nearest_neighbour_dict = {}

# Iterate over the sampled ids themselves. Drawing a fresh random id inside the
# loop would ignore the sample above and could pick the same id more than once,
# leaving fewer than 1000 entries in the dict.
for query_id in tqdm(query_ids):
    exact_nearest_neighbour_dict[query_id] = nearest_neighbours(query_id)

In [None]:
import pickle

# Persist the mapping of query id -> ordered neighbour ids. The path is
# relative, so the file lands in the current working directory.
with open("exact_nearest_neighbour.pkl", "wb") as f:
    pickle.dump(exact_nearest_neighbour_dict, f)