In [None]:
import os
import numpy as np
from PIL import Image
from io import BytesIO
import requests
from scipy.spatial.distance import jaccard
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from sklearn.cluster import KMeans, MeanShift
from tqdm import tqdm
import pickle

# load features

In [None]:
# Directory containing the feature-vector files (file names double as image ids).
# Set the FEATURE_VECTOR_DIR environment variable to point somewhere else; the
# fallback is the original hard-coded location.
feature_vector_dir = os.environ.get(
    "FEATURE_VECTOR_DIR", "/Users/pimh/Desktop/feature_vectors/"
)

In [None]:
# Draw 10,000 distinct file names at random from the feature-vector directory.
# To work with every file instead of a sample, assign
# os.listdir(feature_vector_dir) to feature_vector_ids directly.
feature_vector_ids = np.random.choice(
    a=os.listdir(feature_vector_dir),
    size=10_000,
    replace=False,
)

In [None]:
# Full path of each selected feature-vector file, in the same order as the ids.
feature_vector_paths = [
    os.path.join(feature_vector_dir, file_name) for file_name in feature_vector_ids
]

In [None]:
# Each file is read as a flat run of binary float32 values. The path is passed
# straight to np.fromfile so the data never goes through a text-mode file handle.
# np.stack then builds a 2-D array with one row per file, which requires every
# file to contain the same number of values.
feature_vectors = np.stack(
    [np.fromfile(path, dtype=np.float32) for path in feature_vector_paths]
)

In [None]:
# Display the stacked array (one row per loaded feature vector) as a sanity check.
feature_vectors

# images

In [None]:
def get_image(query_id, timeout=30):
    """Download the image for ``query_id`` from the Wellcome Collection IIIF service.

    The URL requests the full image region with the size segment ``,300``.

    Args:
        query_id: Image identifier substituted into the IIIF URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The downloaded image as a ``PIL.Image.Image``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    base_url = (
        "https://iiif.wellcomecollection.org/image/{}.jpg/full/,300/0/default.jpg"
    )
    response = requests.get(base_url.format(query_id), timeout=timeout)
    # Surface HTTP failures directly instead of letting PIL fail on an error page.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

In [None]:
# Pick one of the loaded ids at random and display its image.
query_id = np.random.choice(feature_vector_ids)

get_image(query_id)

# divide and binarize from sub-clusters 

In [None]:
# Split the feature matrix column-wise into 256 equal-width groups; np.split
# raises if the number of columns is not divisible by 256.
feature_groups = np.split(feature_vectors, indices_or_sections=256, axis=1)

In [None]:
# Fit one 32-cluster k-means model per feature group and pickle each model.
# MeanShift has no ``n_clusters`` parameter (it derives the number of clusters
# from a bandwidth), so the original call raised a TypeError. KMeans matches the
# "kmeans_{i}.pkl" file names used here and by the encoding step.
os.makedirs("models", exist_ok=True)  # open(..., "wb") does not create directories

for i, feature_group in enumerate(tqdm(feature_groups)):
    clustering_alg = KMeans(n_clusters=32).fit(feature_group)
    with open(f"models/kmeans_{i}.pkl", "wb") as f:
        pickle.dump(clustering_alg, f)

# encode _all_ features using clustering models trained on subset

In [None]:
# Reload every feature vector in the directory (not only a random sample) so
# that all of them can be encoded with the saved clustering models.
feature_vector_ids = os.listdir(feature_vector_dir)

feature_vector_paths = [
    os.path.join(feature_vector_dir, file_name) for file_name in feature_vector_ids
]

# Each file is read as a flat run of binary float32 values. The path is passed
# straight to np.fromfile so the data never goes through a text-mode file handle.
feature_vectors = np.stack(
    [np.fromfile(path, dtype=np.float32) for path in feature_vector_paths]
)
feature_vectors.shape

In [None]:
# Encode every feature vector: for each of the 256 feature groups, load the
# model saved for that group and predict a cluster label for every row.
clusters = []
feature_groups = np.split(feature_vectors, indices_or_sections=256, axis=1)

for i, feature_group in enumerate(tqdm(feature_groups)):
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were written by this notebook.
    with open(f"models/kmeans_{i}.pkl", "rb") as f:
        kmeans = pickle.load(f)

    # One label per row of this group; appended in group order.
    labels = kmeans.predict(feature_group)
    clusters.append(labels)

In [None]:
# Stack the per-group label arrays and transpose so that each row holds one
# feature vector's labels (one column per feature group). This rebinds
# `clusters`, so re-running this cell without re-running the previous one
# transposes the array again.
clusters = np.vstack(clusters).T

# send data to elasticsearch

In [None]:
def listify_for_es(cluster_array):
    """Turn a sequence of labels into ``"<position>-<label>"`` string tokens.

    Prefixing each label with its position keeps equal labels at different
    positions distinct once they are stored as terms.

    Args:
        cluster_array: Iterable of labels, one per position.

    Returns:
        A list of strings, one per element, in the original order.
    """
    tokens = []
    for position, label in enumerate(cluster_array):
        tokens.append(f"{position}-{label}")
    return tokens


def get_es_client(url=None, username=None, password=None):
    """Build an Elasticsearch client that uses HTTP basic authentication.

    Each setting is taken from the matching argument when one is given,
    otherwise from the ``ES_URL`` / ``ES_USERNAME`` / ``ES_PASSWORD``
    environment variable, otherwise it falls back to an empty string (the
    original placeholder value). This keeps credentials out of the notebook.

    Args:
        url: Address of the Elasticsearch cluster.
        username: Basic-auth user name.
        password: Basic-auth password.

    Returns:
        A configured ``Elasticsearch`` client.
    """
    if url is None:
        url = os.environ.get("ES_URL", "")
    if username is None:
        username = os.environ.get("ES_USERNAME", "")
    if password is None:
        password = os.environ.get("ES_PASSWORD", "")
    return Elasticsearch(url, http_auth=(username, password))

In [None]:
# Shared client used by the indexing and search cells below.
es = get_es_client()

In [None]:
# Name of the index that receives the encoded feature vectors.
index_name = "image-similarity-256-32-agg"
# Uncomment to drop an existing index of this name before recreating it.
# es.indices.delete(index=index_name)

In [None]:
# Create the index; no settings or mappings are supplied in this call.
es.indices.create(index=index_name)

In [None]:
# One bulk-index action per image: the document id is the feature-vector file
# name and the body holds that image's "<group>-<label>" tokens.
# zip() has no length, so tqdm is given an explicit total in order to show a
# real progress bar rather than a bare counter.
actions = [
    {
        "_index": index_name,
        "_type": "feature_vector",
        "_id": feature_vector_id,
        "_source": {"feature_vector": listify_for_es(cluster_array)},
    }
    for feature_vector_id, cluster_array in tqdm(
        zip(feature_vector_ids, clusters), total=len(feature_vector_ids)
    )
]

In [None]:
# Send all prepared actions to Elasticsearch through the bulk helper.
helpers.bulk(es, actions)

# search

In [None]:
def stack_images(images):
    """Join images side by side, left to right, into a single PIL image.

    Every image is converted to RGB first. A grayscale image becomes a 2-D
    array while a colour image becomes a 3-D one, and ``np.concatenate``
    refuses to join arrays with differing numbers of dimensions.

    Args:
        images: Iterable of PIL images that all share the same height.

    Returns:
        One RGB ``PIL.Image.Image`` containing the inputs in order.

    Raises:
        ValueError: If ``images`` is empty, or if the image heights differ.
    """
    pixel_arrays = [np.array(image.convert("RGB")) for image in images]
    if not pixel_arrays:
        raise ValueError("stack_images needs at least one image")
    return Image.fromarray(np.concatenate(pixel_arrays, axis=1))


def get_neighbour_images(query_id, index_name, n=10):
    """Fetch and display-stack the images most similar to ``query_id``.

    Runs a ``more_like_this`` query against ``index_name`` using the stored
    document for ``query_id`` as the example, prints the total hit count and
    the ids of the returned hits, then downloads those images.

    Args:
        query_id: Id of the document to use as the "like" example.
        index_name: Index to search; also where the example document is read from.
        n: Maximum number of hits to return.

    Returns:
        A single image with the neighbours placed side by side.
    """
    more_like_this = {
        "fields": ["feature_vector.keyword"],
        "like": [{"_index": index_name, "_id": query_id}],
        "min_term_freq": 1,
    }
    res = es.search(
        index=index_name,
        size=n,
        body={"query": {"more_like_this": more_like_this}},
    )

    neighbour_ids = []
    for hit in res["hits"]["hits"]:
        neighbour_ids.append(hit["_id"])

    print(res["hits"]["total"]["value"])
    print(neighbour_ids)

    return stack_images([get_image(neighbour_id) for neighbour_id in neighbour_ids])

In [None]:
# Choose a random query id, print it and display its image.
query_id = np.random.choice(feature_vector_ids)
print(query_id)

get_image(query_id)

In [None]:
# Display the query's neighbours (10 by default) as one side-by-side image.
get_neighbour_images(query_id, index_name)

# evaluate

In [None]:
import pickle
import math

In [None]:
# Load the pre-computed exact nearest neighbours used as ground truth below.
# Presumably a dict mapping each query id to its ranked neighbour ids - confirm.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data/exact_nearest_neighbour.pkl", "rb") as f:
    exact_nearest_neighbour_dict = pickle.load(f)

# The keys are the query ids that will be evaluated.
query_ids = np.array(list(exact_nearest_neighbour_dict.keys()))

In [None]:
def calculate_badness(preds, targets):
    """Average rank displacement of the ids that appear in both rankings.

    For each id present in both ``preds`` and ``targets`` the score is
    ``abs(pred_rank - target_rank) / log(target_rank + 2)``, so a displacement
    counts for more when the id ranks near the top of ``targets``. Ranks are
    zero-based positions; if an id is repeated, its first position is used.

    Args:
        preds: Sequence (list or array) of predicted ids, best first.
        targets: Sequence (list or array) of reference ids, best first.

    Returns:
        The mean score over the shared ids, or ``nan`` when the two rankings
        have no id in common (the mean is undefined in that case).
    """
    # Convert arrays and lists alike to plain Python values so ids hash uniformly.
    pred_ids = np.asarray(preds).tolist()
    target_ids = np.asarray(targets).tolist()

    # First position of every id, built in one pass instead of scanning per id.
    pred_rank = {}
    for rank, work_id in enumerate(pred_ids):
        pred_rank.setdefault(work_id, rank)
    target_rank = {}
    for rank, work_id in enumerate(target_ids):
        target_rank.setdefault(work_id, rank)

    shared_hashes = pred_rank.keys() & target_rank.keys()
    if not shared_hashes:
        # No overlap: avoid dividing by zero and report an undefined score.
        return float("nan")

    total_badness = 0
    for work_id in shared_hashes:
        target = target_rank[work_id]
        total_badness += abs(pred_rank[work_id] - target) / math.log(target + 2)

    return total_badness / len(shared_hashes)

In [None]:
def get_neighbour_ids(
    query_id,
    index_name="image-similarity",
    like_index="image-similarity-256-256",
    n=1000,
):
    """Return the ids of the documents most similar to ``query_id``.

    Runs a ``more_like_this`` query on the ``feature_vector.keyword`` field,
    using the document ``query_id`` stored in ``like_index`` as the example.

    NOTE(review): the defaults search one index while reading the example
    document from a different one, and neither equals the
    "image-similarity-256-32-agg" index built earlier in this notebook.
    Confirm that this is intended, or pass the index names explicitly.

    Args:
        query_id: Id of the document to use as the "like" example.
        index_name: Index to search.
        like_index: Index from which the example document is read.
        n: Maximum number of hits to return.

    Returns:
        List of hit ids in the order Elasticsearch returned them.
    """
    res = es.search(
        index=index_name,
        size=n,
        body={
            "query": {
                "more_like_this": {
                    "fields": ["feature_vector.keyword"],
                    "like": [{"_index": like_index, "_id": query_id}],
                    "min_term_freq": 1,
                }
            }
        },
    )

    return [hit["_id"] for hit in res["hits"]["hits"]]

In [None]:
# Score every query id and keep the results keyed by query id.
badnesses = {}

for i, query_id in enumerate(query_ids):
    # get_neighbour_ids asks for up to 1000 hits; only the first 100 are scored.
    preds = np.array(get_neighbour_ids(query_id))[:100]
    # Reference ranking for the same query, truncated to the same length.
    targets = np.array(exact_nearest_neighbour_dict[query_id])[:100]
    badness = calculate_badness(preds, targets)
    badnesses[query_id] = badness
    # Running log: query position and its score.
    print(i, "\t", badness)