In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's "whitegrid" style and a large
# 20x20 default figure size for every matplotlib figure.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (20, 20)

In [None]:
import os
import numpy as np
from PIL import Image
from io import BytesIO
import requests
from scipy.spatial.distance import jaccard
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from sklearn.cluster import KMeans, AgglomerativeClustering
from tqdm import tqdm
import pickle

# load features

In [None]:
# Directory containing one raw feature-vector file per id; the file names are used
# as ids further down.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
feature_vector_dir = "/Users/pimh/Desktop/feature_vectors/"

In [None]:
# Draw the training subset used to fit the clustering models.
# Sampling is done WITHOUT replacement so that no file is loaded (and therefore
# weighted) more than once, and the sample size is capped at the number of files
# that actually exist in the directory.
available_ids = os.listdir(feature_vector_dir)
feature_vector_ids = np.random.choice(
    available_ids, size=min(25_000, len(available_ids)), replace=False
)
# To train on every file instead of a sample:
# feature_vector_ids = os.listdir(feature_vector_dir)

In [None]:
# Resolve each selected file name to its full path inside the feature-vector directory.
feature_vector_paths = []
for file_name in feature_vector_ids:
    feature_vector_paths.append(os.path.join(feature_vector_dir, file_name))

In [None]:
# Read every file as a flat float32 array and stack them into one 2D array with a
# row per file. The path is handed straight to np.fromfile so the raw bytes are
# read in binary mode (the files were previously opened in text mode first).
feature_vectors = np.stack(
    [np.fromfile(path, dtype=np.float32) for path in feature_vector_paths]
)

In [None]:
# Sanity check: one row per loaded file, one column per feature dimension.
feature_vectors.shape

# load column labels

In [None]:
# One label per feature-vector column; used below to split the columns into groups.
column_labels = np.load("data/column_labels.npy")

# split feature vectors by label, and find clusters within groups

In [None]:
# Fit one 32-cluster KMeans model per column group and persist each model so the
# complete dataset can later be encoded with exactly the same models.
os.makedirs("models", exist_ok=True)  # the pickles below need this directory

clusters = []

for i in tqdm(np.unique(column_labels)):
    # Keep only the columns that belong to group `i`. This mask was previously
    # hard-coded to `column_labels == 1`, which trained every model on the same
    # columns regardless of the group being processed.
    feature_group = feature_vectors[:, column_labels == i]
    kmeans = KMeans(n_clusters=32).fit(feature_group)
    clusters.append(kmeans.labels_)

    with open(f"models/kmeans_{i}.pkl", "wb") as f:
        pickle.dump(kmeans, f)

# encode _all_ feature vectors using the clustering models trained on the subset

In [None]:
# Reload the COMPLETE set of feature vectors (not just the training sample).
feature_vector_ids = os.listdir(feature_vector_dir)

feature_vector_paths = [
    os.path.join(feature_vector_dir, file_name) for file_name in feature_vector_ids
]

# Each file is a flat float32 array; stacking gives one row per file, in the same
# order as `feature_vector_ids`. The path is passed directly to np.fromfile so the
# bytes are read in binary mode (the files were previously opened in text mode).
feature_vectors = np.stack(
    [np.fromfile(path, dtype=np.float32) for path in feature_vector_paths]
)

In [None]:
# Sanity check: one row per file in the directory, one column per feature dimension.
feature_vectors.shape

In [None]:
# Assign every feature vector to a cluster within each column group, using the
# per-group KMeans models stored under models/.
clusters = []

for i in tqdm(np.unique(column_labels)):
    # NOTE: pickle.load executes arbitrary code - only load model files that were
    # produced by this notebook / come from a trusted source.
    with open(f"models/kmeans_{i}.pkl", "rb") as f:
        kmeans = pickle.load(f)

    # Use the columns of group `i`, matching the model loaded above. The mask was
    # previously hard-coded to `column_labels == 1`, so every model was applied to
    # the columns of group 1.
    feature_group = feature_vectors[:, column_labels == i]
    labels = kmeans.predict(feature_group)
    clusters.append(labels)

# send data to elasticsearch

In [None]:
# `clusters` is a list with one label array per column group; stack them as rows and
# transpose so that each ROW is one feature vector and each COLUMN one group.
# NOTE(review): not idempotent - re-running this cell transposes the matrix again.
clusters = np.vstack(clusters).T

In [None]:
# Sanity check: expected to be (number of feature vectors, number of column groups).
clusters.shape

In [None]:
def listify_for_es(cluster_array):
    """Turn a sequence of cluster assignments into position-tagged string tokens.

    Each value is prefixed with its index in the sequence, so the value `val` at
    position `i` becomes the string "i-val".

    Args:
        cluster_array: iterable of cluster ids.

    Returns:
        A list of "<position>-<cluster id>" strings, in input order.
    """
    tokens = []
    for position, cluster_id in enumerate(cluster_array):
        tokens.append(f"{position}-{cluster_id}")
    return tokens

In [None]:
def get_es_client():
    """Build an Elasticsearch client using HTTP basic authentication.

    Connection details are read from the environment so that real credentials never
    have to be typed into (and committed with) the notebook:

        ES_URL       cluster URL
        ES_USERNAME  basic-auth user name
        ES_PASSWORD  basic-auth password

    Any variable that is not set falls back to an empty string, which matches the
    blank placeholders this function used before.

    Returns:
        An `Elasticsearch` client instance.
    """
    username = os.environ.get("ES_USERNAME", "")
    password = os.environ.get("ES_PASSWORD", "")
    url = os.environ.get("ES_URL", "")
    return Elasticsearch(url, http_auth=(username, password))

In [None]:
index_name = "image-similarity-256-32"

es = get_es_client()
# Start from a clean slate. `ignore_unavailable=True` turns the delete into a no-op
# when the index does not exist yet (e.g. on the very first run) instead of failing
# the cell with a "not found" error.
es.indices.delete(index=index_name, ignore_unavailable=True)

In [None]:
# Create the index without an explicit mapping.
# NOTE(review): the search below queries `feature_vector.keyword`, so this presumably
# relies on Elasticsearch's dynamic mapping to create that sub-field - confirm.
es.indices.create(index=index_name)

In [None]:
# Build one bulk-index action per feature vector: the document id is the file name
# and the body holds the position-tagged cluster tokens for that vector.
# `zip` has no length, so tqdm is told the total explicitly; otherwise the progress
# bar can only show a running count.
actions = [
    {
        "_index": index_name,
        "_type": "feature_vector",
        "_id": feature_vector_id,
        "_source": {"feature_vector": listify_for_es(cluster_array)},
    }
    for feature_vector_id, cluster_array in tqdm(
        zip(feature_vector_ids, clusters),
        total=min(len(feature_vector_ids), len(clusters)),
    )
]

In [None]:
# Send all prepared actions to Elasticsearch through the bulk helper.
helpers.bulk(es, actions)

# search

In [None]:
def get_image(query_id):
    """Download the image for `query_id` from the Wellcome Collection IIIF endpoint.

    The URL requests the full image region scaled to a height of 300 (",300").

    Args:
        query_id: image identifier, substituted into the IIIF URL.

    Returns:
        The downloaded image as a PIL `Image`.

    Raises:
        requests.HTTPError: if the server answers with an error status. Without this
            check the error page body would be handed to PIL and fail there with a
            much less helpful message.
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    base_url = (
        "https://iiif.wellcomecollection.org/image/{}.jpg/full/,300/0/default.jpg"
    )
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(base_url.format(query_id), timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


def stack_images(images):
    """Join several images side by side into a single image.

    Every image is converted to RGB first. Greyscale images become 2D arrays and
    colour images 3D arrays, and `np.concatenate` refuses to join arrays of
    different rank, so a mixed set of results would otherwise raise.

    Args:
        images: non-empty iterable of PIL images that all share the same height.

    Returns:
        One PIL `Image` containing the inputs concatenated left to right.
    """
    rgb_arrays = [np.array(image.convert("RGB")) for image in images]
    # axis=1 is the width axis, so the images are placed next to each other.
    return Image.fromarray(np.concatenate(rgb_arrays, axis=1))


def get_neighbour_images(query_id, index_name, n=10):
    """Fetch and stack the images most similar to an already-indexed document.

    Runs a `more_like_this` query against the `feature_vector.keyword` field, using
    the indexed document `query_id` as the example, prints the total hit count that
    Elasticsearch reports, then downloads each returned hit's image.

    Uses the module-level `es` client.

    Args:
        query_id: id of the indexed document to find neighbours for.
        index_name: index to search (also the index holding the example document).
        n: maximum number of hits to request.

    Returns:
        A single PIL image with the neighbour images joined side by side.
    """
    example_document = {"_index": index_name, "_id": query_id}
    similarity_query = {
        "more_like_this": {
            "fields": ["feature_vector.keyword"],
            "like": [example_document],
            "min_term_freq": 1,
        }
    }
    response = es.search(index=index_name, size=n, body={"query": similarity_query})

    hits = response["hits"]
    hit_ids = []
    for hit in hits["hits"]:
        hit_ids.append(hit["_id"])
    print(hits["total"]["value"])

    return stack_images([get_image(hit_id) for hit_id in hit_ids])

In [None]:
# Pick a random id from the indexed feature vectors to use as the query.
query_id = np.random.choice(feature_vector_ids)
print(query_id)

# Last expression of the cell: display the query image itself.
get_image(query_id)

In [None]:
# Display the query's nearest neighbours (as returned by Elasticsearch) side by side.
get_neighbour_images(query_id, index_name)