In [None]:
import os
import numpy as np
from PIL import Image
from io import BytesIO
import requests
from scipy.spatial.distance import jaccard
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from sklearn.cluster import KMeans
from tqdm import tqdm

# load features

In [None]:
# Directory holding the saved .npy files read by the cells below.
# Set the PALETTE_VECTOR_DIR environment variable to point the notebook at
# another location; the original local path is kept as the fallback.
feature_vector_dir = os.environ.get(
    "PALETTE_VECTOR_DIR", "/Users/pimh/Desktop/palette_vectors/"
)

In [None]:
# Read the saved id array from image_ids.npy in the feature vector directory.
ids_path = os.path.join(feature_vector_dir, "image_ids.npy")
with open(ids_path, "rb") as ids_file:
    feature_vector_ids = np.load(ids_file)

In [None]:
# Read the saved embedding array from palette_embeddings.npy.
embeddings_path = os.path.join(feature_vector_dir, "palette_embeddings.npy")
with open(embeddings_path, "rb") as embeddings_file:
    feature_vectors = np.load(embeddings_file)

In [None]:
# Display the loaded embedding array as a quick visual sanity check.
feature_vectors

In [None]:
# Show the dimensions of the embedding array.
feature_vectors.shape

# images

In [None]:
def get_image(query_id, timeout=30):
    """Download one image from the Wellcome Collection IIIF image service.

    Args:
        query_id: Identifier substituted into the IIIF URL template.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A PIL ``Image`` opened from the bytes of the HTTP response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    base_url = (
        "https://iiif.wellcomecollection.org/image/{}.jpg/full/,300/0/default.jpg"
    )
    response = requests.get(base_url.format(query_id), timeout=timeout)
    # Surface HTTP failures directly; otherwise an error page would be handed
    # to PIL and fail with an unrelated "cannot identify image file" error.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    return image

In [None]:
# Fetch ten randomly chosen images (sampled with replacement). Drawing directly
# from feature_vector_ids, instead of indexing with a hard-coded upper bound of
# 25_000, keeps the sample valid for however many ids were actually loaded.
images = [
    get_image(image_id) for image_id in np.random.choice(feature_vector_ids, size=10)
]

In [None]:
# Pick one id at random from the loaded ids to use as the example query.
query_id = np.random.choice(feature_vector_ids)

# Last expression of the cell: the fetched image is shown as the cell output.
get_image(query_id)

# split the feature vectors into sections and cluster each section into discrete tokens

In [None]:
# Cut the embedding matrix column-wise (axis 1) into ten equal sections.
feature_sections = np.split(feature_vectors, 10, axis=1)

In [None]:
# Fit an independent 64-cluster KMeans model on every section and keep the
# per-row cluster labels. A fixed random_state makes the labels reproducible
# across kernel restarts; tqdm replaces the printed loop counter.
clusters = []
for section in tqdm(feature_sections):
    kmeans = KMeans(n_clusters=64, random_state=0).fit(section)
    clusters.append(kmeans.labels_)

In [None]:
# Stack the per-section label arrays as rows, then transpose so that each row
# holds the labels of one item across all sections.
# NOTE(review): this rebinds `clusters` from a list to an array, so the cell
# should only be run once after the clustering cell.
clusters = np.transpose(np.vstack(clusters))

In [None]:
# Show the dimensions of the stacked label array.
clusters.shape

In [None]:
def listify_for_es(cluster_array):
    """Turn a sequence of values into position-tagged string tokens.

    Each element becomes "<position>-<value>", where position is the
    zero-based index of the element in ``cluster_array``.

    Args:
        cluster_array: Iterable of values (e.g. one row of cluster labels).

    Returns:
        A list of strings, one per element, in the original order.
    """
    tokens = []
    for position, label in enumerate(cluster_array):
        tokens.append(f"{position}-{label}")
    return tokens

# send data to elasticsearch

In [None]:
def get_es_client():
    """Build an Elasticsearch client from environment configuration.

    Connection details are read from the ES_URL, ES_USERNAME and ES_PASSWORD
    environment variables so that credentials never have to be typed into
    (and saved with) the notebook. Each one falls back to an empty string
    when unset, matching the previous inline placeholders.

    Returns:
        An ``Elasticsearch`` client using HTTP basic authentication.
    """
    username = os.environ.get("ES_USERNAME", "")
    password = os.environ.get("ES_PASSWORD", "")
    url = os.environ.get("ES_URL", "")
    return Elasticsearch(url, http_auth=(username, password))

In [None]:
es = get_es_client()
# Drop any previous copy of the index so the notebook starts from a clean
# slate. ignore_unavailable=True keeps the very first run (when the index does
# not exist yet) from failing with a not-found error.
es.indices.delete(index="palette-similarity", ignore_unavailable=True)

In [None]:
# Create the index. No mappings or settings are passed in this call.
es.indices.create(index="palette-similarity")

In [None]:
# Build one bulk-index action per item: the document id is the image id and
# the body holds that item's position-tagged cluster tokens.
# zip() has no length, so the total is passed explicitly to let tqdm render a
# real progress bar instead of a bare counter.
actions = [
    {
        "_index": "palette-similarity",
        "_type": "feature_vector",
        "_id": feature_vector_id,
        "_source": {"feature_vector": listify_for_es(cluster_array)},
    }
    for feature_vector_id, cluster_array in tqdm(
        zip(feature_vector_ids, clusters), total=len(feature_vector_ids)
    )
]

In [None]:
# Send all prepared actions to Elasticsearch through the bulk helper.
helpers.bulk(es, actions)

# search

In [None]:
def stack_images(images):
    """Join several PIL images side by side into one image.

    Every image is converted to RGB first. Without this, a greyscale image
    yields a 2-D array while a colour image yields a 3-D array, and
    concatenating the mix fails.

    Args:
        images: Non-empty iterable of PIL images of equal height.

    Returns:
        A single PIL ``Image`` with the inputs laid out left to right.

    Raises:
        ValueError: If ``images`` is empty or the heights differ
            (raised by ``np.concatenate``).
    """
    arrays = [np.array(image.convert("RGB")) for image in images]
    # axis=1 is the width axis of an (height, width, channels) array.
    return Image.fromarray(np.concatenate(arrays, axis=1))


def get_neighbour_images(query_id, n=10):
    """Find documents similar to ``query_id`` and show their images together.

    Runs a ``more_like_this`` query on the "palette-similarity" index using
    the indexed document ``query_id`` as the example, prints the total hit
    count reported by Elasticsearch, then downloads the image of every
    returned hit.

    Args:
        query_id: Id of an already indexed document to use as the query.
        n: Maximum number of hits to request.

    Returns:
        One PIL image with the neighbours' images stacked side by side.
    """
    similarity_query = {
        "more_like_this": {
            "fields": ["feature_vector.keyword"],
            "like": [{"_index": "palette-similarity", "_id": query_id}],
            "min_term_freq": 1,
        }
    }
    res = es.search(
        index="palette-similarity",
        size=n,
        body={"query": similarity_query},
    )

    hits = res["hits"]
    neighbour_ids = [hit["_id"] for hit in hits["hits"]]
    print(hits["total"]["value"])

    neighbour_images = []
    for neighbour_id in neighbour_ids:
        neighbour_images.append(get_image(neighbour_id))
    return stack_images(neighbour_images)

In [None]:
# Pick a random indexed id to use as the search query and print it so the
# example can be identified later.
query_id = np.random.choice(feature_vector_ids)
print(query_id)

# Last expression of the cell: the query image is shown as the cell output.
get_image(query_id)

In [None]:
# Show the nearest neighbours of the query image chosen in the previous cell.
get_neighbour_images(query_id)