In [None]:
import os
import numpy as np
from PIL import Image
from io import BytesIO
import requests
from scipy.spatial.distance import jaccard
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from sklearn.cluster import KMeans
from tqdm import tqdm

# load features

In [None]:
# Directory containing the feature-vector files (they are read as raw float32
# data and their file names are used as image ids further down). Set the
# FEATURE_VECTOR_DIR environment variable to point somewhere else without
# editing the notebook; the fallback is the original hard-coded location.
feature_vector_dir = os.environ.get(
    "FEATURE_VECTOR_DIR", "/Users/pimh/Desktop/feature_vectors/"
)

In [None]:
# Number of feature vectors to work with; set to None to use every file.
n_samples = 2_000

# os.listdir returns names in arbitrary order, so sort them for a stable
# starting point, and skip hidden files (e.g. .DS_Store) that are not vectors.
available_ids = sorted(
    name for name in os.listdir(feature_vector_dir) if not name.startswith(".")
)

if n_samples is None or n_samples >= len(available_ids):
    feature_vector_ids = np.array(available_ids)
else:
    # Sample WITHOUT replacement: np.random.choice defaults to replace=True,
    # which yields duplicate ids that skew the PCA fit and overwrite each other
    # when indexed under the same document _id. A dedicated, seeded RandomState
    # keeps the sample reproducible without touching the global RNG.
    feature_vector_ids = np.random.RandomState(42).choice(
        available_ids, size=n_samples, replace=False
    )

In [None]:
# Resolve every selected file name to its full path inside feature_vector_dir,
# keeping the same order as feature_vector_ids.
feature_vector_paths = []
for file_name in feature_vector_ids:
    feature_vector_paths.append(os.path.join(feature_vector_dir, file_name))

In [None]:
# Read each file as a flat array of raw float32 values and stack them into a
# single 2-D array with one row per file (same order as feature_vector_paths).
# np.fromfile is given the path directly: the data is binary, so wrapping it in
# a text-mode file object first is unnecessary and misleading.
feature_vectors = np.stack(
    [np.fromfile(path, dtype=np.float32) for path in feature_vector_paths]
)

In [None]:
# Show the stacked feature-vector array as this cell's output.
feature_vectors

# images

In [None]:
def get_image(query_id, height=300, timeout=30):
    """Download an image from the Wellcome Collection IIIF image service.

    Args:
        query_id: Image identifier; it is inserted into the URL as
            ``<query_id>.jpg``.
        height: Requested image height in pixels (the ``,<height>`` IIIF size
            parameter). Defaults to 300, the value previously hard-coded.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The downloaded image as a ``PIL.Image.Image``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    base_url = (
        "https://iiif.wellcomecollection.org/image/{}.jpg/full/,{}/0/default.jpg"
    )
    response = requests.get(base_url.format(query_id, height), timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to PIL, which
    # would otherwise surface as a confusing "cannot identify image" error.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

In [None]:
# Pick one row of the feature matrix at random and look up the matching id.
query_ix = np.random.choice(feature_vectors.shape[0])
query_id = feature_vector_ids[query_ix]

# Fetch the corresponding image; as the last expression it is rendered inline.
get_image(query_id)

# reduce dims to 256

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a 256-component PCA on the feature vectors and project them onto it.
pca = PCA(n_components=256)
reduced_dim_feature_vectors = pca.fit_transform(feature_vectors)

# send data to elasticsearch

In [None]:
def get_es_client():
    """Build an Elasticsearch client using HTTP basic authentication.

    Connection details are taken from the ES_URL, ES_USERNAME and ES_PASSWORD
    environment variables so that credentials never have to be typed into (and
    saved with) the notebook. Each one falls back to an empty string, which is
    what was previously hard-coded.

    Returns:
        An ``Elasticsearch`` client instance.
    """
    url = os.environ.get("ES_URL", "")
    username = os.environ.get("ES_USERNAME", "")
    password = os.environ.get("ES_PASSWORD", "")
    return Elasticsearch(url, http_auth=(username, password))

In [None]:
es = get_es_client()
# Drop any previous copy of the index so the notebook starts from a clean
# slate. ignore_unavailable=True turns this into a no-op when the index does
# not exist yet; without it the first run against a fresh cluster fails.
es.indices.delete(index="dense-vectors", ignore_unavailable=True)

In [None]:
# Index definition: one "feature_vector" field mapped as a 256-dimensional
# dense_vector.
index_name = "dense-vectors"
vector_field = {"type": "dense_vector", "dims": 256}
index_body = {"mappings": {"properties": {"feature_vector": vector_field}}}

es.indices.create(index=index_name, body=index_body)

In [None]:
from pprint import pprint

# Read the mapping of the "feature_vector" field back from the index and
# pretty-print it.
field_mapping = es.indices.get_field_mapping(
    index="dense-vectors", fields=["feature_vector"]
)
pprint(field_mapping)

In [None]:
# Build one bulk action per feature vector, using the file name as the
# document id.
#
# No "_type" is set: the index is created with a typeless mapping, and
# Elasticsearch 7+ rejects documents sent with a custom mapping type to such an
# index (mapping types are removed).
actions = [
    {
        "_index": "dense-vectors",
        # Plain str rather than a NumPy string scalar, for clean serialisation.
        "_id": str(feature_vector_id),
        "_source": {
            "feature_vector": feature_vector.tolist(),
            "another_field": "some text",
        },
    }
    for feature_vector_id, feature_vector in tqdm(
        zip(feature_vector_ids, reduced_dim_feature_vectors),
        # zip() has no length, so tell tqdm the total to get a real progress bar.
        total=len(feature_vector_ids),
    )
]

In [None]:
# Send all prepared actions to Elasticsearch through the bulk helper; its
# return value is shown as this cell's output.
helpers.bulk(es, actions)