In [None]:
import os
import random as random
from tqdm.notebook import tqdm
from elasticsearch import Elasticsearch
import pandas as pd

In [None]:
# Shared Elasticsearch client used by every query in this notebook.
# Host and basic-auth credentials are read from environment variables so that
# no secrets live in the notebook; a missing variable raises KeyError here.
client = Elasticsearch(
    os.environ["ELASTIC_PIPELINE_HOST"],
    basic_auth=(
        os.environ["ELASTIC_PIPELINE_USERNAME"],
        os.environ["ELASTIC_PIPELINE_PASSWORD"],
    ),
)

In [None]:
# Fetch one document from the works index and one from the images index by id.
# The image id is the same as the source of the first entry in `bad_image_matches`.
# NOTE(review): neither variable is referenced again in the visible cells —
# presumably fetched for ad-hoc inspection; confirm before removing.
work_doc = client.get(index="works-indexed-2022-07-04", id="df3sedyg")
image_doc = client.get(index="images-indexed-2022-07-04", id="fdgrjrwb")

In [None]:
# Hand-picked (source image id, target image id) pairs recorded as bad matches.
# Each entry is exposed as a dict with "source_id" and "target_id" keys.
bad_image_matches = [
    {"source_id": source_id, "target_id": target_id}
    for source_id, target_id in (
        ("fdgrjrwb", "v75jmdmc"),
        ("dwhuv3ph", "cg7hzgv8"),
    )
]

In [None]:
def get_similar_images(
    image_id: str,
    features: bool = True,
    colors: bool = True,
    size: int = 6,
    index: str = "images-indexed-2022-07-04",
) -> list:
    """Run a ``more_like_this`` search seeded with one indexed image.

    Args:
        image_id: ``_id`` of the image document to use as the "like" example.
        features: include ``state.inferredData.lshEncodedFeatures`` in the
            fields the query compares.
        colors: include ``state.inferredData.palette`` in the fields the
            query compares.
        size: maximum number of hits to request.
        index: index that is searched and that holds the seed document.
            Defaults to the images index used throughout this notebook.

    Returns:
        The raw ``hits.hits`` list from the search response. Callers in this
        notebook read ``_id`` and ``_score`` from each hit.

    Raises:
        ValueError: if both ``features`` and ``colors`` are false, because the
            query would then be sent with an empty ``fields`` list.
            NOTE(review): Elasticsearch is expected to reject an empty
            ``fields`` list anyway — failing here just gives a clearer error
            without a network round trip.
    """
    fields = []
    if features:
        fields.append("state.inferredData.lshEncodedFeatures")
    if colors:
        fields.append("state.inferredData.palette")
    if not fields:
        raise ValueError(
            "at least one of `features` or `colors` must be True"
        )

    response = client.search(
        index=index,
        query={
            "more_like_this": {
                "fields": fields,
                # The seed document is looked up in the same index we search.
                "like": [{"_index": index, "_id": image_id}],
                # Term-selection settings; see the Elasticsearch
                # more_like_this reference for their exact meaning.
                "min_term_freq": 1,
                "min_doc_freq": 1,
                "max_query_terms": 1000,
                "minimum_should_match": "1",
            }
        },
        size=size,
        _source=["_id"],
    )
    return response["hits"]["hits"]

In [None]:
# Run the similarity search for the source image of the first recorded bad
# match; the returned hits are shown as the cell output.
get_similar_images(bad_image_matches[0]["source_id"])

In [None]:
def get_random_image_id():
    """Return the ``_id`` of one image picked via a ``random_score`` query.

    A new seed is drawn from ``random.randint(0, 1000000)`` on every call and
    sent with a ``function_score`` / ``random_score`` query against the images
    index; the ``_id`` of the single requested hit is returned.

    Raises:
        IndexError: if the search returns no hits.
    """
    seed = random.randint(0, 1000000)
    random_score_query = {
        "function_score": {
            "random_score": {"seed": seed, "field": "_id"},
        }
    }

    response = client.search(
        index="images-indexed-2022-07-04",
        query=random_score_query,
        size=1,
        _source=["_id"],
    )

    top_hit = response["hits"]["hits"][0]
    return top_hit["_id"]

In [None]:
# Pick one random image, print its public API URL so it can be opened in a
# browser, then fetch the images the similarity query returns for it.
random_image_id = get_random_image_id()
image_api_url = f"https://api.wellcomecollection.org/catalogue/v2/images/{random_image_id}"
print(image_api_url)

similar_images = get_similar_images(image_id=random_image_id)

In [None]:
# Show the (id, score) pair of every hit returned for the random image.
[(hit["_id"], hit["_score"]) for hit in similar_images]

In [None]:
# Sample 1000 random images and, for each, record the scores of its similar
# images keyed by their position in the result list (0 = first hit).
# This issues two searches per iteration, so the cell is slow to re-run.
scores = []

for i in tqdm(range(1000)):
    random_image_id = get_random_image_id()
    similar_images = get_similar_images(random_image_id)
    scores.append(dict(enumerate(hit["_score"] for hit in similar_images)))

In [None]:
# Turn the list of per-image score dicts into a DataFrame: one row per sampled
# image, one column per result position.
# NOTE(review): this rebinds `scores` from a list to a DataFrame, so cells
# below this point see a different type than cells above it.
scores = pd.DataFrame(scores)
scores

In [None]:
# Persist the sampled scores as a JSON list of records.
# The output directory is created first because to_json does not create
# missing parent directories, so the write would fail on a fresh checkout.
os.makedirs("data", exist_ok=True)
scores.to_json("data/top-6-scores.json", orient="records")

In [None]:
# Summary statistics of the scores at each result position.
scores.describe()

In [None]:
# NOTE(review): these imports would normally live in the import cell at the
# top of the notebook; they are kept here so this cell still runs on its own.
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="darkgrid")

# One overlaid histogram (with KDE) per result position, each in its own
# colour. The list order defines which colour goes with which position.
rank_colors = ["blue", "red", "green", "purple", "orange", "black"]
for rank, color in enumerate(rank_colors):
    sns.histplot(
        data=scores,
        x=rank,
        color=color,
        label=rank,
        alpha=0.05,
        kde=True,
        binwidth=20,
    )

# All six distributions share the x-axis, so label it explicitly instead of
# leaving a label derived from a single column name.
plt.xlabel("more_like_this score")
plt.title("Score distribution by result position")
plt.legend(title="result position")
plt.show()

In [None]:
# Separate histogram per result-position column, 100 bins each.
scores.hist(bins=100)

In [None]:
import json

# Ask Elasticsearch to explain the score of the second recorded bad match:
# the target document is scored against a more_like_this query seeded with
# the source document, using both the feature and palette fields.
match = bad_image_matches[1]

explain_query = {
    "more_like_this": {
        "fields": [
            "state.inferredData.lshEncodedFeatures",
            "state.inferredData.palette",
        ],
        "like": [
            {"_index": "images-indexed-2022-07-04", "_id": match["source_id"]},
        ],
        "min_term_freq": 1,
        "min_doc_freq": 1,
        "max_query_terms": 1000,
        "minimum_should_match": "1",
    }
}

explanation = client.explain(
    index="images-indexed-2022-07-04",
    id=match["target_id"],
    query=explain_query,
    _source=["_id"],
)

# Pretty-print only the nested explanation tree from the response.
print(json.dumps(explanation['explanation'], indent=2))