In [1]:
%env AWS_PROFILE=platform-developer

env: AWS_PROFILE=platform-developer


In [2]:
from utils.aws import get_secret
import elasticsearch

import urllib3

# Silence urllib3's InsecureRequestWarning for the whole kernel session.
# NOTE(review): the client below is not created with certificate verification disabled,
# so this suppression may be unnecessary — confirm before keeping it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Elasticsearch endpoint that every search in this notebook is sent to (via ES_CLIENT).
ES_ENDPOINT = "https://semantic-playground-b28f61.es.eu-west-1.aws.elastic.cloud:443"
# The API key is fetched through get_secret rather than being hard-coded in the notebook.
ES_API_KEY = get_secret("agnes/elasticsearch/semantic-playground")
# Shared client used by the search helpers; request_timeout=120 is passed to the constructor.
ES_CLIENT = elasticsearch.Elasticsearch(ES_ENDPOINT, api_key=ES_API_KEY, request_timeout=120)

In [3]:
# Pretty print utils
import re

# ANSI SGR escape sequences used to format the printed comparison output.
BOLD = "\033[1m"  # switch bold on
RESET = "\033[0m"  # reset all attributes (used to end bold text)
RESET_COLOR = "\033[39m"  # restore the default foreground colour only


def get_work_url(work_id: str):
    """Return the wellcomecollection.org works page URL for `work_id`."""
    return "https://wellcomecollection.org/works/{}".format(work_id)


def highlight_terms(text, terms, color="\033[92m"):
    """Wrap every case-insensitive occurrence of any of `terms` in `text` with an ANSI colour.

    Args:
        text: The string to decorate.
        terms: Iterable of literal search terms (they are escaped, not treated as regexes).
            Empty terms are ignored.
        color: ANSI escape sequence emitted before each match; RESET_COLOR is emitted after it.

    Returns:
        `text` with each match wrapped in `color`/RESET_COLOR, or `text` unchanged when there
        are no non-empty terms.
    """
    # An empty term would compile to a pattern that matches at every position.
    usable_terms = [term for term in terms if term]
    if not usable_terms:
        return text

    # All terms are combined into one alternation and substituted in a single pass, so a term
    # can never match inside the escape sequences inserted for another term (for example "m"
    # or "92" inside "\033[92m"). Longest terms go first so a term wins over its own prefixes.
    alternatives = sorted(dict.fromkeys(usable_terms), key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(term) for term in alternatives), re.IGNORECASE)
    return pattern.sub(lambda m: f"{color}{m.group(0)}{RESET_COLOR}", text)


def print_bold(text: str):
    """Print `text` preceded by the BOLD escape and followed by RESET."""
    print("{}{}{}".format(BOLD, text, RESET))


In [154]:
import math


def get_production_query(query: str, *args):
    """Build the lexical bool query that this notebook calls the "production" query.

    The result is a `bool` query with five `should` clauses:
      1. "text_strict": cross_fields multi_match over the text fields.
      2. "title_prefix": match_phrase_prefix on the normalized whole-phrase title.
      3. "ids_lax": cross_fields multi_match over identifier fields.
      4. "ids_with_path_lax": cross_fields multi_match over shelfmark / collection path.
      5. "text_lax": a looser text multi_match that must also pass an identifier filter.

    Args:
        query: The user's search string.
        *args: Accepted and ignored, so the function can be called with the same positional
            arguments as the other query builders.

    Returns:
        A freshly built dict on every call.
    """
    search_text = f"{query}"

    # Fields shared by the "text_strict" and "text_lax" clauses.
    text_fields = (
        "query.title.*^5",
        "query.title.cased^10",
        "query.contributors.agent.label^10",
        "query.subjects.concepts.label^10",
        "query.genres.concepts.label^10",
        "query.production.label.*^10",
        "query.partOf.title.*^10",
        "query.alternativeTitles.*",
        "query.description.*",
        "query.edition",
        "query.languages.label",
        "query.lettering.*",
        "query.notes.contents.*",
        "query.physicalDescription.*",
    )
    id_fields = (
        "query.id^5",
        "query.sourceIdentifier.value^5",
        "query.identifiers.value",
        "query.items.id",
        "query.items.identifiers.value",
        "query.images.id",
        "query.images.identifiers.value",
        "query.referenceNumber*",
    )
    path_fields = ("query.items.shelfmark*", "query.collectionPath*")
    # The filter of the lax clause uses its own field list (different order and boosts).
    filter_fields = (
        "query.id^5",
        "query.sourceIdentifier.value^5",
        "query.referenceNumber*^5",
        "query.identifiers.value",
        "query.items.id",
        "query.items.identifiers.value",
        "query.items.shelfmark*",
        "query.images.id",
        "query.images.identifiers.value",
        "query.collectionPath*",
    )

    text_strict = {
        "multi_match": {
            "_name": "text_strict",
            "query": search_text,
            "fields": list(text_fields),
            "type": "cross_fields",
            "minimum_should_match": "3<-20%",
            "operator": "Or",
        }
    }

    title_prefix = {
        "match_phrase_prefix": {
            "query.title.normalized_whole_phrase": {
                "_name": "title_prefix",
                "query": search_text,
                "boost": 50,
            }
        }
    }

    ids_lax = {
        "multi_match": {
            "_name": "ids_lax",
            "query": search_text,
            "analyzer": "lowercase_whitespace_tokens",
            "fields": list(id_fields),
            "type": "cross_fields",
            "boost": 100,
            "operator": "OR",
            "minimum_should_match": 1,
        }
    }

    ids_with_path_lax = {
        "multi_match": {
            "_name": "ids_with_path_lax",
            "query": search_text,
            "analyzer": "lowercase_whitespace_tokens",
            "fields": list(path_fields),
            "type": "cross_fields",
            "boost": 50,
            "operator": "OR",
            "minimum_should_match": 1,
        }
    }

    text_lax = {
        "multi_match": {
            "_name": "text_lax",
            "query": search_text,
            "fields": list(text_fields),
            "type": "cross_fields",
            "tie_breaker": 0.4,
            "minimum_should_match": 1,
            "operator": "OR",
        }
    }

    id_filter = {
        "multi_match": {
            "query": search_text,
            "analyzer": "lowercase_whitespace_tokens",
            "fields": list(filter_fields),
            "type": "cross_fields",
            "operator": "OR",
            "minimum_should_match": 1,
        }
    }

    text_lax_filtered = {"bool": {"must": [text_lax], "filter": [id_filter]}}

    return {
        "bool": {
            "should": [
                text_strict,
                title_prefix,
                ids_lax,
                ids_with_path_lax,
                text_lax_filtered,
            ]
        }
    }


def get_basic_query(query: str, fields: list[str], *args) -> dict:
    """Build a bool query with one `match` clause per field.

    Args:
        query: The search string placed in every `match` clause.
        fields: Field names to match against; one `should` clause is produced per entry.
        *args: Accepted and ignored.

    Returns:
        `{"bool": {"should": [...]}}` with the clauses in the order of `fields`.
    """
    match_clauses = []
    for field_name in fields:
        match_clauses.append({"match": {field_name: {"query": query}}})

    return {"bool": {"should": match_clauses}}


def get_text_expansion_query(query: str, fields: list[str]) -> dict:
    """Build a bool query with one `text_expansion` clause per field.

    Every clause uses the model id ".elser_model_2_linux-x86_64" and `query` as the model text.

    Args:
        query: Text handed to the model as `model_text`.
        fields: Field names to query; one `should` clause is produced per entry.

    Returns:
        `{"bool": {"should": [...]}}` with the clauses in the order of `fields`.
    """
    expansion_clauses = []
    for field_name in fields:
        expansion_spec = {
            "model_id": ".elser_model_2_linux-x86_64",
            "model_text": query,
        }
        expansion_clauses.append({"text_expansion": {field_name: expansion_spec}})

    return {"bool": {"should": expansion_clauses}}


def get_full_semantic_query(query: str, fields: list[str]) -> dict:
    """Extend the production query with a "text_lax_with_semantics" clause.

    The extra clause is a `bool` query that must satisfy a lax cross_fields multi_match over
    the text fields and that additionally scores `match` clauses against each of `fields`.
    It is appended to the `should` list of the query returned by get_production_query.

    Args:
        query: The search string.
        fields: Field names for the per-field `match` clauses.

    Returns:
        The production query dict with one additional `should` clause.
    """
    combined = get_production_query(query)

    per_field_matches = []
    for field_name in fields:
        per_field_matches.append({"match": {field_name: {"query": query}}})

    lax_text_match = {
        "multi_match": {
            "_name": "text_lax",
            "query": query,
            "fields": [
                "query.title.*^5",
                "query.title.cased^10",
                "query.contributors.agent.label^10",
                "query.subjects.concepts.label^10",
                "query.genres.concepts.label^10",
                "query.production.label.*^10",
                "query.partOf.title.*^10",
                "query.alternativeTitles.*",
                "query.description.*",
                "query.edition",
                "query.languages.label",
                "query.lettering.*",
                "query.notes.contents.*",
                "query.physicalDescription.*",
            ],
            "type": "cross_fields",
            "operator": "OR",
            "minimum_should_match": 1,
        }
    }

    combined["bool"]["should"].append(
        {
            "bool": {
                "_name": "text_lax_with_semantics",
                "must": [lax_text_match],
                "should": [{"bool": {"should": per_field_matches}}],
            }
        }
    )
    return combined


def get_basic_knn_query(query: str, fields: list[str], model_id: str, *args) -> dict:
    """Build a bool query with one `knn` clause per field.

    Each clause asks for k=50 with num_candidates=500 and lets Elasticsearch build the query
    vector from `query` through a `text_embedding` builder that uses `model_id`.

    Args:
        query: Text to embed (`model_text`).
        fields: Vector field names; one `should` clause is produced per entry.
        model_id: Identifier placed in the `text_embedding` builder.
        *args: Accepted and ignored.

    Returns:
        `{"bool": {"should": [...]}}` with the clauses in the order of `fields`.
    """

    def knn_clause(field_name):
        return {
            "knn": {
                "field": field_name,
                "k": 50,
                "num_candidates": 500,
                # "boost": 1.0,
                "query_vector_builder": {
                    "text_embedding": {
                        "model_id": model_id,
                        "model_text": query,
                    }
                },
            }
        }

    return {"bool": {"should": [knn_clause(field_name) for field_name in fields]}}


def get_openai_knn_query(query: str, fields: list[str], *args):
    """Return get_basic_knn_query for the model id "openai-text_embedding-muvikv9j5f".

    Extra positional arguments are accepted and ignored.
    """
    openai_model_id = "openai-text_embedding-muvikv9j5f"
    return get_basic_knn_query(query, fields, openai_model_id)


def get_titan_knn_query(query: str, fields: list[str], *args):
    """Return get_basic_knn_query for the model id "amazon-bedrock-titan-embeddings".

    Extra positional arguments are accepted and ignored.
    """
    titan_model_id = "amazon-bedrock-titan-embeddings"
    return get_basic_knn_query(query, fields, titan_model_id)


def get_basic_sparse_vector_query(query: str, fields: list[str]) -> dict:
    """Build a bool query with one pruned `sparse_vector` clause per field.

    Args:
        query: Text placed in every clause's `query`.
        fields: Field names; one `should` clause is produced per entry.

    Returns:
        `{"bool": {"should": [...]}}` with the clauses in the order of `fields`.
    """

    def sparse_clause(field_name):
        return {
            "sparse_vector": {
                "field": field_name,
                "query": query,
                "prune": True,
                "pruning_config": {
                    "tokens_freq_ratio_threshold": 2,
                    "tokens_weight_threshold": 0.4,
                    "only_score_pruned_tokens": False,
                },
            }
        }

    # No minimum_should_match is set on the bool query; a commented-out variant was:
    #   "minimum_should_match": math.ceil(len(fields) / 2)
    return {"bool": {"should": [sparse_clause(field_name) for field_name in fields]}}


def get_rrf_query(query: str, semantic_query, min_score) -> dict:
    """Build an RRF retriever that fuses the production query with `semantic_query`.

    Args:
        query: Search string used to build the production (lexical) query.
        semantic_query: Query dict for the second `standard` retriever.
        min_score: Value set as `min_score` on the second retriever only.

    Returns:
        A dict with a single top-level "retriever" key holding an `rrf` retriever with
        rank_window_size 10000 and rank_constant 20.
    """
    lexical_retriever = {"standard": {"query": get_production_query(query)}}
    semantic_retriever = {
        "standard": {
            "query": semantic_query,
            "min_score": min_score,
        }
    }

    rrf_settings = {
        "retrievers": [lexical_retriever, semantic_retriever],
        "rank_window_size": 10000,
        "rank_constant": 20,
    }
    return {"retriever": {"rrf": rrf_settings}}


def get_rrf_query_with_min_should_match(query: str, fields: list[str], min_score) -> dict:
    """RRF query whose semantic side is get_basic_sparse_vector_query(query, fields).

    Despite its name, this function does not set any minimum_should_match itself.
    """
    return get_rrf_query(query, get_basic_sparse_vector_query(query, fields), min_score)

def get_rrf_query_open_ai(query: str, fields: list[str], min_score) -> dict:
    """RRF query whose semantic side is get_openai_knn_query(query, fields)."""
    return get_rrf_query(query, get_openai_knn_query(query, fields), min_score)


def get_rrf_query_with_multi_match(query: str, fields: list[str], min_score) -> dict:
    """RRF query whose semantic side is get_full_semantic_query(query, fields)."""
    return get_rrf_query(query, get_full_semantic_query(query, fields), min_score)

In [80]:
def in_colour(text: str, colour: str):
    """Print `text` wrapped in `colour` and RESET_COLOR, ending with a space, not a newline."""
    print("{}{}{}".format(colour, text, RESET_COLOR), end=" ")


def get_es_request_body(query: str, config: dict):
    """Build the search request body for `query` under one comparison `config`.

    The config's "get_query_function" is called with the query, the config's
    "semantic_fields" and its "semantic_min_score" (each None when absent). If the builder
    returns a dict with a top-level "retriever" key, that dict is used as the base of the body;
    otherwise the result is placed under "query". In both cases "size" (the module-level SIZE)
    and "track_total_hits": True are set, overriding same-named keys from the builder.

    Args:
        query: The search string.
        config: One of the comparison config dicts.

    Returns:
        The request body dict.
    """
    common_options = {"size": SIZE, "track_total_hits": True}

    build_query = config["get_query_function"]
    built = build_query(query, config.get("semantic_fields"), config.get("semantic_min_score"))

    if "retriever" not in built:
        common_options["query"] = built
        return common_options

    return {**built, **common_options}


def get_combined_query_results(query: str):
    """Run `query` for every config in TO_COMPARE and collect the responses by label.

    Each response is converted to a dict and augmented with two keys:
      - "ranking": maps hit `_id` to its 1-based position in the returned hits.
      - "ids": the set of hit `_id` values.

    Args:
        query: The search string.

    Returns:
        A dict mapping each config's "label" to its augmented response.
    """
    by_label = {}
    for cfg in TO_COMPARE:
        request_body = get_es_request_body(query, cfg)
        entry = dict(ES_CLIENT.search(index=cfg["index"], body=request_body))
        by_label[cfg["label"]] = entry

        hit_ids = [hit["_id"] for hit in entry["hits"]["hits"]]
        entry["ranking"] = {hit_id: position for position, hit_id in enumerate(hit_ids, start=1)}
        entry["ids"] = set(hit_ids)

    return by_label


def compare_query_results(query: str):
    """Run `query` against every config in TO_COMPARE and print a rank-by-rank comparison.

    Prints the query, the total hit count per config, and then, for each of the first
    PRINT_LIMIT rank positions, every not-yet-printed work found at that position by any
    config: its URL, its title in bold, its description when present, and the rank each
    config gave it ("-" when the config did not return it). Query terms are highlighted in
    the title and description.

    Args:
        query: The search string.
    """
    print(f"{BOLD}Query:{RESET} {query}")
    # split() without an argument drops the empty strings that split(" ") yields for leading,
    # trailing or repeated spaces; an empty term would match at every position when highlighted.
    query_terms = query.split()

    results = get_combined_query_results(query)
    print(f"{BOLD}Total results:{RESET}", end=" ")
    for config in TO_COMPARE:
        # Values are pulled into locals so the f-string does not reuse its own quote
        # character, which is only valid syntax from Python 3.12 onwards.
        label = config["label"]
        total = results[label]["hits"]["total"]["value"]
        in_colour(f"{label} {total}", colour=config["colour"])

    print("\n")

    # Works already printed at an earlier position (or by an earlier config) are skipped.
    seen = set()
    for i in range(PRINT_LIMIT):
        print(f"{BOLD}————— {i + 1} —————{RESET}\n")
        for config in TO_COMPARE:
            hits = results[config["label"]]["hits"]["hits"]
            if len(hits) <= i:
                continue

            hit = hits[i]
            work_id = hit["_id"]
            if work_id in seen:
                continue
            seen.add(work_id)

            work_display = hit["_source"]["display"]
            print(get_work_url(work_id))
            print_bold(highlight_terms(work_display["title"], query_terms))
            if "description" in work_display:
                print(highlight_terms(work_display["description"], query_terms))

            for other_config in TO_COMPARE:
                other_label = other_config["label"]
                rank = results[other_label]["ranking"].get(work_id, "-")
                in_colour(f"{other_label} {rank}", colour=other_config["colour"])

            print("\n")


def find_needle_in_haystack(query: str, work_id: str):
    """Print, for every config in TO_COMPARE, the rank at which `work_id` is returned for `query`.

    The rank is the 1-based position of the first hit whose `_id` equals `work_id`, or "-" when
    the work is not among the returned hits. Each "<label> <rank>" pair is printed in the
    config's colour on one line.

    Args:
        query: The search string.
        work_id: The `_id` of the work to look for.
    """
    for cfg in TO_COMPARE:
        request_body = get_es_request_body(query, cfg)
        response = dict(ES_CLIENT.search(index=cfg["index"], body=request_body))

        hit_ids = [hit["_id"] for hit in response["hits"]["hits"]]
        position = hit_ids.index(work_id) + 1 if work_id in hit_ids else "-"

        in_colour(f"{cfg['label']} {position}", colour=cfg["colour"])

    print("\n")

In [180]:
# 100K sample tests
# Each config dict below describes one search variant to compare:
#   label              - name shown in the printed comparison and used as the key of the results
#   index              - Elasticsearch index the request is sent to
#   get_query_function - builder called as f(query, semantic_fields, semantic_min_score)
#   colour             - ANSI escape printed before this variant's label/rank
#   semantic_fields    - field names handed to the query builder
# Uses get_basic_query, i.e. plain `match` clauses, against the *Semantic fields.
ELSER_100K = {
    "label": "ELSER",
    "index": "works-elser-title-description",
    "get_query_function": get_basic_query,
    "colour": "\033[34m",
    "semantic_fields": ["query.titleSemantic", "query.descriptionSemantic"]
}

TITAN_100K = {
    "label": "Titan",
    "index": "works-titan-title-description",
    "get_query_function": get_titan_knn_query,
    "colour": "\033[91m",
    "semantic_fields": ["query.titleSemantic", "query.descriptionSemantic"]
}

OPEN_AI_SEMANTIC_100K = {
    "label": "OpenAI",
    "index": "works-openai-title-description",
    "get_query_function": get_openai_knn_query,
    "colour": "\033[93m",
    "semantic_fields": ["query.titleSemantic", "query.descriptionSemantic"]
}

# Control: plain `match` clauses on query.title / query.description.
# Note that it is sent to the same index as TITAN_100K.
NON_SEMANTIC_100K = {
    "label": "Control",
    "index": "works-titan-title-description",
    "get_query_function": get_basic_query,
    "colour": "\033[95m",
    "semantic_fields": ["query.title", "query.description"]
}

# Module-level settings read at call time by the helpers: TO_COMPARE by the search/print
# functions, SIZE by get_es_request_body, PRINT_LIMIT by compare_query_results.
# A later cell re-assigns all three, so results depend on which cell ran last.
TO_COMPARE = [ELSER_100K, TITAN_100K, OPEN_AI_SEMANTIC_100K, NON_SEMANTIC_100K]
SIZE = 1000
PRINT_LIMIT = 10

# https://www.elastic.co/search-labs/blog/semantic-search-match-knn-sparse-vector
# "token pruning is about pruning irrelevant tokens to improve performance, not drastically change recall or relevance"
# find_needle_in_haystack("flower magazine", "c2jj7zfd")
# Vector search is meant to improve recall. Lexical search will help with precision.

# Each QUERY assignment below overwrites the previous one, so only the last uncommented
# assignment is the query that actually gets run.
QUERY = "ancient manuscript on astronomy"
QUERY = "czech republic capital"
# QUERY = "surgery knife"

# Testing for problematic connections
QUERY = "photos of savages"
QUERY = "backward cultures"
QUERY = "photos of inferior race"

#find_needle_in_haystack(QUERY, "a24brmcv")
# Prints the comparison for QUERY across every config currently in TO_COMPARE.
compare_query_results(QUERY)

# 1) Both models improve recall and ranking
# 2) ELSER matches too many documents (low precision). Might not be a big issue.
# 3) Titan tends to make problematic connections

[1mQuery:[0m photos of inferior race
[1mTotal results:[0m [34mELSER 61682[39m [91mTitan 556[39m [93mOpenAI 561[39m [95mControl 52195[39m 

[1m————— 1 —————[0m

https://wellcomecollection.org/works/j5v38qn9
[1mPapers by Dicks on [92mrace[39m issues[0m
<p>Comprises:
</p><li>PP/HVD/E/2/1: “Psychological factors on prejudice”, draft [92mof[39m a paper published in [92mRace[39m Relations, 1959.</li>
<li>PP/HVD/E/2/2: Outline [92mof[39m lecture given at the London Hospital, June 1963.</li>
<li>PP/HVD/E/2/3: “[92mRace[39m prejudice”, notes for a lecture given in Birmingham, c.1960s.</li>
<li>PP/HVD/E/2/4: “Psychology [92mof[39m [92mrace[39m prejudice”, c.1960s.</li>
<li>PP/HVD/E/2/5: “Thoughts on the relation between psycho-analysis and social science”, paper given at Sussex University, c.1960s.</li><p></p>
[34mELSER 1[39m [91mTitan -[39m [93mOpenAI -[39m [95mControl 3[39m 

https://wellcomecollection.org/works/ybeeu89x
[1mNegatives depicting personnel a

In [189]:
# 1M full dataset tests using hybrid search (reciprocal rank fusion)
# "semantic_min_score" is forwarded by get_es_request_body as the builder's third argument;
# the RRF builders hand it to get_rrf_query, which sets it as `min_score` on the semantic
# retriever only.
ELSER_1M_TITLE_DESCRIPTION = {
    "label": "ELSER",
    "index": "works-elser-full",
    "get_query_function": get_rrf_query_with_min_should_match,
    "colour": "\033[34m",
    "semantic_fields": ["query.titleSemantic", "query.descriptionSemantic"],
    "semantic_min_score": 15
}

OPENAI_1M_FULL = {
    "label": "OpenAI",
    "index": "works-openai-full",
    "get_query_function": get_rrf_query_open_ai,
    "colour": "\033[36m",
    "semantic_fields": ["query.titleSemantic", "query.descriptionSemantic"],
    "semantic_min_score": 0
}

# Same index and builder as ELSER_1M_TITLE_DESCRIPTION, but with more semantic fields and a
# higher min score.
# NOTE(review): shares the colour \033[36m with OPENAI_1M_FULL, so the two variants can only be
# told apart by their label in the printed output — consider giving one a distinct colour.
ELSER_1M_FULL = {
    "label": "ELSER (full)",
    "index": "works-elser-full",
    "get_query_function": get_rrf_query_with_min_should_match,
    "colour": "\033[36m",
    "semantic_fields": ["query.titleSemantic", "query.descriptionSemantic", "query.alternativeTitlesSemantic",
                        "query.contributorsSemantic", "query.genresSemantic", "query.subjectsSemantic",
                        "query.notesSemantic"],
    "semantic_min_score": 30
}

# No "semantic_fields" / "semantic_min_score": config.get() yields None for both, and
# get_production_query ignores its extra positional arguments.
NON_SEMANTIC_1M = {
    "label": "Prod",
    "index": "works-elser-full",
    "get_query_function": get_production_query,
    "colour": "\033[95m"
}

# Re-binds the module-level TO_COMPARE / SIZE / PRINT_LIMIT that the helpers read at call time;
# every search after this cell runs uses these values.
TO_COMPARE = [ELSER_1M_TITLE_DESCRIPTION, ELSER_1M_FULL, OPENAI_1M_FULL, NON_SEMANTIC_1M]
SIZE = 10000
PRINT_LIMIT = 10

# Each QUERY assignment overwrites the previous one; only the last ("large bird") is run.
QUERY = "ancient manuscript on astronomy"
QUERY = "cardiac failure"
QUERY = "smart large black bird"
#QUERY = "lung neoplasm"

QUERY = "fox species with large ears"

QUERY = "large bird"

#find_needle_in_haystack(QUERY, "a24brmcv")
compare_query_results(QUERY)


[1mQuery:[0m large bird
[1mTotal results:[0m [34mELSER 243[39m [36mELSER (full) 185[39m [36mOpenAI 664[39m [95mProd 151[39m 

[1m————— 1 —————[0m

https://wellcomecollection.org/works/t3aembuu
[1mA huntsman and a setter searching for a game-[92mbird[39m that is hiding under [92mlarge[39m leaves in a garden. Etching by J. Scott after P. Reinagle.[0m
<p>The huntsmen is peering over [92mlarge[39m-leafed plants while the setter is standing right next to its prey, the [92mbird[39m</p>
[34mELSER 1[39m [36mELSER (full) 30[39m [36mOpenAI 39[39m [95mProd 16[39m 

https://wellcomecollection.org/works/kv9q66rn
[1mA [92mlarge[39m tree with a small [92mbird[39m flying towards it[0m
Hospital ward, creator's name and date of creation are inscribed on the reverse, probably by Edward Adamson.
[34mELSER 6[39m [36mELSER (full) 1[39m [36mOpenAI 2[39m [95mProd 1[39m 

https://wellcomecollection.org/works/dqtfg953
[1mA cassowary [92mbird[39m standing on a roc

In [196]:
# Scratch list of queries with notes on how each behaved. Every uncommented assignment
# overwrites the previous one, so QUERY ends up as the last one in the cell; nothing in this
# cell runs a search.
# QUERY = "HIV"
# QUERY = "violent criminal"
#QUERY = "cardiac failure"
#QUERY = "lung neoplasm inflammation"
#QUERY = "lung neoplasm"
# QUERY = "edo japan"
QUERY = "tanuki"  # Expected 'raccoon dog' results, not good
QUERY = "animal anatomy treatise"
QUERY = "the blitz"  # Not good
QUERY = "how to make meth"  # Relevance score too low
QUERY = "Black Death"  # case-sensitive, second ELSER result is bad
#QUERY = "black death"
# QUERY = "consumption"
# QUERY = "tuberculosis"
# QUERY = "surgery knife" # h79mx8vz (iridectomy knives), more relevant results, better ordering of existing results

QUERY = "large bird"  # A large tree with a small bird flying towards it (prod), non-semantic struggles with adjectives in between
# QUERY = "large black bird smart"
# QUERY = "child doctor"
# QUERY = "Thames city"
# QUERY = "Lady with the Lamp"
QUERY = "czech republic capital"  # Good
QUERY = "cardiac failure"
QUERY = "nutritious plant-based meals"

In [54]:
# https://wellcomecollection.org/works/s3e28zhn
find_needle_in_haystack("lung neoplasm", "s3e28zhn")

[34mELSER 183[39m [34mELSER (full) 186[39m [95mProd 92[39m 

In [175]:
# https://wellcomecollection.org/works/e37qcyfm
find_needle_in_haystack("cardiac failure", "e37qcyfm")

[34mELSER 71[39m [36mELSER (full) -[39m [36mOpenAI 157[39m [95mProd -[39m 



In [190]:
# https://wellcomecollection.org/works/b5kqccbb
find_needle_in_haystack("animal anatomy treatise", "b5kqccbb")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 72[39m [95mProd -[39m 



In [176]:
# https://wellcomecollection.org/works/a227y9ye
find_needle_in_haystack("the blitz", "a227y9ye")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 10[39m [95mProd -[39m 



In [178]:
# https://wellcomecollection.org/works/a24brmcv
find_needle_in_haystack("how to make meth", "a24brmcv")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 2[39m [95mProd -[39m 



In [162]:
# https://wellcomecollection.org/works/ynub7cjf
find_needle_in_haystack("fish brain", "ynub7cjf")

[34mELSER 218[39m [36mELSER (full) 469[39m [36mOpenAI 275[39m [95mProd -[39m 



In [161]:
# https://wellcomecollection.org/works/xkm6ubyq
find_needle_in_haystack("pig mum with babies", "xkm6ubyq")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 82[39m [95mProd -[39m 



In [158]:
# https://wellcomecollection.org/works/jdefejup
find_needle_in_haystack("brain pieces", "jdefejup")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI -[39m [95mProd -[39m 



In [163]:
# https://wellcomecollection.org/works/ag3zz4dx/images?id=a2yxbhw5&resultPosition=16
find_needle_in_haystack("bear eating seal", "ag3zz4dx")

[34mELSER 18[39m [36mELSER (full) -[39m [36mOpenAI 61[39m [95mProd -[39m 



In [165]:
# https://wellcomecollection.org/works/njacsf2g
find_needle_in_haystack("large flightless bird", "njacsf2g")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 81[39m [95mProd -[39m 



In [166]:
# https://wellcomecollection.org/works/d8qqspwv/images?id=wjzph6wv
find_needle_in_haystack("fox species with large ears", "d8qqspwv")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 80[39m [95mProd -[39m 



In [194]:
# https://wellcomecollection.org/works/rt7bk7dt/images?id=a57y2s4z&resultPosition=29
find_needle_in_haystack("dog eye", "rt7bk7dt")

[34mELSER 26[39m [36mELSER (full) 63[39m [36mOpenAI 4[39m [95mProd -[39m 



In [168]:
# https://wellcomecollection.org/works/nfgzazqm/images?id=wzr92r6d
find_needle_in_haystack("mouse with long nose", "nfgzazqm")

[34mELSER 143[39m [36mELSER (full) -[39m [36mOpenAI 13[39m [95mProd -[39m 



In [169]:
# https://wellcomecollection.org/works/dpte8snu/items
find_needle_in_haystack("ant baby", "dpte8snu")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 121[39m [95mProd -[39m 



In [171]:
# https://wellcomecollection.org/works/jvbc3r5f/images?id=ab5ywfpy
find_needle_in_haystack("man riding a pig", "jvbc3r5f")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 46[39m [95mProd -[39m 



In [172]:
# https://wellcomecollection.org/works/jvbc3r5f/images?id=ab5ywfpy
find_needle_in_haystack("butcher riding a pig", "jvbc3r5f")

[34mELSER 6[39m [36mELSER (full) 4[39m [36mOpenAI 1[39m [95mProd -[39m 



In [192]:
# https://wellcomecollection.org/works/jvbc3r5f
# Same target work as the "butcher riding a pig" query, phrased in French
# (roughly "butcher on a pig").
find_needle_in_haystack("boucher sur un cochon", "jvbc3r5f")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 15[39m [95mProd -[39m 



In [193]:
# https://wellcomecollection.org/works/jvbc3r5f
# Same target work as the "butcher riding a pig" query, phrased in Czech
# (roughly "butcher on a pig").
find_needle_in_haystack("řezník na praseti", "jvbc3r5f")

[34mELSER -[39m [36mELSER (full) -[39m [36mOpenAI 4[39m [95mProd -[39m 



In [None]:
# Target work title (French): "Traité des maladies du coeur", i.e. "Treatise on diseases of the heart".
# https://wellcomecollection.org/works/a239wxjg
# NOTE(review): the query says "treaty", a literal rendering of "traité" — presumably deliberate; confirm.
find_needle_in_haystack("treaty heart diseases", "a239wxjg")