In [1]:
import json
from elasticsearch import Elasticsearch, helpers


In [2]:
# The documents contain non-ASCII text, so pin the encoding instead of
# relying on the platform default (which is not UTF-8 everywhere).
with open("docs_with_q_4o-mini.json", "rt", encoding="utf-8") as f_in:
    docs_4o_mini = json.load(f_in)

In [3]:
# Pin the encoding so loading does not depend on the platform default
# (which is not UTF-8 everywhere).
with open("docs_with_q_lama.json", "rt", encoding="utf-8") as f_in:
    docs_llama = json.load(f_in)

In [4]:
from tqdm.auto import tqdm

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
# Load the two sentence-transformers checkpoints whose retrieval quality is
# compared in the evaluation cells further down.
model_mini_lm = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
model_distilbert = SentenceTransformer("multi-qa-distilbert-cos-v1")



In [7]:
# Embedding width of each model, used as the `dims` of the dense_vector fields.
# Ask the model for its output dimension directly rather than running an
# encode pass on a dummy string just to measure the result.
lenv_mini = model_mini_lm.get_sentence_embedding_dimension()
lenv_distilbert = model_distilbert.get_sentence_embedding_dimension()

In [8]:
# Client for the local Elasticsearch node; the index-management, bulk and
# search calls in this notebook all go through it.
es_client = Elasticsearch("http://localhost:9200")

In [9]:
# Inspect the first record to see which fields the documents carry.
docs_4o_mini[0]

{'source': 'https://www.reddit.com/r/germany/wiki/autobahn_safety',
 'content': 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autobahn#/media/File%3AAutobahnen_in_Deutschland.svg) with a total length of more than 8,000 miles. [65%](https://en.wikipedia.org/wiki/Autobahn#Speed_limits) of the Autobahn has no speed limit. How safe can that be?\nVehicles traveled 147 billion miles on the Autobahn in 2015. 322 people died = 2.19 deaths per billion miles.\nIn the US, vehicles travelled 757 billion miles on interstate highways. 3,837 people died = 5.07 deaths per billion miles.\nThat means: If you drive on the interstate, your likelihood to die is 131% higher than for the same distance on the Autobahn.\n*sources:*\nStatistisches Bundesamt: [Unfallentwicklung auf deutschen Straßen 2015](https://www.destatis.de/DE/PresseService/Presse/Pressekonferenzen/2016/Unfallentwicklung_2015/Pressebroschuere_unfallentwicklung.pdf?__blob=publicationFile)\nNat

In [10]:
def index_sett(v):
    """Build the index definition (settings + mappings) for one embedding size.

    Args:
        v: Number of dimensions used for every dense_vector field.

    Returns:
        A dict with a single-shard, zero-replica ``settings`` section and a
        ``mappings`` section describing the text/keyword/integer fields plus
        three indexed cosine-similarity vector fields of ``v`` dimensions.
    """

    def cosine_vector_field():
        # Return a fresh dict on every call so the three vector fields never
        # share one mutable mapping object.
        return {
            "type": "dense_vector",
            "dims": v,
            "index": True,
            "similarity": "cosine",
        }

    properties = {
        "source": {"type": "text"},
        "content": {"type": "text"},
        "headline": {"type": "text"},
        "id": {"type": "keyword"},
        "length": {"type": "integer"},
        "question": {"type": "text"},
        "question_vector": cosine_vector_field(),
        "content_vector": cosine_vector_field(),
        "question_content_vector": cosine_vector_field(),
    }

    return {
        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
        "mappings": {"properties": properties},
    }

In [11]:
from itertools import product



In [12]:
# Experiment grid: every encoder is mapped to its embedding width and every
# dataset name to its list of documents.
emb = dict(
    model={
        model_mini_lm: lenv_mini,
        model_distilbert: lenv_distilbert,
    },
    dataset={
        "docs_4o_mini": docs_4o_mini,
        "docs_llama": docs_llama,
    },
)

In [13]:
# Scratch cell: `datasets` is rebound to a list of (name, docs) pairs in a
# later cell, and the bare string below only echoes an example index name.
datasets = emb["dataset"]
"384_docs_4o_mini"

'384_docs_4o_mini'

In [14]:
# List the dataset names available in the experiment grid.
emb["dataset"].keys()

dict_keys(['docs_4o_mini', 'docs_llama'])

In [15]:
# Materialise the (model, dims) and (name, docs) pairs as lists so the same
# grid can be iterated repeatedly by the cells below.
models = list(emb["model"].items())
datasets = list(emb["dataset"].items())

# Show every model/dataset combination that will be indexed and evaluated.
for model_pair, dataset_pair in product(models, datasets):
    model_key, model_value = model_pair
    dataset_key, dataset_value = dataset_pair
    print(f"Model Key: {model_key}, Model Value: {model_value}, Dataset: {dataset_key}")



Model Key: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), Model Value: 384, Dataset: docs_4o_mini
Model Key: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), Model Value: 384, Dataset: docs_llama
Model Key: Sentenc

In [16]:
# (Re)create one index per (embedding size, dataset) pair, named "<dims>_<dataset>".
for (model_key, model_value), (dataset_key, dataset_value) in product(models, datasets):
    index_name = f"{model_value}_{dataset_key}"
    print(index_name, model_value)
    settings = index_sett(model_value)
    # Drop any earlier copy first; ignore_unavailable keeps the delete from
    # failing when the index does not exist yet.
    es_client.indices.delete(index=index_name, ignore_unavailable=True)
    es_client.indices.create(index=index_name, body=settings)

384_docs_4o_mini 384
384_docs_llama 384
768_docs_4o_mini 768
768_docs_llama 768


In [17]:
from tqdm.auto import tqdm

In [18]:
# Look at the first record again before the embedding fields are added to it.
docs_4o_mini[0]

{'source': 'https://www.reddit.com/r/germany/wiki/autobahn_safety',
 'content': 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autobahn#/media/File%3AAutobahnen_in_Deutschland.svg) with a total length of more than 8,000 miles. [65%](https://en.wikipedia.org/wiki/Autobahn#Speed_limits) of the Autobahn has no speed limit. How safe can that be?\nVehicles traveled 147 billion miles on the Autobahn in 2015. 322 people died = 2.19 deaths per billion miles.\nIn the US, vehicles travelled 757 billion miles on interstate highways. 3,837 people died = 5.07 deaths per billion miles.\nThat means: If you drive on the interstate, your likelihood to die is 131% higher than for the same distance on the Autobahn.\n*sources:*\nStatistisches Bundesamt: [Unfallentwicklung auf deutschen Straßen 2015](https://www.destatis.de/DE/PresseService/Presse/Pressekonferenzen/2016/Unfallentwicklung_2015/Pressebroschuere_unfallentwicklung.pdf?__blob=publicationFile)\nNat

In [19]:
def import_doc(model_key, dataset):
    """Attach three embedding fields to every document of ``dataset`` in place.

    For each document the question, the content, and the question followed by
    a space and the content are encoded with ``model_key.encode`` and stored
    under ``question_vector``, ``content_vector`` and
    ``question_content_vector``. Existing values under those keys are
    overwritten.

    Args:
        model_key: Object exposing ``encode(text)`` (the embedding model).
        dataset: Iterable of dicts that each have ``question`` and ``content``.

    Returns:
        None; the documents are mutated.
    """
    for doc in tqdm(dataset):
        question, content = doc["question"], doc["content"]
        texts_by_field = {
            "question_vector": question,
            "content_vector": content,
            "question_content_vector": question + ' ' + content,
        }
        # Dict order is insertion order, so the encode calls happen in the
        # sequence question, content, question+content.
        for field, text in texts_by_field.items():
            doc[field] = model_key.encode(text)

In [20]:
# Spot-check the content text of a single record from the llama dataset.
print(docs_llama[395]["content"])

[See also this thread for more information on "du" and "Sie"](https://www.reddit.com/r/AskAGerman/comments/rsb0xu/is_there_a_moment_when_a_new_friendwork_colleague/).


In [21]:
# Leftover loop variable from the preceding grid loops; echoes the last
# dataset name that was iterated.
dataset_key

'docs_llama'

In [22]:
def generate_actions(index_name, data):
    """Lazily yield one bulk-index action per record.

    Args:
        index_name: Target index written into every action's ``_index``.
        data: Iterable of documents; each becomes an action's ``_source``.

    Yields:
        Dicts with ``_index``, ``_id`` (the record's zero-based position in
        ``data``) and ``_source``.
    """
    yield from (
        {"_index": index_name, "_id": position, "_source": record}
        for position, record in enumerate(data)
    )


In [23]:
from tqdm.auto import tqdm

In [24]:
# Add MiniLM embeddings to the 4o-mini documents in place, then bulk-index
# them into the matching "384_docs_4o_mini" index.
import_doc(model_mini_lm, docs_4o_mini)
actions = generate_actions(index_name="384_docs_4o_mini", data=docs_4o_mini)
helpers.bulk(es_client, actions)


  0%|          | 0/661 [00:00<?, ?it/s]

(661, [])

In [25]:
# Add MiniLM embeddings to the llama documents in place, then bulk-index
# them into the "384_docs_llama" index.
import_doc(model_mini_lm, docs_llama)
actions = generate_actions(index_name="384_docs_llama", data=docs_llama)
helpers.bulk(es_client, actions)


  0%|          | 0/661 [00:00<?, ?it/s]

(661, [])

In [26]:
# Re-embed the same 4o-mini documents with DistilBERT (import_doc overwrites
# the vector fields set by the MiniLM run), then index into "768_docs_4o_mini".
import_doc(model_distilbert, docs_4o_mini)
actions = generate_actions(index_name="768_docs_4o_mini", data=docs_4o_mini)
helpers.bulk(es_client, actions)

  0%|          | 0/661 [00:00<?, ?it/s]

(661, [])

In [27]:
# Re-embed the llama documents with DistilBERT (import_doc overwrites the
# vector fields set by the MiniLM run), then index into "768_docs_llama".
import_doc(model_distilbert, docs_llama)
actions = generate_actions(index_name="768_docs_llama", data=docs_llama)
helpers.bulk(es_client, actions)

  0%|          | 0/661 [00:00<?, ?it/s]

(661, [])

In [28]:
import pandas as pd

In [29]:
# Ground-truth questions as a list of dicts. Note that each record's "content"
# column holds the expected document id: evaluate() compares it against the
# 'id' of every search result.
df_ground_truth = pd.read_csv("gp4o-mini-questions.csv")
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth[0]

{'question': 'What is the total length of the Autobahn network in Germany?',
 'headline': 'How safe is the Autobahn?',
 'content': '9d8370cf-a2c8-4c54-9f9c-476b9c09a933'}

In [30]:
def elastic_search_knn(field, vector, index_name, k=5, num_candidates=10000):
    """Run a kNN search on one vector field and return the matching documents.

    Args:
        field: Name of the dense_vector field to search.
        vector: Query embedding; passed through as the ``query_vector``.
        index_name: Index to search.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Value sent as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them. Only the fields listed in ``_source``
        below are requested, so the stored vectors are not sent back.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        "_source": ["content", "headline", "question", "source", "length", "id"],
    }

    # Uses the module-level `es_client` created earlier in the notebook.
    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [31]:


def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the relevant document.

    Args:
        relevance_total: One list of booleans per query; each entry says
            whether the result at that rank is the expected document.

    Returns:
        The share of queries with at least one ``True`` entry, as a float.
        Returns 0.0 when ``relevance_total`` is empty instead of raising
        ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)

    return hits / len(relevance_total)


In [32]:
def mrr(relevance_total):
    """Mean reciprocal rank over all queries.

    For each query only the FIRST relevant result counts: it contributes
    ``1 / rank`` (ranks start at 1). A query with no relevant result
    contributes 0. Stopping at the first hit keeps every per-query score
    within [0, 1] even if several results are flagged relevant.

    Args:
        relevance_total: One list of booleans per query; each entry says
            whether the result at that rank is the expected document.

    Returns:
        The mean of the per-query reciprocal ranks, as a float. Returns 0.0
        when ``relevance_total`` is empty instead of raising
        ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Reciprocal rank is defined by the first relevant hit only.
                break

    return total_score / len(relevance_total)


In [33]:
def evaluate(ground_truth, search_function, model, index_name):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records; each record's ``"content"`` value
            is treated as the id of the expected document.
        search_function: Callable ``(record, model, index_name)`` returning a
            list of result dicts that each have an ``'id'`` key.
        model: Passed through to ``search_function``.
        index_name: Passed through to ``search_function``.

    Returns:
        Dict with the ``hit_rate`` and ``mrr`` computed over all records.
    """

    def relevance_flags(query):
        # One boolean per returned result: does it match the expected id?
        expected_id = query["content"]
        hits = search_function(query, model, index_name)
        return [hit['id'] == expected_id for hit in hits]

    relevance_total = [relevance_flags(query) for query in tqdm(ground_truth)]

    hit_rate_score = hit_rate(relevance_total)
    mrr_score = mrr(relevance_total)

    return {
        'hit_rate': hit_rate_score,
        'mrr': mrr_score,
    }

In [34]:
def question_vector_knn(q, model, index_name):
    """Search the ``question_vector`` field with the embedded query question.

    Args:
        q: Record whose ``'question'`` text is encoded as the query.
        model: Object exposing ``encode(text)``.
        index_name: Index to search.

    Returns:
        Whatever ``elastic_search_knn`` returns for that field and vector.
    """
    query_embedding = model.encode(q['question'])
    return elastic_search_knn('question_vector', query_embedding, index_name)

In [35]:
# Evaluate retrieval on the question embeddings for every (model, dataset) index.
for (model_key, model_value), (dataset_key, dataset_value) in product(models, datasets):
    index_name = f"{model_value}_{dataset_key}"
    print(index_name, model_value)
    metrics = evaluate(ground_truth, question_vector_knn, model_key, index_name)
    print(metrics, index_name)
    

384_docs_4o_mini 384


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.822087745839637, 'mrr': 0.6860010085728689} 384_docs_4o_mini
384_docs_llama 384


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.822087745839637, 'mrr': 0.6860010085728689} 384_docs_llama
768_docs_4o_mini 768


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.8311649016641453, 'mrr': 0.6961018658598076} 768_docs_4o_mini
768_docs_llama 768


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.8311649016641453, 'mrr': 0.6961018658598076} 768_docs_llama


In [38]:
def content_vector_knn(q, model, index_name):
    """Search the ``content_vector`` field with the embedded query question.

    Args:
        q: Record whose ``'question'`` text is encoded as the query.
        model: Object exposing ``encode(text)``.
        index_name: Index to search.

    Returns:
        Whatever ``elastic_search_knn`` returns for that field and vector.
    """
    query_embedding = model.encode(q['question'])
    return elastic_search_knn("content_vector", query_embedding, index_name)

In [39]:
# Evaluate retrieval on the content embeddings for every (model, dataset) index.
for (model_key, model_value), (dataset_key, dataset_value) in product(models, datasets):
    index_name = f"{model_value}_{dataset_key}"
    metrics = evaluate(ground_truth, content_vector_knn, model_key, index_name)
    print(metrics, index_name)
    

  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.8717095310136157, 'mrr': 0.717297024710035} 384_docs_4o_mini


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.8720121028744326, 'mrr': 0.7173726676752392} 384_docs_llama


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.8925869894099848, 'mrr': 0.7341805345436204} 768_docs_4o_mini


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.8925869894099848, 'mrr': 0.7341805345436204} 768_docs_llama


In [40]:
def question_text_vector_knn(q, model, index_name):
    """Search the ``question_content_vector`` field with the embedded query question.

    Args:
        q: Record whose ``'question'`` text is encoded as the query.
        model: Object exposing ``encode(text)``.
        index_name: Index to search.

    Returns:
        Whatever ``elastic_search_knn`` returns for that field and vector.
    """
    query_embedding = model.encode(q['question'])
    return elastic_search_knn("question_content_vector", query_embedding, index_name)



In [41]:
# Evaluate retrieval on the combined question+content embeddings for every
# (model, dataset) index.
for (model_key, model_value), (dataset_key, dataset_value) in product(models, datasets):
    index_name = f"{model_value}_{dataset_key}"
    metrics = evaluate(ground_truth, question_text_vector_knn, model_key, index_name)
    print(metrics, index_name)

  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.9128593040847202, 'mrr': 0.7873424104891578} 384_docs_4o_mini


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.9131618759455371, 'mrr': 0.787418053454362} 384_docs_llama


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.9231467473524962, 'mrr': 0.7947503782148263} 768_docs_4o_mini


  0%|          | 0/3305 [00:00<?, ?it/s]

{'hit_rate': 0.9228441754916793, 'mrr': 0.7948562783661121} 768_docs_llama
