In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
PATH_TEST_COLLECTION_DATA="drive/MyDrive/Uni/Master/Masterthesis/Data/test_collection/"
EXPERIMENTS_DATA="drive/MyDrive/Uni/Master/Masterthesis/Experiments/Runs/Data/"
PATH_LEXICAL_INDEX="drive/MyDrive/Uni/Master/Masterthesis/Experiments/Indexing/lexical_index/"

In [3]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


#Load and process data

## Load query data

In [4]:
import json

# importieren der Pooling results
search_queries_file = PATH_TEST_COLLECTION_DATA + "suchanfragen_testkollektion_final.json"

# JSON direkt einlesen
with open(search_queries_file, "r", encoding="utf-8") as f:
    search_queries = json.load(f)

search_queries[4]

{'id': 'c6',
 'cluster': 'cluster_6',
 'topic': 'Digitales Marketing und Markenkommunikation',
 'single': [{'id': 'c6_1_single',
   'keyword': 'Marketing',
   'variations': [{'id': 'c6_1_single_keyword',
     'query': 'Marketing',
     'type': 'single_keyword'},
    {'id': 'c6_1_single_freetext',
     'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Marketing',
     'type': 'single_free'},
    {'id': 'c6_1_single_keyword_synonym',
     'query': 'Absatzförderung',
     'type': 'single_keyword_synonym'},
    {'id': 'c6_1_single_freetext_synonym',
     'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Absatzförderung',
     'type': 'single_free_synonym'}]},
  {'id': 'c6_2_single',
   'keyword': 'Brand Management',
   'variations': [{'id': 'c6_2_single_keyword',
     'query': 'Brand Management',
     'type': 'single_keyword'},
    {'id': 'c6_2_single_freetext',
     'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Brand Management',
     'type': 'singl

## Process search queries

In [5]:
def process_query(search_queries_data: list) -> dict:
  """
    This function extracts the different types of search queries
    and put them into a combined list.

    Parameters:
    search_queries_data (list): Raw search queries data

    Returns:
    list[dict]: combines search queries
  """

  queries_data = []

  for search_query in search_queries_data:

    # get single objects
    if search_query["single"]:
        for single_query in search_query["single"]:
          for variation in single_query["variations"]:
            single_query_id = variation["id"]
            single_query_text = variation["query"]
            single_query_type = variation["type"]

            query_object = {
                "id": single_query_id,
                "query": single_query_text,
                "type": single_query_type
            }

            queries_data.append(query_object)

    # get combination objects
    if search_query["combination"]:
        for combination_query in search_query["combination"]:
          for variation in combination_query["variations"]:
            combination_query_id = variation["id"]
            combination_query_text = variation["query"]
            combination_query_type =  variation["type"]

            query_object = {
                "id": combination_query_id,
                "query": combination_query_text,
                "type": combination_query_type
            }

            queries_data.append(query_object)

  return queries_data



In [6]:
combined_search_queries = process_query(search_queries)
print(len(combined_search_queries))
combined_search_queries

336


[{'id': 'c0_1_single_keyword',
  'query': 'IT-Systemadministration',
  'type': 'single_keyword'},
 {'id': 'c0_1_single_freetext',
  'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich IT-Systemadministration',
  'type': 'single_free'},
 {'id': 'c0_1_single_keyword_synonym',
  'query': 'Serveradministration',
  'type': 'single_keyword_synonym'},
 {'id': 'c0_1_single_freetext_synonym',
  'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Serveradministration',
  'type': 'single_free_synonym'},
 {'id': 'c0_2_single_keyword',
  'query': 'Information Technology',
  'type': 'single_keyword'},
 {'id': 'c0_2_single_freetext',
  'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Information Technology',
  'type': 'single_free'},
 {'id': 'c0_2_single_keyword_synonym',
  'query': 'Computing',
  'type': 'single_keyword_synonym'},
 {'id': 'c0_2_single_freetext_synonym',
  'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Computing',
  'type': 'single_free

## Load document data

In [7]:
import json

# importieren der Pooling results
pooling_file = PATH_TEST_COLLECTION_DATA + "pooling_results_final.json"

# JSON direkt einlesen
with open(pooling_file, "r", encoding="utf-8") as f:
    pooling_results = json.load(f)

pooling_results[0]

{'doc_id': '67251b202f496742be0ea207',
 'doc_raw': '{"branches": [], "companyLocationCity": "Wuppertal", "companyLocationStreet": "Schwelmer Straße 245", "companyLocationZip": 42389.0, "companyName": "Salto Systems GmbH", "companyTypes": [], "companyWebsite": "saltosystems.com/de", "description": "Ich bin ein Enthusiast für elektronische Zutrittskontrolle und Schließsysteme und es begeistert mich, wenn durch integrative Lösungen Probleme gelöst und Prozesse optimiert werden können.", "employeeOfInstitutionNames": [], "firstName": "Bela", "gender": "MALE", "id": "67251b202f496742be0ea207", "jobTitle": "System- und Projektberater", "lastName": "Marahrens", "projectsDescription": "Elektronische Zutrittskontrolle\\nElektronische Schließsysteme\\nBesuchermanagement\\nCloudbasierte Zutrittskontrolle", "skills": [], "title": null, "full_text": "Bela Marahrens ist System- und Projektberater. Er ist Enthusiast für elektronische Zutrittskontrolle und Schließsysteme und überzeugt davon, dass inte

## Process document data

In [8]:
### create copy without fulltext
import copy
import json

pooling_results_copy = copy.deepcopy(pooling_results)

pooling_results_without_fulltext = []

for obj in pooling_results_copy:
    new_obj = obj.copy()

    if "profile_dict" in new_obj and isinstance(new_obj["profile_dict"], dict):
        # 1) full_text entfernen
        new_profile_dict = {
            k: v for k, v in new_obj["profile_dict"].items()
            if k != "full_text"
        }

        # 2) doc_raw überschreiben
        new_doc_raw = json.dumps(
            new_profile_dict,
            ensure_ascii=False,
            default=str
        )

        new_obj["profile_dict"] = new_profile_dict
        new_obj["doc_raw"] = new_doc_raw

    pooling_results_without_fulltext.append(new_obj)

# Check
print(pooling_results_without_fulltext[0]["profile_dict"].keys())
print(pooling_results_without_fulltext[1]["doc_raw"])
print(len(pooling_results_without_fulltext))


dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title'])
{"branches": ["Technologie"], "companyLocationCity": "Siegen", "companyLocationStreet": "Sonnenstraße 33-35", "companyLocationZip": 57078.0, "companyName": "K-iS Systemhaus GmbH", "companyTypes": ["Dienstleistung"], "companyWebsite": "https://www.k-is.com/", "description": "Als Account Manager mit dem Hintergrund der Heilerziehungspflege bringe ich eine einzigartige Kombination aus Empathie und technischem Know-how mit. \nMeine Kernkompetenz ist es, komplexe Herausforderungen zu erkennen und in einfache Lösungen zu übersetzen.\nFlexibilität und Anpassungsfähigkeit sind die Skills, die ich auch gerne bei Ihnen anwende.\nIch stehe Ihnen als vertrauensvoller Partner zur Seite und unterstütze Sie bei der digital

In [9]:
## create copy with only fulltext
import copy

pooling_results_copy = copy.deepcopy(pooling_results)

pooling_results_fulltext_only = []

for obj in pooling_results_copy:
    new_obj = obj.copy()

    profile = new_obj.get("profile_dict")

    if isinstance(profile, dict):
        # full_text extrahieren
        full_text = profile.get("full_text")

        # doc_raw entfernen
        new_obj.pop("doc_raw", None)

        # neuen Key setzen
        new_obj["doc_raw_fulltext"] = full_text

    pooling_results_fulltext_only.append(new_obj)

# Check
print(pooling_results_fulltext_only[0].keys())
print(pooling_results_fulltext_only[0])


dict_keys(['doc_id', 'profile_dict', 'doc_raw_fulltext'])
{'doc_id': '67251b202f496742be0ea207', 'profile_dict': {'branches': [], 'companyLocationCity': 'Wuppertal', 'companyLocationStreet': 'Schwelmer Straße 245', 'companyLocationZip': 42389.0, 'companyName': 'Salto Systems GmbH', 'companyTypes': [], 'companyWebsite': 'saltosystems.com/de', 'description': 'Ich bin ein Enthusiast für elektronische Zutrittskontrolle und Schließsysteme und es begeistert mich, wenn durch integrative Lösungen Probleme gelöst und Prozesse optimiert werden können.', 'employeeOfInstitutionNames': [], 'firstName': 'Bela', 'gender': 'MALE', 'id': '67251b202f496742be0ea207', 'jobTitle': 'System- und Projektberater', 'lastName': 'Marahrens', 'projectsDescription': 'Elektronische Zutrittskontrolle\nElektronische Schließsysteme\nBesuchermanagement\nCloudbasierte Zutrittskontrolle', 'skills': [], 'title': None, 'full_text': 'Bela Marahrens ist System- und Projektberater. Er ist Enthusiast für elektronische Zutrittsk

# Create result dataframe

In [10]:
import hashlib

def make_run_id(search_method, query_id, document_id, document_type, length=12):
    raw = f"{search_method}|{query_id}|{document_id}|{document_type}"
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:length]

In [11]:
columns = [
    "run_id",
    "search_method",
    "model",
    "query_id",
    "query_type",
    "query_text",
    "document_id",
    "document_type",
    "rank",
    "score",
    "duration"
]

In [12]:
from __future__ import annotations

from typing import Any, Dict, Iterable, List, Optional, Union
import pandas as pd

## Ranking logic überprüfen
def append_search_results(
    df: Optional[pd.DataFrame],
    *,
    search_method: str,
    model: str,
    results_list: Iterable[Dict[str, Any]],
) -> pd.DataFrame:
    """
    Appendet Suchergebnisse (Dokumente) einer einzelnen Query als neue Zeilen an ein DataFrame.

    Erwartet `results` als Iterable von Dicts.
    Rank wird aus der Reihenfolge in `results` erzeugt (rank_start..).

    Returns: neues DataFrame (df erweitert). Wenn df None ist, wird ein neues erstellt.
    """

    if df is None:
        df = pd.DataFrame(columns=columns)
    else:
        # falls df existiert, aber Spalten fehlen (z.B. beim ersten Run)
        missing = [c for c in columns if c not in df.columns]
        if missing:
            for c in missing:
                df[c] = pd.NA
            df = df[columns]

    rows: List[Dict[str, Any]] = []

    # {'query_id': 'c0_1_single_keyword', 'query_text': 'IT-Systemadministration', 'query_type': 'single_keyword', 'document_type': 'fulltext', 'results': [{'score': 0.2971865, 'doc_id': '683d99aea9d017f2c4cf2330'}

    for query_run_results in results_list:
        # doc kann z.B. {"id": "...", "type": "..."} sein oder mehr Felder enthalten
        # hier score extrahieren
        query_id = query_run_results["query_id"]
        query_type = query_run_results["query_type"]
        query_text = query_run_results["query_text"]
        document_type = query_run_results["document_type"]
        duration = query_run_results["duration"]

        for idx, doc in enumerate(query_run_results["results"], start=1):
            document_id = doc["doc_id"]
            score = doc["score"]

            run_id = make_run_id(
                search_method=search_method,
                query_id=query_id,
                document_id=document_id,
                document_type=document_type
            )

            rows.append(
                {
                    "run_id": run_id,
                    "search_method": search_method,
                    "model": model,
                    "query_id": query_id,
                    "query_type": query_type,
                    "query_text": query_text,
                    "document_id": document_id,
                    "document_type": document_type,
                    "rank": idx,
                    "score": score,
                    "duration": duration
                }
            )

    if not rows:
        # keine Ergebnisse -> df unverändert zurück
        return df

    new_df = pd.DataFrame(rows, columns=columns)
    return pd.concat([df, new_df], ignore_index=True)


In [13]:
from __future__ import annotations

import os
from typing import Optional, Sequence

import pandas as pd


# Expected schema for query result logging (column order is kept stable)
REQUIRED_COLUMNS: Sequence[str] = (
    "run_id",
    "search_method",
    "model",
    "query_id",
    "query_type",
    "query_text",
    "document_id",
    "document_type",
    "rank",
    "score",
    "duration"
)

# Columns used to prevent duplicate entries across runs
DEDUP_KEYS: Sequence[str] = ("run_id",)


def append_query_results_df_to_csv(
    query_results: Optional[pd.DataFrame],
    csv_path: str,
) -> None:
    """
    Append query-level retrieval results to a CSV file while preventing duplicates.

    If the CSV exists, only rows with new (search_method, query_id, document_id)
    combinations are appended. Otherwise, the full DataFrame is written.
    """

    # Nothing to do if input is empty
    if query_results is None or query_results.empty:
        return

    # Ensure required columns are present
    missing_columns = [c for c in REQUIRED_COLUMNS if c not in query_results.columns]
    if missing_columns:
        raise ValueError(
            f"Missing required columns: {missing_columns}. "
            f"Expected columns: {list(REQUIRED_COLUMNS)}"
        )

    # Work on a copy and enforce a stable column order
    df = query_results.loc[:, REQUIRED_COLUMNS].copy()

    # If the CSV does not yet exist, write everything
    if not os.path.exists(csv_path):
        df.to_csv(csv_path, index=False)
        return

    # Load only the key columns from the existing CSV
    existing_keys_df = pd.read_csv(csv_path, usecols=list(DEDUP_KEYS))

    # Build indices for fast comparison of key combinations
    existing_index = pd.MultiIndex.from_frame(existing_keys_df.loc[:, DEDUP_KEYS])
    new_index = pd.MultiIndex.from_frame(df.loc[:, DEDUP_KEYS])

    # Keep only rows that are not yet present in the CSV
    is_new_row = ~new_index.isin(existing_index)
    df_new = df.loc[is_new_row]

    # Exit early if there is nothing new to append
    if df_new.empty:
        return

    # Append new rows without writing the header again
    df_new.to_csv(csv_path, mode="a", header=False, index=False)


In [14]:
import pandas as pd
from pathlib import Path

search_results_file = Path(EXPERIMENTS_DATA) / "search_results_data.csv"

if search_results_file.exists():
    print("Found")
    search_results = pd.read_csv(search_results_file)
else:
    print("Not Found")
    search_results = pd.DataFrame(
        columns = columns
    )


Found


In [15]:
search_results

Unnamed: 0,run_id,search_method,model,query_id,query_type,query_text,document_id,document_type,rank,score,duration
0,a60c18726fa0,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,683d99aea9d017f2c4cf2330,fulltext,1,0.297187,0.672
1,4baa6c7cc291,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,65ba06be3897d6f0e653f0cc,fulltext,2,0.268765,0.672
2,1f19623c7279,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,6682c449680d5b5995b4229d,fulltext,3,0.264832,0.672
3,1f0148601c4c,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,66193c72b5bb2b1c9a963589,fulltext,4,0.246097,0.672
4,4964813cf7da,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,660fe785b5bb2b1c9a9440b3,fulltext,5,0.231186,0.672
...,...,...,...,...,...,...,...,...,...,...,...
201595,62fe49b7eaef,cross_encoder,cross-encoder/mmarco-mMiniLMv2-L12-H384-v1,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,6645e131f48ebd8f3f48df57,structured_fulltext,46,-1.401143,2.625
201596,8c5b7f94a33a,cross_encoder,cross-encoder/mmarco-mMiniLMv2-L12-H384-v1,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,65fae668763465cfa7e2b971,structured_fulltext,47,-1.429569,2.625
201597,ebf170f63e04,cross_encoder,cross-encoder/mmarco-mMiniLMv2-L12-H384-v1,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,66eabc0d41f659468bdd8da8,structured_fulltext,48,-1.476483,2.625
201598,bdc5a6737787,cross_encoder,cross-encoder/mmarco-mMiniLMv2-L12-H384-v1,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,65f937c1763465cfa7e252ad,structured_fulltext,49,-1.532228,2.625


# Runs

In [16]:
from scipy.sparse import save_npz, load_npz
import joblib
import json
import re
from tqdm import tqdm

In [17]:
LIMIT = 50

In [18]:
def get_document_type(collection):
  return collection.split("_", 2)[2]

In [19]:
import time
from functools import wraps

def measure_latency(fn):
    @wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        duration = round(time.perf_counter() - start, 3)
        return result, duration
    return wrapper

## Preprocessing steps

In [20]:
# source: https://snowballstem.org/algorithms/german/stop.txt

GERMAN_STOPWORDS = [
    "aber",
    "alle", "allem", "allen", "aller", "alles",
    "als", "also", "am", "an",
    "ander", "andere", "anderem", "anderen", "anderer", "anderes",
    "anderm", "andern", "anderr", "anders",
    "auch", "auf", "aus", "bei",
    "bin", "bis", "bist",
    "da", "damit", "dann",
    "der", "den", "des", "dem", "die", "das",
    "daß",
    "derselbe", "derselben", "denselben", "desselben", "demselben",
    "dieselbe", "dieselben", "dasselbe",
    "dazu",
    "dein", "deine", "deinem", "deinen", "deiner", "deines",
    "denn",
    "derer", "dessen",
    "dich", "dir", "du",
    "dies", "diese", "diesem", "diesen", "dieser", "dieses",
    "doch", "dort",
    "durch",
    "ein", "eine", "einem", "einen", "einer", "eines",
    "einig", "einige", "einigem", "einigen", "einiger", "einiges",
    "einmal",
    "er", "ihn", "ihm",
    "es", "etwas",
    "euer", "eure", "eurem", "euren", "eurer", "eures",
    "für", "gegen",
    "gewesen",
    "hab", "habe", "haben", "hat", "hatte", "hatten",
    "hier", "hin", "hinter",
    "ich", "mich", "mir",
    "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres",
    "euch",
    "im", "in", "indem", "ins",
    "ist",
    "jede", "jedem", "jeden", "jeder", "jedes",
    "jene", "jenem", "jenen", "jener", "jenes",
    "jetzt",
    "kann",
    "kein", "keine", "keinem", "keinen", "keiner", "keines",
    "können", "könnte",
    "machen",
    "man",
    "manche", "manchem", "manchen", "mancher", "manches",
    "mein", "meine", "meinem", "meinen", "meiner", "meines",
    "mit",
    "muss", "musste",
    "nach",
    "nicht", "nichts",
    "noch", "nun", "nur",
    "ob", "oder", "ohne",
    "sehr",
    "sein", "seine", "seinem", "seinen", "seiner", "seines",
    "selbst", "sich",
    "sie", "ihnen",
    "sind",
    "so",
    "solche", "solchem", "solchen", "solcher", "solches",
    "soll", "sollte",
    "sondern", "sonst",
    "über",
    "um",
    "und",
    "uns", "unse", "unsem", "unsen", "unser", "unses",
    "unter",
    "viel",
    "vom", "von", "vor",
    "während",
    "war", "waren", "warst",
    "was",
    "weg",
    "weil",
    "weiter",
    "welche", "welchem", "welchen", "welcher", "welches",
    "wenn",
    "werde", "werden",
    "wie",
    "wieder",
    "will",
    "wir",
    "wird",
    "wirst",
    "wo",
    "wollen", "wollte",
    "würde", "würden",
    "zu", "zum", "zur",
    "zwar",
    "zwischen"
]


In [21]:
_whitespace = re.compile(r"\s+")

def normalize(text: str) -> str:
    """
    Minimale Normalisierung (fair & reproduzierbar).
    """
    text = text.lower()
    text = _whitespace.sub(" ", text).strip()
    return text


In [22]:
def bm25_tokenize(text: str) -> list[str]:
    tokens = re.findall(r"(?u)\b\w\w+\b", normalize(text))
    return [t for t in tokens if t not in GERMAN_STOPWORDS]

## Load indexes

In [23]:
def load_tfidf_index(file_name):
    """
    Lädt den TF-IDF Index (Matrix, Vectorizer).
    """
    X = load_npz(f"{PATH_LEXICAL_INDEX}/{file_name}_matrix.npz")
    vectorizer = joblib.load(f"{PATH_LEXICAL_INDEX}/{file_name}_vectorizer.joblib")
    return vectorizer, X

def load_bm25_index(file_name):
    """
    Lädt den BM25-Index (BM25Okapi Objekt).
    """
    bm25 = joblib.load(f"{PATH_LEXICAL_INDEX}{file_name}_bm25.joblib")
    return bm25

In [24]:
tfidf_index_fulltext_vectorizer, tfidf_index_fulltext_matrix = load_tfidf_index("tfidf_index_fulltext_only")
tfidf_index_structured_vectorizer, tfidf_index_structured_matrix= load_tfidf_index("tfidf_index_without_fulltext")
tfidf_index_structured_fulltext_vectorizer, tfidf_structured_fulltext_matrix = load_tfidf_index("tfidf_index_structured_fulltext")

In [25]:
bm25_index_fulltext = load_bm25_index("bm25_index_fulltext_only")
bm25_index_structured = load_bm25_index("bm25_index_without_fulltext")
bm25_index_structured_fulltext = load_bm25_index("bm25_index_structured_fulltext")

In [None]:
bm25_index_structured_fulltext

<rank_bm25.BM25Okapi at 0x7ae6122d7f20>

In [None]:
tfidf_index_structured_fulltext_vectorizer

In [None]:
tfidf_structured_fulltext_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12790 stored elements and shape (259, 1646)>

## TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [27]:
# get vectorizer

@measure_latency
def query_tf_idf(search_query: str, expert_data: str, vectorizer, matrix):

    """
    Führt eine TF-IDF Suche aus und gibt Top-k Treffer im benötigten Format zurück.

    Output pro Treffer:
      - doc_id: ID des Dokuments
      - score: TF-IDF Similarity Score (float)
    """
    q = vectorizer.transform([normalize(search_query)])   # (1, n_terms)
    scores = (matrix @ q.T).toarray().ravel()           # (n_docs,)

    k = min(LIMIT, len(scores))
    idx = np.argpartition(-scores, k - 1)[:k]
    idx = idx[np.argsort(-scores[idx])]

    results = []
    for i in idx:
        document = expert_data[int(i)]
        doc_id = document.get("doc_id", int(i)) if isinstance(document, dict) else int(i)

        results.append({
            "doc_id": doc_id,
            "score": float(scores[int(i)]),
        })

    return results

In [28]:
query_tf_idf("IT-Systemadministration", pooling_results, tfidf_index_structured_fulltext_vectorizer, tfidf_structured_fulltext_matrix)

([{'doc_id': '65ba06be3897d6f0e653f0cc', 'score': 0.48766859662079326},
  {'doc_id': '660fe785b5bb2b1c9a9440b3', 'score': 0.4220286263956869},
  {'doc_id': '6682c449680d5b5995b4229d', 'score': 0.4152478673726665},
  {'doc_id': '683d99aea9d017f2c4cf2330', 'score': 0.3908057606714663},
  {'doc_id': '6670848036b0c2e419593c7c', 'score': 0.3779624425596489},
  {'doc_id': '671f67082f496742be0b1d07', 'score': 0.3616301263073536},
  {'doc_id': '670fa3796a3c6a462fdd1016', 'score': 0.2781499891366951},
  {'doc_id': '65b0d5453897d6f0e6516f88', 'score': 0.2607353173054746},
  {'doc_id': '66d57bc9528e751260e51cd1', 'score': 0.197755790289363},
  {'doc_id': '6711189d2f496742be023f8b', 'score': 0.1939699504367333},
  {'doc_id': '65b00ae03897d6f0e6512d70', 'score': 0.1922920046623926},
  {'doc_id': '6718fa8a2f496742be072e52', 'score': 0.18944258034092867},
  {'doc_id': '66d1a5de528e751260e38ff3', 'score': 0.1716130397685791},
  {'doc_id': '671f67692f496742be0b1d80', 'score': 0.14186477344334986},
  {'

In [29]:
collections = [
    {
        "vectorizer": tfidf_index_fulltext_vectorizer,
        "matrix": tfidf_index_fulltext_matrix,
        "documents": pooling_results_fulltext_only,
        "document_type": "fulltext"
    },
    {
        "vectorizer": tfidf_index_structured_vectorizer,
        "matrix": tfidf_index_structured_matrix,
        "documents": pooling_results_without_fulltext,
        "document_type": "structured"
    },
    {
        "vectorizer": tfidf_index_structured_fulltext_vectorizer,
        "matrix": tfidf_structured_fulltext_matrix,
        "documents": pooling_results,
        "document_type": "structured_fulltext"
    }
]

for collection in tqdm(collections):

    # get document type from collection name
    document_type = collection["document_type"]

    queries_results_list = []
    for query in tqdm(combined_search_queries):
      query_id = query["id"]
      query_text = query["query"]
      query_type = query["type"]

      vectorizer = collection["vectorizer"]
      matrix = collection["matrix"]
      documents = collection["documents"]

      tf_idf_results, duration = query_tf_idf(query_text, documents, vectorizer, matrix) # hier anpassen für andere Methode

      results_list = [
          {"score": r.get("score"), "doc_id": r.get("doc_id")}
          for r in tf_idf_results
      ]

      queries_results_list.append(
          {
              "query_id": query_id,
              "query_text": query_text,
              "query_type": query_type,
              "document_type": document_type,
              "results": results_list,
              "duration": duration
          }
      )

    search_results = append_search_results(
        df=search_results,
        search_method="tf-idf",
        model="TfidfVectorizer",
        results_list=queries_results_list
    )

    append_query_results_df_to_csv(
        query_results=search_results,
        csv_path=search_results_file
    )



  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/336 [00:00<?, ?it/s][A
 28%|██▊       | 93/336 [00:00<00:00, 923.53it/s][A
 61%|██████▏   | 206/336 [00:00<00:00, 1040.61it/s][A
100%|██████████| 336/336 [00:00<00:00, 1055.62it/s]
 33%|███▎      | 1/3 [00:02<00:04,  2.43s/it]
  0%|          | 0/336 [00:00<?, ?it/s][A
 25%|██▍       | 83/336 [00:00<00:00, 824.78it/s][A
 49%|████▉     | 166/336 [00:00<00:00, 811.34it/s][A
 74%|███████▍  | 248/336 [00:00<00:00, 796.61it/s][A
100%|██████████| 336/336 [00:00<00:00, 790.43it/s]
 67%|██████▋   | 2/3 [00:05<00:02,  2.53s/it]
  0%|          | 0/336 [00:00<?, ?it/s][A
 34%|███▍      | 115/336 [00:00<00:00, 1149.19it/s][A
100%|██████████| 336/336 [00:00<00:00, 1134.24it/s]
100%|██████████| 3/3 [00:08<00:00,  2.77s/it]


In [30]:
    search_results

Unnamed: 0,run_id,search_method,model,query_id,query_type,query_text,document_id,document_type,rank,score,duration
0,a60c18726fa0,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,683d99aea9d017f2c4cf2330,fulltext,1,0.297187,0.672
1,4baa6c7cc291,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,65ba06be3897d6f0e653f0cc,fulltext,2,0.268765,0.672
2,1f19623c7279,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,6682c449680d5b5995b4229d,fulltext,3,0.264832,0.672
3,1f0148601c4c,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,66193c72b5bb2b1c9a963589,fulltext,4,0.246097,0.672
4,4964813cf7da,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,660fe785b5bb2b1c9a9440b3,fulltext,5,0.231186,0.672
...,...,...,...,...,...,...,...,...,...,...,...
251995,7973544200c2,tf-idf,TfidfVectorizer,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,66a1fdc53c04b3996294624f,structured_fulltext,46,0.030875,0.001
251996,eaf332f41901,tf-idf,TfidfVectorizer,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,6630ee06b5bb2b1c9a9b4376,structured_fulltext,47,0.030604,0.001
251997,cf50ab70b9bd,tf-idf,TfidfVectorizer,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,6749c74eb546838eefc6115b,structured_fulltext,48,0.030537,0.001
251998,26d21dc519fa,tf-idf,TfidfVectorizer,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,65b264eb3897d6f0e65202da,structured_fulltext,49,0.029497,0.001


## BM25

In [31]:
from rank_bm25 import BM25Okapi
from typing import Iterable, Sequence

In [32]:
# get vectorizer

@measure_latency
def query_bm25(search_query: str, expert_data: str, bm25_index):

    """
    Führt eine BM25 Suche aus und gibt Top-k Treffer im benötigten Format zurück.

    Output pro Treffer:
      - doc_id: ID des Dokuments
      - score: BM25 Score (float)
    """

    q_tokens = bm25_tokenize(search_query)

    scores = bm25_index.get_scores(q_tokens)  # numpy array (n_docs,)
    k = min(LIMIT, len(scores))

    idx = np.argpartition(-scores, k - 1)[:k]
    idx = idx[np.argsort(-scores[idx])]

    results = []
    for i in idx:
        document = expert_data[i]
        doc_id = document.get("doc_id") if isinstance(document, dict) else None

        results.append({
            "doc_id": doc_id,
            "score": float(scores[i]),
        })

    return results

In [33]:
query_bm25("IT-Systemadministration", pooling_results, bm25_index_structured_fulltext)

([{'doc_id': '6682c449680d5b5995b4229d', 'score': 9.582234986569244},
  {'doc_id': '660fe785b5bb2b1c9a9440b3', 'score': 9.458657003610632},
  {'doc_id': '65ba06be3897d6f0e653f0cc', 'score': 9.40899309311261},
  {'doc_id': '683d99aea9d017f2c4cf2330', 'score': 9.408555983005936},
  {'doc_id': '6670848036b0c2e419593c7c', 'score': 8.989776978451026},
  {'doc_id': '671f67082f496742be0b1d07', 'score': 8.876311796712315},
  {'doc_id': '65b0d5453897d6f0e6516f88', 'score': 8.51027317696499},
  {'doc_id': '65b00ae03897d6f0e6512d70', 'score': 7.843854633852368},
  {'doc_id': '65acfb5a3897d6f0e6506db1', 'score': 6.752763209993221},
  {'doc_id': '670fa3796a3c6a462fdd1016', 'score': 4.582976979393751},
  {'doc_id': '6718fa8a2f496742be072e52', 'score': 4.395484660352346},
  {'doc_id': '6711189d2f496742be023f8b', 'score': 4.305970450861275},
  {'doc_id': '66d1a5de528e751260e38ff3', 'score': 4.114841290787558},
  {'doc_id': '671f67692f496742be0b1d80', 'score': 3.9866124433102668},
  {'doc_id': '66eabc0

In [34]:
collections = [
    {
        "bm25_index": bm25_index_fulltext,
        "documents": pooling_results_fulltext_only,
        "document_type": "fulltext"
    },
    {
        "bm25_index": bm25_index_structured,
        "documents": pooling_results_without_fulltext,
        "document_type": "structured"
    },
    {
        "bm25_index": bm25_index_structured_fulltext,
        "documents": pooling_results,
        "document_type": "structured_fulltext"
    }
]

for collection in tqdm(collections):

    # get document type from collection name
    document_type = collection["document_type"]

    queries_results_list = []
    for query in tqdm(combined_search_queries):
      query_id = query["id"]
      query_text = query["query"]
      query_type = query["type"]

      bm25_index = collection["bm25_index"]
      documents = collection["documents"]

      tf_idf_results, duration = query_bm25(query_text, documents, bm25_index) # hier anpassen für andere Methode

      results_list = [
          {"score": r.get("score"), "doc_id": r.get("doc_id")}
          for r in tf_idf_results
      ]

      queries_results_list.append(
          {
              "query_id": query_id,
              "query_text": query_text,
              "query_type": query_type,
              "document_type": document_type,
              "results": results_list,
              "duration": duration
          }
      )

    search_results = append_search_results(
        df=search_results,
        search_method="bm25",
        model="BM25Okapi",
        results_list=queries_results_list
    )

    append_query_results_df_to_csv(
        query_results=search_results,
        csv_path=search_results_file
    )




  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/336 [00:00<?, ?it/s][A
100%|██████████| 336/336 [00:00<00:00, 2360.68it/s]
 33%|███▎      | 1/3 [00:02<00:04,  2.00s/it]
  0%|          | 0/336 [00:00<?, ?it/s][A
100%|██████████| 336/336 [00:00<00:00, 2146.85it/s]
 67%|██████▋   | 2/3 [00:03<00:01,  1.99s/it]
  0%|          | 0/336 [00:00<?, ?it/s][A
100%|██████████| 336/336 [00:00<00:00, 2262.33it/s]
100%|██████████| 3/3 [00:06<00:00,  2.02s/it]


In [35]:
search_results

Unnamed: 0,run_id,search_method,model,query_id,query_type,query_text,document_id,document_type,rank,score,duration
0,a60c18726fa0,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,683d99aea9d017f2c4cf2330,fulltext,1,0.297187,0.672
1,4baa6c7cc291,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,65ba06be3897d6f0e653f0cc,fulltext,2,0.268765,0.672
2,1f19623c7279,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,6682c449680d5b5995b4229d,fulltext,3,0.264832,0.672
3,1f0148601c4c,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,66193c72b5bb2b1c9a963589,fulltext,4,0.246097,0.672
4,4964813cf7da,bi_encoder,sentence-transformers/distiluse-base-multiling...,c0_1_single_keyword,single_keyword,IT-Systemadministration,660fe785b5bb2b1c9a9440b3,fulltext,5,0.231186,0.672
...,...,...,...,...,...,...,...,...,...,...,...
302395,cf6c7b79dc98,bm25,BM25Okapi,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,65c5352ec8a0f67c4c5cfdb0,structured_fulltext,46,1.879505,0.001
302396,ab0496017af1,bm25,BM25Okapi,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,67e570701d726b223415793a,structured_fulltext,47,1.855341,0.001
302397,3cf665cc050d,bm25,BM25Okapi,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,65b264eb3897d6f0e65202da,structured_fulltext,48,1.851180,0.001
302398,d27866f3d12a,bm25,BM25Okapi,c13_5_combination_freetext_synonym,combination_free_synonym,Gesucht wird ein Experte mit Fähigkeiten in de...,6683ba3c08d4ee614ef74793,structured_fulltext,49,1.806669,0.001
