# Pooling

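In diesem Notebook werden relevante Dokumente für die Testkollektion gepoolt: Für jede Suchanfrage werden die Top-k-Treffer mehrerer Retrieval-Verfahren in einem gemeinsamen Pool gesammelt.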

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root data directory on the mounted Google Drive (relative path).
DATA_PATH = "drive/MyDrive/Uni/Master/Masterthesis/Data/"
# Directory holding the query/topic files.
PROD_DATA_PATH = "drive/MyDrive/Uni/Master/Masterthesis/Data/topics/"
# Directory holding the test-collection artifacts such as the pooling results CSV.
PATH_TEST_COLLECTION_DATA="drive/MyDrive/Uni/Master/Masterthesis/Data/test_collection/"

In [None]:
import pandas as pd

## Import Queries

In [None]:
import json

# Path of the final query test collection (JSON) inside the topics directory.
file_path = PROD_DATA_PATH + "suchanfragen_testkollektion_final.json"

# Parse the JSON file; it is read as UTF-8.
with open(file_path, "r", encoding="utf-8") as f:
    queries_testcollection = json.load(f)

# Quick sanity checks on the loaded structure.
print(type(queries_testcollection))   # Container type; the slice below only works for a list
print(len(queries_testcollection))    # Number of top-level entries (one per topic cluster, judging by the preview)
print(queries_testcollection[:2])     # Preview of the first two entries

<class 'list'>
11
[{'id': 'c0', 'cluster': 'cluster_0', 'topic': 'IT-Management und Wissensmanagement', 'single': [{'id': 'c0_1_single', 'keyword': 'IT-Systemadministration', 'variations': [{'id': 'c0_1_single_keyword', 'query': 'IT-Systemadministration', 'type': 'single_keyword'}, {'id': 'c0_1_single_freetext', 'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich IT-Systemadministration', 'type': 'single_free'}, {'id': 'c0_1_single_keyword_synonym', 'query': 'Serveradministration', 'type': 'single_keyword_synonym'}, {'id': 'c0_1_single_freetext_synonym', 'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Serveradministration', 'type': 'single_free_synonym'}]}, {'id': 'c0_2_single', 'keyword': 'Information Technology', 'variations': [{'id': 'c0_2_single_keyword', 'query': 'Information Technology', 'type': 'single_keyword'}, {'id': 'c0_2_single_freetext', 'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Information Technology', 'type': 'single_free'}, {'

## Import Expert Profiles

In [None]:
import json

# Path of the enhanced expert profiles (JSON) in the data root.
file_path = DATA_PATH + "enhanced_expert_profiles.json"

# Parse the JSON file; it is read as UTF-8.
with open(file_path, "r", encoding="utf-8") as f:
    expert_data = json.load(f)

# Quick sanity checks on the loaded structure.
# NOTE(review): printing a full record puts personal profile data into the saved
# cell output — consider clearing outputs before sharing the notebook.
print(type(expert_data))        # Container type; indexing with [0] below assumes a list
print(len(expert_data))         # Number of expert profiles
print(expert_data[0])           # Full first record
print(expert_data[0].keys())    # Field names of the first record

<class 'list'>
307
{'branches': ['Technologie', 'Medien & Werbung', 'Unternehmensdienstleistungen'], 'companyLocationCity': 'Lennestadt', 'companyLocationStreet': 'Hagener Straße 64', 'companyLocationZip': 57368.0, 'companyName': 'lenne.Tech GmbH', 'companyTypes': ['Dienstleistung'], 'companyWebsite': 'https://lenne.tech', 'description': 'Seit über 15 Jahren begleite ich digitale Projekte von der Konzeption bis zur "schlüsselfertigen" Anwendung. Als Mitglied im Team von lenne.Tech (https://lenne.tech) entwickle ich Apps & Webanwendungen und unterstütze Unternehmen beim Aufbau sowie bei der Weiterbildung von Entwicklungsteams. Im Rahmen unserer lenne.Learning Akademie (https://lennelearning.de) organisiere ich gemeinsam mit anderen Tutoren & Mentoren das Recruiting sowie das Onboarding neuer Entwickler für Partnerunternehmen.', 'employeeOfInstitutionNames': [], 'firstName': 'Ege', 'gender': 'MALE', 'id': '65acfb5a3897d6f0e6506db1', 'jobTitle': 'Softwareentwickler', 'lastName': 'Siebert'

In [None]:
# Drop the bookkeeping fields listed below from every expert profile;
# all other fields are kept unchanged.
keys_to_remove = [
    "full_text_fields_used",
    "full_text_fields_used_fixed",
    "full_text_word_count",
    "full_text_fields_used_sorted",
    "validation_issues",
]

expert_data = [
    {field: value for field, value in profile.items() if field not in keys_to_remove}
    for profile in expert_data
]

# Show which fields remain on the first profile.
expert_data[0].keys()

dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title', 'full_text'])

## Extract queries for pooling

In [None]:
def extract_keyword_queries(data):
    """
    Collect the query strings of all keyword-type variations.

    Every cluster in ``data`` is visited in order. Within a cluster the
    entries under ``"single"`` are processed first, then those under
    ``"combination"``. From each entry's ``"variations"`` list the
    ``"query"`` value is taken whenever the variation's ``"type"`` is
    ``'single_keyword'`` or ``'combination_keyword'``.

    Args:
        data (list): Parsed JSON data (a list of cluster dicts).

    Returns:
        list: The extracted query values in traversal order. Missing
        ``"single"`` / ``"combination"`` / ``"variations"`` keys are treated
        as empty; a matching variation without a ``"query"`` key
        contributes ``None``.
    """
    wanted_types = {"single_keyword", "combination_keyword"}

    # Order matters: singles before combinations within each cluster.
    group_keys = ("single", "combination")

    return [
        variation.get("query")
        for cluster in data
        for group_key in group_keys
        for entry in cluster.get(group_key, [])
        for variation in entry.get("variations", [])
        if variation.get("type") in wanted_types
    ]

In [None]:
queries_for_pooling = extract_keyword_queries(queries_testcollection)
print(len(queries_for_pooling))
print(queries_for_pooling)

84
['IT-Systemadministration', 'Information Technology', 'Wissensmanagement', 'Konstruktion', 'Metallhandwerk', 'Mechanik', 'Automatisierungstechnik', 'Elektrotechnik', 'Konstruktion, Metallhandwerk', 'Mechanik, Elektrotechnik', 'Metallhandwerk, Mechanik', 'Metallhandwerk, Elektrotechnik', 'Mechanik, Automatisierungstechnik', 'Automatisierungstechnik, Elektrotechnik', 'KI', 'Public Relations', 'Sales', 'Social Media Marketing', 'Public Relations, Social Media Marketing', 'Marketing', 'Brand Management', 'Marketing, Social Media Marketing', 'Marketing, Brand Management', 'Social Media Marketing, Brand Management', 'Employee Development', 'Personalentwicklung und Mentoring', 'Human Resources', 'Coaching', 'Business Development, Employee Development', 'Business Development, Personalentwicklung und Mentoring', 'Business Development, Human Resources', 'Business Development, Coaching', 'Employee Development, Personalentwicklung und Mentoring', 'Employee Development, Human Resources', 'Employ

## Import Qdrant Client

In [None]:
!pip install -U sentence-transformers
!pip install -U qdrant-client
!pip install fastembed



In [None]:
from qdrant_client import QdrantClient, models

# Client for the Qdrant cluster that hosts the expert collections queried below.
# The url / api_key values are placeholders and must be replaced before running.
# NOTE(review): prefer loading the real key from a secret store or environment
# variable instead of pasting it into the notebook.
client = QdrantClient(
    url="url_to_cluster_here",
    api_key="qdrant_key_here",
)

## Build Pooling Dataframe

In [None]:
import pandas as pd

def append_retrieval_results_to_pool(
    pool_df: pd.DataFrame,
    *,
    search_method: str,
    query: str,
    rel_docs,
) -> pd.DataFrame:
    """
    Append the hits of one retrieval run as new rows to a pooling DataFrame.

    Each hit in ``rel_docs`` may have one of two shapes:

    1) dict (classical IR such as TF-IDF / BM25) with the keys
       ``"payload"``, ``"score"`` and ``"id"``
    2) any other object (e.g. a Qdrant point from ``query_points().points``)
       with the attributes ``payload``, ``score`` and ``id``

    Missing keys / attributes become ``None``. If the payload is a dict that
    contains ``"id"``, that value replaces the hit's own id.

    Args:
        pool_df: Existing pool, or ``None`` / an empty frame to start a new one.
        search_method: Label stored in the ``search_method`` column.
        query: Query text stored in the ``query`` column.
        rel_docs: Iterable of hits in one of the shapes described above.

    Returns:
        A DataFrame with the columns search_method, query, doc_id, doc_raw,
        score. For a ``None`` / empty ``pool_df`` only the new rows are
        returned; otherwise the old and new rows are concatenated under a
        fresh 0..n-1 index.
    """
    pool_columns = ["search_method", "query", "doc_id", "doc_raw", "score"]

    def _read(hit, field):
        # dict hits are read by key, every other hit by attribute
        if isinstance(hit, dict):
            return hit.get(field, None)
        return getattr(hit, field, None)

    new_rows = []
    for hit in rel_docs:
        payload = _read(hit, "payload")
        score = _read(hit, "score")
        doc_id = _read(hit, "id")

        # The id stored inside the payload wins over the hit's own id.
        if isinstance(payload, dict) and "id" in payload:
            doc_id = payload["id"]

        new_rows.append({
            "search_method": search_method,
            "query": query,
            "doc_id": doc_id,
            "doc_raw": payload,
            "score": score,
        })

    add_df = pd.DataFrame(new_rows, columns=pool_columns)

    if pool_df is not None and not pool_df.empty:
        return pd.concat([pool_df, add_df], ignore_index=True)

    # Nothing pooled yet: the new rows are the pool.
    return add_df.reset_index(drop=True)

In [None]:
import pandas as pd
from pathlib import Path

# Location of the persisted raw pooling results.
pool_path = Path(PATH_TEST_COLLECTION_DATA) / "pooling_results_raw.csv"

if not pool_path.exists():
    # Nothing on disk yet: start with an empty pool that has the pooling columns.
    print("Not Found")
    pool_df = pd.DataFrame(
        columns=["search_method", "query", "doc_id", "doc_raw", "score"]
    )
else:
    # Resume from the results persisted by earlier runs.
    # NOTE(review): doc_raw is read back from the CSV as text, while rows added
    # later in the session hold dicts — confirm downstream code handles both.
    print("Found")
    pool_df = pd.read_csv(pool_path)

Found


In [None]:
pool_df

Unnamed: 0,search_method,query,doc_id,doc_raw,score
0,distiluse-base-multilingual-cased-v2,IT-Systemadministration,67251b202f496742be0ea207,"{""branches"": [], ""companyLocationCity"": ""Wuppe...",0.207031
1,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67692f496742be0b1d80,"{""branches"": [""Technologie""], ""companyLocation...",0.109566
2,distiluse-base-multilingual-cased-v2,IT-Systemadministration,6670848036b0c2e419593c7c,"{""branches"": [""Technologie""], ""companyLocation...",0.107298
3,distiluse-base-multilingual-cased-v2,IT-Systemadministration,65b0d5453897d6f0e6516f88,"{""branches"": [""Technologie""], ""companyLocation...",0.105019
4,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67082f496742be0b1d07,"{""branches"": [""Technologie"", ""Unternehmensdien...",0.094808
...,...,...,...,...,...
1255,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",67d44084b5308acb98802c45,"{""branches"": [""Technologie"", ""Immobilien"", ""Me...",9.613712
1256,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",65c5352ec8a0f67c4c5cfdb0,"{""branches"": [""Immobilien""], ""companyLocationC...",9.471822
1257,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",66d1a5de528e751260e38ff3,"{""branches"": [""Technologie""], ""companyLocation...",9.155007
1258,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",67487abbb546838eefc5467d,"{""branches"": [""Unternehmensdienstleistungen""],...",8.847045


In [None]:
import os
import json
import pandas as pd

def append_pool_df_to_csv(pool_df: pd.DataFrame, csv_path: str):
    """
    Persist a pooling DataFrame to a CSV file, one search method at a time.

    Behavior:
    - CSV does not exist: the complete DataFrame is written with a header.
    - CSV exists: only rows whose ``search_method`` is not yet present in the
      file are appended. The appended rows are aligned to the file's header
      first, so values always land under the correct column even if
      ``pool_df`` has a different column order. Columns of ``pool_df`` that
      the file does not have are dropped; columns only the file has are
      written as empty cells.

    Because of the per-method filter, call this after a complete method run:
    rows of a method that is already in the file are never added.

    Args:
        pool_df: Pool with the columns search_method, query, doc_id,
            doc_raw (dict, or an already serialized string) and score.
            ``None`` or an empty frame is a no-op.
        csv_path: Path of the CSV file to create or extend.

    Returns:
        None. Prints which methods were added, or "No data to add".
    """
    # Nothing to persist.
    if pool_df is None or pool_df.empty:
        return

    # Work on a copy so the caller's frame keeps its dict payloads.
    df = pool_df.copy()

    # Dict payloads are stored as JSON text; other values pass through unchanged.
    if "doc_raw" in df.columns:
        df["doc_raw"] = df["doc_raw"].apply(
            lambda x: json.dumps(x, ensure_ascii=False)
            if isinstance(x, dict) else x
        )

    # First write: create the file including the header row.
    if not os.path.exists(csv_path):
        df.to_csv(
            csv_path,
            mode="w",
            header=True,
            index=False
        )
        return

    # Column layout of the existing file (header only, no data rows read).
    existing_columns = pd.read_csv(csv_path, nrows=0).columns.tolist()

    # Methods already persisted (only that one column is loaded).
    existing_df = pd.read_csv(csv_path, usecols=["search_method"])
    existing_methods = set(existing_df["search_method"].unique())

    # Keep only rows of methods the file does not contain yet.
    df = df[~df["search_method"].isin(existing_methods)]

    if df.empty:
        print("No data to add")
        return

    print("Added data from ", set(df["search_method"].to_list()))

    # An append without header is purely positional, so the rows must follow
    # the file's own column order.
    df = df.reindex(columns=existing_columns)

    df.to_csv(
        csv_path,
        mode="a",
        header=False,
        index=False
    )

## Top-K Limit

In [None]:
LIMIT = 5

## Fetch pool documents for Bi-Encoder

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
encoder = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [None]:
# Pool the top-LIMIT dense-retrieval hits for every evaluation query.
for query in queries_for_pooling:
    print(query)

    # Embed the query with the bi-encoder and convert the result to a plain list.
    query_vector = encoder.encode(query).tolist()

    # Vector search in the dense collection; only the returned points are kept.
    rel_docs = client.query_points(
        collection_name="expert_data_structured_and_fulltext_dense",
        query=query_vector,
        limit=LIMIT,
    ).points

    # Add the hits to the pool under this model's method label.
    pool_df = append_retrieval_results_to_pool(
        pool_df,
        search_method="distiluse-base-multilingual-cased-v2",
        query=query,
        rel_docs=rel_docs,
    )

IT-Systemadministration
Information Technology
Wissensmanagement
Konstruktion
Metallhandwerk
Mechanik
Automatisierungstechnik
Elektrotechnik
Konstruktion, Metallhandwerk
Mechanik, Elektrotechnik
Metallhandwerk, Mechanik
Metallhandwerk, Elektrotechnik
Mechanik, Automatisierungstechnik
Automatisierungstechnik, Elektrotechnik
KI
Public Relations
Sales
Social Media Marketing
Public Relations, Social Media Marketing
Marketing
Brand Management
Marketing, Social Media Marketing
Marketing, Brand Management
Social Media Marketing, Brand Management
Employee Development
Personalentwicklung und Mentoring
Human Resources
Coaching
Business Development, Employee Development
Business Development, Personalentwicklung und Mentoring
Business Development, Human Resources
Business Development, Coaching
Employee Development, Personalentwicklung und Mentoring
Employee Development, Human Resources
Employee Development, Coaching
Personalentwicklung und Mentoring, Human Resources
Personalentwicklung und Mentori

In [None]:
pool_df

Unnamed: 0,search_method,query,doc_id,doc_raw,score
0,distiluse-base-multilingual-cased-v2,IT-Systemadministration,67251b202f496742be0ea207,"{'branches': [], 'companyLocationCity': 'Wuppe...",0.207031
1,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67692f496742be0b1d80,"{'branches': ['Technologie'], 'companyLocation...",0.109566
2,distiluse-base-multilingual-cased-v2,IT-Systemadministration,6670848036b0c2e419593c7c,"{'branches': ['Technologie'], 'companyLocation...",0.107298
3,distiluse-base-multilingual-cased-v2,IT-Systemadministration,65b0d5453897d6f0e6516f88,"{'branches': ['Technologie'], 'companyLocation...",0.105019
4,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67082f496742be0b1d07,"{'branches': ['Technologie', 'Unternehmensdien...",0.094808
...,...,...,...,...,...
415,distiluse-base-multilingual-cased-v2,"Business Development, Business Process Analysis",682adab50b2e403ede03a46a,"{'branches': ['Unternehmensdienstleistungen'],...",0.225938
416,distiluse-base-multilingual-cased-v2,"Business Development, Business Process Analysis",67501be2cc9cf2f9a1e5d829,"{'branches': ['Technologie'], 'companyLocation...",0.200533
417,distiluse-base-multilingual-cased-v2,"Business Development, Business Process Analysis",65b7691a3897d6f0e6533c32,"{'branches': ['Unternehmensdienstleistungen', ...",0.197369
418,distiluse-base-multilingual-cased-v2,"Business Development, Business Process Analysis",65c4d290c8a0f67c4c5ce5fb,"{'branches': ['Technologie', 'Medien & Werbung...",0.185413


In [None]:
# Persist after each complete method run. append_pool_df_to_csv skips every
# search_method that is already in the CSV, so only new methods are written.
append_pool_df_to_csv(pool_df, PATH_TEST_COLLECTION_DATA + "pooling_results_raw.csv")

No data to add


## Fetch pool documents for Cross-Encoder

In [None]:
from sentence_transformers import CrossEncoder

cross_model = CrossEncoder("cross-encoder/mmarco-mMiniLMv2-L12-H384-v1")

# Model: https://huggingface.co/cross-encoder/mmarco-mMiniLMv2-L12-H384-v1

print(cross_model)

CrossEncoder(
  (model): XLMRobertaForSequenceClassification(
    (roberta): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 384, padding_idx=1)
        (position_embeddings): Embedding(514, 384, padding_idx=1)
        (token_type_embeddings): Embedding(1, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSdpaSelfAttention(
                (query): Linear(in_features=384, out_features=384, bias=True)
                (key): Linear(in_features=384, out_features=384, bias=True)
                (value): Linear(in_features=384, out_features=384, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRobertaSelfOutpu

In [None]:
import json
from types import SimpleNamespace

def doc_to_text(doc: dict) -> str:
    """Serialize a document to its JSON string; non-ASCII characters are kept unescaped."""
    return json.dumps(doc, ensure_ascii=False)


def crossencoder_query_points(
    cross_encoder,
    *,
    query: str,
    documents: list,        # list of expert profile dicts
    limit: int = 5,
    batch_size: int = 32,
    text_fn=doc_to_text,
):
    """
    Use a cross-encoder as a standalone retriever.

    Every document is turned into text with ``text_fn``, scored against
    ``query`` via ``cross_encoder.predict`` and the ``limit`` best-scoring
    documents are returned.

    Args:
        cross_encoder: Object with a ``predict(pairs, batch_size=...)`` method
            that returns one score per (query, text) pair.
        query: Query text.
        documents: Documents to score (dicts).
        limit: Number of hits to return.
        batch_size: Forwarded to ``cross_encoder.predict``.
        text_fn: Maps a document to the text that is scored.

    Returns:
        list: ``SimpleNamespace`` objects with ``.payload`` (the document),
        ``.score`` (float) and ``.id`` (the document's ``"id"`` value, or its
        position in ``documents`` if it has none), ordered by descending
        score. Documents with equal scores keep their input order.
    """
    # One (query, text) pair per document, in input order.
    candidate_pairs = []
    for document in documents:
        candidate_pairs.append((query, text_fn(document)))

    scores = cross_encoder.predict(candidate_pairs, batch_size=batch_size)

    # Positions ordered from best to worst score.
    ranking = list(range(len(scores)))
    ranking.sort(key=scores.__getitem__, reverse=True)

    # Wrap the best `limit` documents as Qdrant-like result objects.
    return [
        SimpleNamespace(
            payload=documents[position],
            score=float(scores[position]),
            id=documents[position].get("id", position),
        )
        for position in ranking[:limit]
    ]

In [None]:
from sentence_transformers import CrossEncoder
import pandas as pd

# Rank the complete expert collection with the cross-encoder for every
# evaluation query and pool the top-LIMIT hits.
for idx, query in enumerate(queries_for_pooling):

    rel_docs = crossencoder_query_points(
        cross_model, query=query, documents=expert_data, limit=LIMIT, batch_size=32
    )

    # Progress log: running index, query text, number of hits kept.
    print(idx, query, sep="\n")
    print("Found Docs: ", len(rel_docs))

    # Add the hits to the pool under this model's method label.
    pool_df = append_retrieval_results_to_pool(
        pool_df,
        search_method="mmarco-mMiniLMv2-L12-H384-v1",
        query=query,
        rel_docs=rel_docs,
    )

0
IT-Systemadministration
Found Docs:  5
1
Information Technology
Found Docs:  5
2
Wissensmanagement
Found Docs:  5
3
Konstruktion
Found Docs:  5
4
Metallhandwerk
Found Docs:  5
5
Mechanik
Found Docs:  5
6
Automatisierungstechnik
Found Docs:  5
7
Elektrotechnik
Found Docs:  5
8
Konstruktion, Metallhandwerk
Found Docs:  5
9
Mechanik, Elektrotechnik
Found Docs:  5
10
Metallhandwerk, Mechanik
Found Docs:  5
11
Metallhandwerk, Elektrotechnik
Found Docs:  5
12
Mechanik, Automatisierungstechnik
Found Docs:  5
13
Automatisierungstechnik, Elektrotechnik
Found Docs:  5
14
KI
Found Docs:  5
15
Public Relations
Found Docs:  5
16
Sales
Found Docs:  5
17
Social Media Marketing
Found Docs:  5
18
Public Relations, Social Media Marketing
Found Docs:  5
19
Marketing
Found Docs:  5
20
Brand Management
Found Docs:  5
21
Marketing, Social Media Marketing
Found Docs:  5
22
Marketing, Brand Management
Found Docs:  5
23
Social Media Marketing, Brand Management
Found Docs:  5
24
Employee Development
Found Doc

In [None]:
pool_df

Unnamed: 0,search_method,query,doc_id,doc_raw,score
0,distiluse-base-multilingual-cased-v2,IT-Systemadministration,67251b202f496742be0ea207,"{'branches': [], 'companyLocationCity': 'Wuppe...",0.207031
1,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67692f496742be0b1d80,"{'branches': ['Technologie'], 'companyLocation...",0.109566
2,distiluse-base-multilingual-cased-v2,IT-Systemadministration,6670848036b0c2e419593c7c,"{'branches': ['Technologie'], 'companyLocation...",0.107298
3,distiluse-base-multilingual-cased-v2,IT-Systemadministration,65b0d5453897d6f0e6516f88,"{'branches': ['Technologie'], 'companyLocation...",0.105019
4,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67082f496742be0b1d07,"{'branches': ['Technologie', 'Unternehmensdien...",0.094808
...,...,...,...,...,...
835,mmarco-mMiniLMv2-L12-H384-v1,"Business Development, Business Process Analysis",65c5e1d861580d8db5fe02d4,"{'branches': ['Investor (Eigenkapital)', 'Unte...",4.049880
836,mmarco-mMiniLMv2-L12-H384-v1,"Business Development, Business Process Analysis",67d44084b5308acb98802c45,"{'branches': ['Technologie', 'Immobilien', 'Me...",3.989550
837,mmarco-mMiniLMv2-L12-H384-v1,"Business Development, Business Process Analysis",66eacb9b41f659468bdda50c,"{'branches': ['Automobilindustrie'], 'companyL...",3.393510
838,mmarco-mMiniLMv2-L12-H384-v1,"Business Development, Business Process Analysis",6695032d5199a106e76ad753,"{'branches': ['Unternehmensdienstleistungen'],...",3.121296


In [None]:
# Persist after each complete method run. Do not call this per query:
# append_pool_df_to_csv skips every search_method that is already in the CSV,
# so rows of later queries for the same method would not be written.
append_pool_df_to_csv(pool_df, PATH_TEST_COLLECTION_DATA + "pooling_results_raw.csv")

Added data from  {'mmarco-mMiniLMv2-L12-H384-v1'}


## Fetch pool documents for Neural-Sparse Model

In [None]:
from sentence_transformers.sparse_encoder import SparseEncoder

In [None]:
# Download from the 🤗 Hub
sparse_model = SparseEncoder("opensearch-project/opensearch-neural-sparse-encoding-multilingual-v1")

In [None]:
def encode_sparse_for_qdrant(text: str) -> models.SparseVector:
    """
    Encode ``text`` with the global ``sparse_model`` and wrap the result as a
    Qdrant ``SparseVector``.

    NOTE(review): every input goes through ``sparse_model.encode_document``,
    including the search queries passed in by the retrieval loop. Confirm that
    this matches how the collection was indexed, or whether a dedicated query
    encoding is intended for queries.
    """
    # Coalesce before reading indices/values from the sparse tensor.
    sparse = sparse_model.encode_document(text).coalesce()

    # presumably a 1-D sparse tensor, so row 0 of indices() holds the
    # positions of the non-zero entries — TODO confirm
    return models.SparseVector(
        indices=sparse.indices()[0].tolist(),
        values=sparse.values().tolist(),
    )

In [None]:
# Pool the top-LIMIT neural-sparse hits for every evaluation query.
for idx, query in enumerate(queries_for_pooling):
    print(query)

    # Sparse vector search against the Qdrant collection. Stored vectors are
    # not requested: pooling only reads payload, score and id of each hit.
    response = client.query_points(
        collection_name="expert_data_structured_and_fulltext_sparse",
        query=encode_sparse_for_qdrant(query),   # Query as Qdrant SparseVector.
        using="neural_sparse_vector",            # Named sparse vector field.
        limit=LIMIT,
        with_payload=True
    )

    print(idx)
    print("Found Docs: ", len(response.points))

    # Add the hits to the pool under this model's method label.
    pool_df = append_retrieval_results_to_pool(
        pool_df,
        search_method="opensearch-neural-sparse-encoding-multilingual-v1",
        query=query,
        rel_docs=response.points,
    )

IT-Systemadministration
0
Found Docs:  5
Information Technology
1
Found Docs:  5
Wissensmanagement
2
Found Docs:  5
Konstruktion
3
Found Docs:  5
Metallhandwerk
4
Found Docs:  5
Mechanik
5
Found Docs:  5
Automatisierungstechnik
6
Found Docs:  5
Elektrotechnik
7
Found Docs:  5
Konstruktion, Metallhandwerk
8
Found Docs:  5
Mechanik, Elektrotechnik
9
Found Docs:  5
Metallhandwerk, Mechanik
10
Found Docs:  5
Metallhandwerk, Elektrotechnik
11
Found Docs:  5
Mechanik, Automatisierungstechnik
12
Found Docs:  5
Automatisierungstechnik, Elektrotechnik
13
Found Docs:  5
KI
14
Found Docs:  5
Public Relations
15
Found Docs:  5
Sales
16
Found Docs:  5
Social Media Marketing
17
Found Docs:  5
Public Relations, Social Media Marketing
18
Found Docs:  5
Marketing
19
Found Docs:  5
Brand Management
20
Found Docs:  5
Marketing, Social Media Marketing
21
Found Docs:  5
Marketing, Brand Management
22
Found Docs:  5
Social Media Marketing, Brand Management
23
Found Docs:  5
Employee Development
24
Found Doc

In [None]:
pool_df

Unnamed: 0,search_method,query,doc_id,doc_raw,score
0,distiluse-base-multilingual-cased-v2,IT-Systemadministration,67251b202f496742be0ea207,"{'branches': [], 'companyLocationCity': 'Wuppe...",0.207031
1,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67692f496742be0b1d80,"{'branches': ['Technologie'], 'companyLocation...",0.109566
2,distiluse-base-multilingual-cased-v2,IT-Systemadministration,6670848036b0c2e419593c7c,"{'branches': ['Technologie'], 'companyLocation...",0.107298
3,distiluse-base-multilingual-cased-v2,IT-Systemadministration,65b0d5453897d6f0e6516f88,"{'branches': ['Technologie'], 'companyLocation...",0.105019
4,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67082f496742be0b1d07,"{'branches': ['Technologie', 'Unternehmensdien...",0.094808
...,...,...,...,...,...
1255,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",67d44084b5308acb98802c45,"{'branches': ['Technologie', 'Immobilien', 'Me...",9.613712
1256,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",65c5352ec8a0f67c4c5cfdb0,"{'branches': ['Immobilien'], 'companyLocationC...",9.471822
1257,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",66d1a5de528e751260e38ff3,"{'branches': ['Technologie'], 'companyLocation...",9.155007
1258,opensearch-neural-sparse-encoding-multilingual-v1,"Business Development, Business Process Analysis",67487abbb546838eefc5467d,"{'branches': ['Unternehmensdienstleistungen'],...",8.847045


In [None]:
# Persist after each complete method run. Do not call this per query:
# append_pool_df_to_csv skips every search_method that is already in the CSV,
# so rows of later queries for the same method would not be written.
append_pool_df_to_csv(pool_df, PATH_TEST_COLLECTION_DATA + "pooling_results_raw.csv")

Added data from  {'opensearch-neural-sparse-encoding-multilingual-v1'}


## Fetch pool documents for Late-Interaction Model

In [None]:
from fastembed import LateInteractionTextEmbedding

late_model = LateInteractionTextEmbedding("jinaai/jina-colbert-v2")
print(late_model.embedding_size)

128


In [None]:
# Pool the top-LIMIT late-interaction hits for every evaluation query.
for idx, query in enumerate(queries_for_pooling):

    # query_embed is consumed as an iterator; take the embedding of the single
    # query passed in and convert it to nested plain lists.
    embedding_iter = late_model.query_embed([query])
    query_embedding = next(embedding_iter).tolist()

    # Search the late-interaction collection using the named vector field.
    rel_docs = client.query_points(
        collection_name="expert_data_structured_and_fulltext_late",
        query=query_embedding,
        using="late_interaction",
        limit=LIMIT,
    ).points

    # Progress log: running index, query text, number of hits kept.
    print(idx, query, sep="\n")
    print("Found Docs: ", len(rel_docs))

    # Add the hits to the pool under this model's method label.
    pool_df = append_retrieval_results_to_pool(
        pool_df,
        search_method="jina-colbert-v2",
        query=query,
        rel_docs=rel_docs,
    )

0
IT-Systemadministration
Found Docs:  5
1
Information Technology
Found Docs:  5
2
Wissensmanagement
Found Docs:  5
3
Konstruktion
Found Docs:  5
4
Metallhandwerk
Found Docs:  5
5
Mechanik
Found Docs:  5
6
Automatisierungstechnik
Found Docs:  5
7
Elektrotechnik
Found Docs:  5
8
Konstruktion, Metallhandwerk
Found Docs:  5
9
Mechanik, Elektrotechnik
Found Docs:  5
10
Metallhandwerk, Mechanik
Found Docs:  5
11
Metallhandwerk, Elektrotechnik
Found Docs:  5
12
Mechanik, Automatisierungstechnik
Found Docs:  5
13
Automatisierungstechnik, Elektrotechnik
Found Docs:  5
14
KI
Found Docs:  5
15
Public Relations
Found Docs:  5
16
Sales
Found Docs:  5
17
Social Media Marketing
Found Docs:  5
18
Public Relations, Social Media Marketing
Found Docs:  5
19
Marketing
Found Docs:  5
20
Brand Management
Found Docs:  5
21
Marketing, Social Media Marketing
Found Docs:  5
22
Marketing, Brand Management
Found Docs:  5
23
Social Media Marketing, Brand Management
Found Docs:  5
24
Employee Development
Found Doc

In [None]:
pool_df

Unnamed: 0,search_method,query,doc_id,doc_raw,score
0,distiluse-base-multilingual-cased-v2,IT-Systemadministration,67251b202f496742be0ea207,"{""branches"": [], ""companyLocationCity"": ""Wuppe...",0.207031
1,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67692f496742be0b1d80,"{""branches"": [""Technologie""], ""companyLocation...",0.109566
2,distiluse-base-multilingual-cased-v2,IT-Systemadministration,6670848036b0c2e419593c7c,"{""branches"": [""Technologie""], ""companyLocation...",0.107298
3,distiluse-base-multilingual-cased-v2,IT-Systemadministration,65b0d5453897d6f0e6516f88,"{""branches"": [""Technologie""], ""companyLocation...",0.105019
4,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67082f496742be0b1d07,"{""branches"": [""Technologie"", ""Unternehmensdien...",0.094808
...,...,...,...,...,...
1675,jina-colbert-v2,"Business Development, Business Process Analysis",65c4d290c8a0f67c4c5ce5fb,"{'branches': ['Technologie', 'Medien & Werbung...",17.635206
1676,jina-colbert-v2,"Business Development, Business Process Analysis",66eacb9b41f659468bdda50c,"{'branches': ['Automobilindustrie'], 'companyL...",17.618885
1677,jina-colbert-v2,"Business Development, Business Process Analysis",6749c74eb546838eefc6115b,"{'branches': ['Technologie', 'Immobilien', 'Me...",17.507235
1678,jina-colbert-v2,"Business Development, Business Process Analysis",67487abbb546838eefc5467d,"{'branches': ['Unternehmensdienstleistungen'],...",17.500910


In [None]:
# Persist after each complete method run. Do not call this per query:
# append_pool_df_to_csv skips every search_method that is already in the CSV,
# so rows of later queries for the same method would not be written.
append_pool_df_to_csv(pool_df, PATH_TEST_COLLECTION_DATA + "pooling_results_raw.csv")

No data to add


## Fetch pool documents for TF-IDF

In [None]:
# source: https://snowballstem.org/algorithms/german/stop.txt

GERMAN_STOPWORDS = [
    "aber",
    "alle", "allem", "allen", "aller", "alles",
    "als", "also", "am", "an",
    "ander", "andere", "anderem", "anderen", "anderer", "anderes",
    "anderm", "andern", "anderr", "anders",
    "auch", "auf", "aus", "bei",
    "bin", "bis", "bist",
    "da", "damit", "dann",
    "der", "den", "des", "dem", "die", "das",
    "daß",
    "derselbe", "derselben", "denselben", "desselben", "demselben",
    "dieselbe", "dieselben", "dasselbe",
    "dazu",
    "dein", "deine", "deinem", "deinen", "deiner", "deines",
    "denn",
    "derer", "dessen",
    "dich", "dir", "du",
    "dies", "diese", "diesem", "diesen", "dieser", "dieses",
    "doch", "dort",
    "durch",
    "ein", "eine", "einem", "einen", "einer", "eines",
    "einig", "einige", "einigem", "einigen", "einiger", "einiges",
    "einmal",
    "er", "ihn", "ihm",
    "es", "etwas",
    "euer", "eure", "eurem", "euren", "eurer", "eures",
    "für", "gegen",
    "gewesen",
    "hab", "habe", "haben", "hat", "hatte", "hatten",
    "hier", "hin", "hinter",
    "ich", "mich", "mir",
    "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres",
    "euch",
    "im", "in", "indem", "ins",
    "ist",
    "jede", "jedem", "jeden", "jeder", "jedes",
    "jene", "jenem", "jenen", "jener", "jenes",
    "jetzt",
    "kann",
    "kein", "keine", "keinem", "keinen", "keiner", "keines",
    "können", "könnte",
    "machen",
    "man",
    "manche", "manchem", "manchen", "mancher", "manches",
    "mein", "meine", "meinem", "meinen", "meiner", "meines",
    "mit",
    "muss", "musste",
    "nach",
    "nicht", "nichts",
    "noch", "nun", "nur",
    "ob", "oder", "ohne",
    "sehr",
    "sein", "seine", "seinem", "seinen", "seiner", "seines",
    "selbst", "sich",
    "sie", "ihnen",
    "sind",
    "so",
    "solche", "solchem", "solchen", "solcher", "solches",
    "soll", "sollte",
    "sondern", "sonst",
    "über",
    "um",
    "und",
    "uns", "unse", "unsem", "unsen", "unser", "unses",
    "unter",
    "viel",
    "vom", "von", "vor",
    "während",
    "war", "waren", "warst",
    "was",
    "weg",
    "weil",
    "weiter",
    "welche", "welchem", "welchen", "welcher", "welches",
    "wenn",
    "werde", "werden",
    "wie",
    "wieder",
    "will",
    "wir",
    "wird",
    "wirst",
    "wo",
    "wollen", "wollte",
    "würde", "würden",
    "zu", "zum", "zur",
    "zwar",
    "zwischen"
]


In [None]:
import json
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Precompiled pattern matching one or more consecutive whitespace characters;
# normalize() uses it to collapse such runs into a single space.
_whitespace = re.compile(r"\s+")

def json_to_text(doc: dict) -> str:
    """
    Serialize a complete expert profile into one JSON string.

    Non-ASCII characters (e.g. German umlauts) are kept as-is instead of being
    escaped; keys are emitted in the dict's own iteration order.
    """
    serialized = json.dumps(doc, ensure_ascii=False)
    return serialized


def normalize(text: str) -> str:
    """
    Apply minimal normalization to a text.

    The text is lowercased, every run of whitespace is collapsed into a single
    space, and leading/trailing whitespace is removed.
    """
    lowered = text.lower()
    collapsed = _whitespace.sub(" ", lowered)
    return collapsed.strip()

In [None]:
def build_tfidf_index(
    expert_profiles: list[dict],
    min_df: int | float = 2,
    max_df: int | float = 0.95,
    ngram_range: tuple[int, int] = (1, 1),
    verbose: bool = False,
):
    """
    Build a TF-IDF index over the complete JSON-serialized expert profiles.

    Args:
        expert_profiles: Expert profiles; each one is serialized with
            json_to_text() and normalized with normalize() before indexing.
        min_df: Forwarded to TfidfVectorizer(min_df=...).
        max_df: Forwarded to TfidfVectorizer(max_df=...).
        ngram_range: Forwarded to TfidfVectorizer(ngram_range=...).
        verbose: If True, print the full normalized corpus for debugging.
            Disabled by default because the corpus consists of complete
            expert profiles and would flood the notebook output.

    Returns:
        Tuple ``(vectorizer, X)``: the fitted TfidfVectorizer and the TF-IDF
        matrix returned by ``fit_transform`` (one row per profile).
    """

    # Convert each expert profile to its normalized text representation.
    texts = [
        normalize(json_to_text(profile))
        for profile in expert_profiles
    ]

    if verbose:
        print(texts)  # Debug: inspect generated corpus

    # Configure the TF-IDF vectorizer with controlled preprocessing.
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words=GERMAN_STOPWORDS,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        sublinear_tf=False,
        norm="l2",
        token_pattern=r"(?u)\b\w\w+\b",
    )

    # Fit the vectorizer and transform the corpus into the TF-IDF matrix.
    X = vectorizer.fit_transform(texts)  # shape: (n_docs, n_terms)

    return vectorizer, X

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_search(
    query: str,
    vectorizer: TfidfVectorizer,
    X,
    expert_profiles: list[dict],
    top_k: int = 10,
):
    """
    Run a TF-IDF search and return the top-k hits in the pooling format.

    Args:
        query: Raw query string; it is normalized with normalize() before
            being vectorized.
        vectorizer: Fitted vectorizer used to transform the query.
        X: Document-term matrix whose rows correspond positionally to
            ``expert_profiles``.
        expert_profiles: Original profiles, used as result payloads.
        top_k: Maximum number of hits to return.

    Returns:
        List of dicts sorted by descending score, each with
          - id: ``payload["id"]`` if present, otherwise the document index
          - payload: the complete expert profile
          - score: dot product of document and query vector as float (this is
            the cosine similarity when the vectorizer L2-normalizes, as
            build_tfidf_index configures it)
        An empty list is returned when the index is empty or ``top_k <= 0``.
    """

    # Transform the normalized query into a TF-IDF vector.
    q = vectorizer.transform([normalize(query)])   # shape: (1, n_terms)

    # Score every document via dot product with the query vector.
    scores = (X @ q.T).toarray().ravel()           # shape: (n_docs,)

    # Guard: argpartition needs a valid kth; an empty index or a non-positive
    # top_k would otherwise raise or slice off the wrong elements.
    k = min(top_k, len(scores))
    if k <= 0:
        return []

    # Select the top-k indices (partial sort), then order them by score desc.
    idx = np.argpartition(-scores, k - 1)[:k]
    idx = idx[np.argsort(-scores[idx])]

    results = []

    for i in idx:
        pos = int(i)
        payload = expert_profiles[pos]

        # Prefer the explicit document ID if available.
        doc_id = payload.get("id", pos) if isinstance(payload, dict) else pos

        results.append({
            "id": doc_id,
            "payload": payload,
            "score": float(scores[pos]),
        })

    return results

In [None]:
# Fit the TF-IDF index once over all expert profiles; `vectorizer` and `X`
# are reused by the tfidf_search() calls in the pooling loop below.
vectorizer, X = build_tfidf_index(expert_data)

['{"branches": ["technologie", "medien & werbung", "unternehmensdienstleistungen"], "companylocationcity": "lennestadt", "companylocationstreet": "hagener straße 64", "companylocationzip": 57368.0, "companyname": "lenne.tech gmbh", "companytypes": ["dienstleistung"], "companywebsite": "https://lenne.tech", "description": "seit über 15 jahren begleite ich digitale projekte von der konzeption bis zur \\"schlüsselfertigen\\" anwendung. als mitglied im team von lenne.tech (https://lenne.tech) entwickle ich apps & webanwendungen und unterstütze unternehmen beim aufbau sowie bei der weiterbildung von entwicklungsteams. im rahmen unserer lenne.learning akademie (https://lennelearning.de) organisiere ich gemeinsam mit anderen tutoren & mentoren das recruiting sowie das onboarding neuer entwickler für partnerunternehmen.", "employeeofinstitutionnames": [], "firstname": "ege", "gender": "male", "id": "65acfb5a3897d6f0e6506db1", "jobtitle": "softwareentwickler", "lastname": "siebert", "projectsde

In [None]:
# Run TF-IDF retrieval for every pooling query and add the hits to the pool.
for query in queries_for_pooling:

    # Execute sparse lexical search.
    results = tfidf_search(
        query=query,
        vectorizer=vectorizer,
        X=X,
        expert_profiles=expert_data,
        top_k=LIMIT,
    )

    # Compact progress log: only the query and the number of hits. The result
    # payloads are complete expert profiles (personal data) and are therefore
    # deliberately not printed.
    print(query)
    print("Found Docs: ", len(results))

    # Append retrieval results to the pooling DataFrame.
    pool_df = append_retrieval_results_to_pool(
        pool_df,
        search_method="tf-idf",
        query=query,
        rel_docs=results,
    )

IT-Systemadministration
[{'id': '6682c449680d5b5995b4229d', 'payload': {'branches': ['Technologie', 'Unternehmensdienstleistungen'], 'companyLocationCity': 'Hagen', 'companyLocationStreet': 'Hochofenstr. 22 ', 'companyLocationZip': 58135.0, 'companyName': 'stepIT.net Operations GmbH', 'companyTypes': ['Dienstleistung'], 'companyWebsite': 'www.stepIT.net', 'description': None, 'employeeOfInstitutionNames': [], 'firstName': 'Domenic', 'gender': 'MALE', 'id': '6682c449680d5b5995b4229d', 'jobTitle': 'CEO', 'lastName': 'Wartenberg', 'projectsDescription': None, 'skills': ['IT-Management', 'IT-Systemadministration', 'Human Resources', 'Information Technology', 'Cybersecurity'], 'title': None, 'full_text': 'Domenic Wartenberg verfügt als CEO der stepIT.net Operations GmbH über ausgeprägte Kompetenzen in IT-Management, IT-Systemadministration, Information Technology, Human Resources und Cybersecurity. In den Bereichen Technologie und Unternehmensdienstleistungen setzt er seine Expertise geziel

In [None]:
# Run after every method run or query.
# NOTE(review): presumably persists the current pool to the raw CSV — confirm in append_pool_df_to_csv.
append_pool_df_to_csv(pool_df, PATH_TEST_COLLECTION_DATA + "pooling_results_raw.csv")

Added data from  {'tf-idf'}


## Fetch pool documents for BM25

In [None]:
#https://dl.acm.org/doi/10.1145/2682862.2682863

In [None]:
!pip install rank-bm25



In [None]:
from __future__ import annotations

from rank_bm25 import BM25Okapi
import json
import re
from typing import Iterable, Sequence
import numpy as np


# Precompiled regexes for performance (used in normalize / tokenize)
_whitespace = re.compile(r"\s+")
_token_re = re.compile(r"(?u)\b\w\w+\b")  # same token definition as the TF-IDF vectorizer's token_pattern


def json_to_text(doc: dict) -> str:
    """Serialize an expert profile (dict) to a JSON string, keeping non-ASCII characters unescaped."""
    profile_json = json.dumps(doc, ensure_ascii=False)
    return profile_json


def normalize(text: str) -> str:
    """Lowercase the text, collapse whitespace runs to one space, and trim both ends."""
    return _whitespace.sub(" ", text.lower()).strip()


def tokenize(text: str) -> list[str]:
    """
    Split text into tokens using the module-level ``_token_re`` pattern.

    The pattern mirrors the TF-IDF configuration: Unicode-aware word tokens
    of at least two word characters.
    """
    return [match.group(0) for match in _token_re.finditer(text)]


def preprocess_for_retrieval(
    text: str,
    *,
    stop_words: set[str] | None = None,
    ngram_range: tuple[int, int] = (1, 1),
) -> list[str]:
    """
    Uniform preprocessing for retrieval (TF-IDF / BM25).

    Pipeline:
      1. normalize
      2. tokenize
      3. optional stopword filtering
      4. optional n-gram generation

    Args:
        text: Raw text (document or query).
        stop_words: Tokens to drop. Any collection of strings is accepted; a
            non-set collection (e.g. a list) is converted to a frozenset once
            per call so membership tests do not scan the collection per token.
        ngram_range: ``(n_min, n_max)`` n-gram sizes to emit; ``(1, 1)``
            returns the plain unigram tokens. Sizes <= 0 are skipped.

    Returns:
        List of tokens, or of space-joined n-grams when ``ngram_range`` is not
        ``(1, 1)``. N-grams are built after stopword removal.
    """
    tokens = tokenize(normalize(text))

    # Optional stopword filter.
    if stop_words:
        stop_set = (
            stop_words
            if isinstance(stop_words, (set, frozenset))
            else frozenset(stop_words)
        )
        tokens = [t for t in tokens if t not in stop_set]

    n_min, n_max = ngram_range
    if (n_min, n_max) == (1, 1):
        return tokens  # Baseline: unigrams

    # N-gram generation (only when explicitly requested).
    ngrams: list[str] = []
    n_tokens = len(tokens)

    # Starting at max(n_min, 1) skips non-positive n-gram sizes.
    for n in range(max(n_min, 1), n_max + 1):
        ngrams.extend(
            " ".join(tokens[i : i + n]) for i in range(n_tokens - n + 1)
        )

    return ngrams


def build_bm25_index(
    expert_profiles: Sequence[dict],
    *,
    stop_words: set[str] | None = None,
    ngram_range: tuple[int, int] = (1, 1),   # Default: unigrams
    k1: float = 2,
    b: float = 0.5,
) -> tuple[BM25Okapi, list[list[str]]]:
    """
    Build a BM25 index over the expert profiles.

    Each profile is serialized with json_to_text() and tokenized with
    preprocess_for_retrieval() using the given ``stop_words`` and
    ``ngram_range``; ``k1`` and ``b`` are forwarded to BM25Okapi.

    Returns:
      - the BM25Okapi instance
      - the tokenized documents (for debugging / analysis)
    """
    tokenized_docs: list[list[str]] = []

    for profile in expert_profiles:
        profile_text = json_to_text(profile)
        profile_tokens = preprocess_for_retrieval(
            profile_text,
            stop_words=stop_words,
            ngram_range=ngram_range,
        )
        tokenized_docs.append(profile_tokens)

    return BM25Okapi(tokenized_docs, k1=k1, b=b), tokenized_docs


def bm25_search(
    query: str,
    bm25: BM25Okapi,
    expert_profiles: Sequence[dict],
    *,
    stop_words: set[str] | None = None,
    ngram_range: tuple[int, int] = (1, 1),  # Default: unigrams
    top_k: int = 10,
):
    """
    Run a BM25 search and return the top-k hits.

    Args:
        query: Raw query string; preprocessed with preprocess_for_retrieval()
            using ``stop_words`` and ``ngram_range`` (pass the same values
            that were used to build the index).
        bm25: BM25 index whose documents correspond positionally to
            ``expert_profiles``.
        expert_profiles: Original profiles, used as result payloads.
        stop_words: Stopwords removed from the query.
        ngram_range: N-gram range applied to the query tokens.
        top_k: Maximum number of hits to return.

    Returns:
        List of dicts sorted by descending score, in the same format as
        tfidf_search():
          - id: ``payload["id"]`` if present, otherwise the document index
          - payload: the original profile
          - score: BM25 score as float
        An empty list is returned when the index is empty or ``top_k <= 0``.
    """
    # Query preprocessing identical to indexing.
    q_tokens = preprocess_for_retrieval(
        query,
        stop_words=stop_words,
        ngram_range=ngram_range,
    )

    # BM25 score per document (shape: n_docs,).
    scores = bm25.get_scores(q_tokens)

    # Guard: argpartition needs a valid kth; an empty index or a non-positive
    # top_k would otherwise raise or slice off the wrong elements.
    k = min(top_k, len(scores))
    if k <= 0:
        return []

    # Efficient top-k selection without a full sort, then final ordering.
    idx = np.argpartition(-scores, k - 1)[:k]
    idx = idx[np.argsort(-scores[idx])]

    results = []

    for i in idx:
        pos = int(i)
        payload = expert_profiles[pos]

        # Same ID rule as tfidf_search(): fall back to the document index so
        # hits without an explicit "id" do not all collapse to None.
        doc_id = payload.get("id", pos) if isinstance(payload, dict) else pos

        results.append({
            "id": doc_id,
            "payload": payload,
            "score": float(scores[pos]),
        })

    return results

In [None]:
# Tokenize every expert profile with the BM25 preprocessing (stopword
# removal, unigrams) to inspect what will be indexed.
tokenized_docs = [
    preprocess_for_retrieval(
        json_to_text(profile),
        stop_words=GERMAN_STOPWORDS,
        ngram_range=(1, 1),
    )
    for profile in expert_data
]

# Show the token list of the first profile.
tokenized_docs[:1]


In [None]:
# Preprocessing settings shared by indexing and querying so both sides always
# tokenize identically: same stopword list as TF-IDF, unigram baseline.
_bm25_preprocessing = {
    "stop_words": GERMAN_STOPWORDS,
    "ngram_range": (1, 1),
}

# Build the BM25 index over all expert profiles.
bm25, tokenized_docs = build_bm25_index(
    expert_data,
    k1=2,
    b=0.5,
    **_bm25_preprocessing,
)

# Pooling: run every evaluation query; only the top-k hits per query are
# added to the shared relevance pool used for manual assessment.
for query in queries_for_pooling:
    results = bm25_search(
        query,
        bm25,
        expert_data,
        top_k=LIMIT,
        **_bm25_preprocessing,
    )

    pool_df = append_retrieval_results_to_pool(
        pool_df,
        search_method="okapi_bm25",
        query=query,
        rel_docs=results,
    )

In [None]:
pool_df

Unnamed: 0,search_method,query,doc_id,doc_raw,score
0,distiluse-base-multilingual-cased-v2,IT-Systemadministration,67251b202f496742be0ea207,"{""branches"": [], ""companyLocationCity"": ""Wuppe...",0.207031
1,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67692f496742be0b1d80,"{""branches"": [""Technologie""], ""companyLocation...",0.109566
2,distiluse-base-multilingual-cased-v2,IT-Systemadministration,6670848036b0c2e419593c7c,"{""branches"": [""Technologie""], ""companyLocation...",0.107298
3,distiluse-base-multilingual-cased-v2,IT-Systemadministration,65b0d5453897d6f0e6516f88,"{""branches"": [""Technologie""], ""companyLocation...",0.105019
4,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67082f496742be0b1d07,"{""branches"": [""Technologie"", ""Unternehmensdien...",0.094808
...,...,...,...,...,...
2515,okapi_bm25,"Business Development, Business Process Analysis",65d64dae61580d8db5016c17,"{'branches': ['Unternehmensdienstleistungen'],...",7.047387
2516,okapi_bm25,"Business Development, Business Process Analysis",65b1525c3897d6f0e651b697,"{'branches': ['Unternehmensdienstleistungen', ...",6.887179
2517,okapi_bm25,"Business Development, Business Process Analysis",6638bc9268131729196ea198,"{'branches': ['Investor (Eigenkapital)', 'Unte...",6.785589
2518,okapi_bm25,"Business Development, Business Process Analysis",65aff66a3897d6f0e6512478,"{'branches': ['Unternehmensdienstleistungen'],...",6.726348


In [None]:
# Run after every method run or query.
# NOTE(review): presumably persists the current pool to the raw CSV — confirm in append_pool_df_to_csv.
append_pool_df_to_csv(pool_df, PATH_TEST_COLLECTION_DATA + "pooling_results_raw.csv")

Added data from  {'okapi_bm25'}


## Generate cleaned pool

After the raw pooling results are generated, the data needs to be deduplicated to get the final pool.

**Caution**:
The following code cells should only be run once the raw data includes search results from every one of the above pooling methods. This means the raw data must contain a combined total of 5040 data rows. Otherwise, some of the methods have not run yet or have run multiple times. So ensure that each method contributes exactly 840 rows to the dataset.

In [None]:
import pandas as pd
from pathlib import Path

# Path to the raw pooling results file inside the test collection folder.
pool_path = Path(PATH_TEST_COLLECTION_DATA) / "pooling_results_raw.csv"

# The cells below must work on the persisted raw pool. Fail loudly when the
# file is missing instead of silently continuing with whatever `pool_df`
# happens to be left in the kernel (or with none at all).
if not pool_path.exists():
    raise FileNotFoundError(f"Raw pooling results not found: {pool_path}")

print("Found")
pool_df = pd.read_csv(pool_path)

Found


In [None]:
pool_df

Unnamed: 0,search_method,query,doc_id,doc_raw,score
0,distiluse-base-multilingual-cased-v2,IT-Systemadministration,67251b202f496742be0ea207,"{""branches"": [], ""companyLocationCity"": ""Wuppe...",0.207031
1,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67692f496742be0b1d80,"{""branches"": [""Technologie""], ""companyLocation...",0.109566
2,distiluse-base-multilingual-cased-v2,IT-Systemadministration,6670848036b0c2e419593c7c,"{""branches"": [""Technologie""], ""companyLocation...",0.107298
3,distiluse-base-multilingual-cased-v2,IT-Systemadministration,65b0d5453897d6f0e6516f88,"{""branches"": [""Technologie""], ""companyLocation...",0.105019
4,distiluse-base-multilingual-cased-v2,IT-Systemadministration,671f67082f496742be0b1d07,"{""branches"": [""Technologie"", ""Unternehmensdien...",0.094808
...,...,...,...,...,...
2515,okapi_bm25,"Business Development, Business Process Analysis",65d64dae61580d8db5016c17,"{""branches"": [""Unternehmensdienstleistungen""],...",7.047387
2516,okapi_bm25,"Business Development, Business Process Analysis",65b1525c3897d6f0e651b697,"{""branches"": [""Unternehmensdienstleistungen"", ...",6.887179
2517,okapi_bm25,"Business Development, Business Process Analysis",6638bc9268131729196ea198,"{""branches"": [""Investor (Eigenkapital)"", ""Unte...",6.785589
2518,okapi_bm25,"Business Development, Business Process Analysis",65aff66a3897d6f0e6512478,"{""branches"": [""Unternehmensdienstleistungen""],...",6.726348


In [None]:
# Reduce the pool to one row per document ID so that every document appears
# only once in the final assessment set. Only the ID and the raw document are
# kept; the first occurrence of each ID wins and the index is renumbered.
unique_docs_df = (
    pool_df[["doc_id", "doc_raw"]]
    .drop_duplicates(subset="doc_id", keep="first")
    .reset_index(drop=True)
)

# Sanity check: pool size before and after deduplication.
print(len(pool_df), "→", len(unique_docs_df))

# Preview the unique document set.
unique_docs_df.head()

2520 → 259


Unnamed: 0,doc_id,doc_raw
0,67251b202f496742be0ea207,"{""branches"": [], ""companyLocationCity"": ""Wuppe..."
1,671f67692f496742be0b1d80,"{""branches"": [""Technologie""], ""companyLocation..."
2,6670848036b0c2e419593c7c,"{""branches"": [""Technologie""], ""companyLocation..."
3,65b0d5453897d6f0e6516f88,"{""branches"": [""Technologie""], ""companyLocation..."
4,671f67082f496742be0b1d07,"{""branches"": [""Technologie"", ""Unternehmensdien..."


In [None]:
# Map original expert profiles back into the pooled DataFrame
import json

# Lookup table: profile id (as str) -> full profile dict.
# Profiles without an "id" key are not part of the map.
expert_map = {str(p["id"]): p for p in expert_data if "id" in p}

# Use strings as join key on both sides.
unique_docs_df["doc_id"] = unique_docs_df["doc_id"].astype(str)

# Attach the full profile dictionary to each pooled document.
unique_docs_df["profile_dict"] = unique_docs_df["doc_id"].map(expert_map)

# Pooled IDs without a matching profile end up with an empty profile_dict and
# doc_raw below; report them instead of dropping their content silently.
unmapped_ids = unique_docs_df.loc[
    unique_docs_df["profile_dict"].isna(), "doc_id"
].tolist()
if unmapped_ids:
    print(
        f"WARNING: {len(unmapped_ids)} pooled doc_id(s) have no matching "
        f"expert profile, e.g. {unmapped_ids[:10]}"
    )

# Overwrite the raw document field with one uniform JSON serialization of the
# mapped profile (None where no profile was found).
unique_docs_df["doc_raw"] = unique_docs_df["profile_dict"].apply(
    lambda d: json.dumps(d, ensure_ascii=False, default=str)
    if isinstance(d, dict) else None
)

In [None]:
unique_docs_df

Unnamed: 0,doc_id,doc_raw,profile_dict
0,67251b202f496742be0ea207,"{""branches"": [], ""companyLocationCity"": ""Wuppe...","{'branches': [], 'companyLocationCity': 'Wuppe..."
1,671f67692f496742be0b1d80,"{""branches"": [""Technologie""], ""companyLocation...","{'branches': ['Technologie'], 'companyLocation..."
2,6670848036b0c2e419593c7c,"{""branches"": [""Technologie""], ""companyLocation...","{'branches': ['Technologie'], 'companyLocation..."
3,65b0d5453897d6f0e6516f88,"{""branches"": [""Technologie""], ""companyLocation...","{'branches': ['Technologie'], 'companyLocation..."
4,671f67082f496742be0b1d07,"{""branches"": [""Technologie"", ""Unternehmensdien...","{'branches': ['Technologie', 'Unternehmensdien..."
...,...,...,...
254,65b817053897d6f0e6537dce,"{""branches"": [""Technologie"", ""Immobilien"", ""Me...","{'branches': ['Technologie', 'Immobilien', 'Me..."
255,65fae369763465cfa7e2b79f,"{""branches"": [""Investor (Eigenkapital)"", ""Unte...","{'branches': ['Investor (Eigenkapital)', 'Unte..."
256,6683ea8108d4ee614ef75c50,"{""branches"": [""Gesundheitswesen""], ""companyLoc...","{'branches': ['Gesundheitswesen'], 'companyLoc..."
257,673328df51c382bea60f7ef8,"{""branches"": [""Technologie"", ""Automobilindustr...","{'branches': ['Technologie', 'Automobilindustr..."


In [None]:
# Export the deduplicated pool to JSON: one record (object) per pooled
# document, non-ASCII characters kept unescaped, indented by 2 spaces.

final_path = Path(PATH_TEST_COLLECTION_DATA) / "pooling_results_final.json"

unique_docs_df.to_json(
    final_path,
    orient="records",
    force_ascii=False,
    indent=2
)