## This notebook indexes the data for the lexical search methods

In [None]:
# Mount Google Drive at /content/drive so the data and index files can be accessed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory the pooling results JSON is read from.
# NOTE(review): both paths are relative and end with "/"; they presumably rely on the
# working directory being the folder that contains the "drive" mount point — confirm.
PATH_TEST_COLLECTION_DATA="drive/MyDrive/Uni/Master/Masterthesis/Data/test_collection/"
# Directory the TF-IDF and BM25 index files are written to / read from.
PATH_LEXICAL_INDEX="drive/MyDrive/Uni/Master/Masterthesis/Experiments/Indexing/lexical_index/"

In [None]:
import re

# Load pooling data

In [None]:
import json

# Location of the pooling results file.
pooling_file = PATH_TEST_COLLECTION_DATA + "pooling_results_final.json"

# Read the JSON file directly into memory.
with open(pooling_file, "r", encoding="utf-8") as f:
    pooling_results = json.load(f)

# Display the first record as a quick sanity check.
pooling_results[0]


{'doc_id': '67251b202f496742be0ea207',
 'doc_raw': '{"branches": [], "companyLocationCity": "Wuppertal", "companyLocationStreet": "Schwelmer Straße 245", "companyLocationZip": 42389.0, "companyName": "Salto Systems GmbH", "companyTypes": [], "companyWebsite": "saltosystems.com/de", "description": "Ich bin ein Enthusiast für elektronische Zutrittskontrolle und Schließsysteme und es begeistert mich, wenn durch integrative Lösungen Probleme gelöst und Prozesse optimiert werden können.", "employeeOfInstitutionNames": [], "firstName": "Bela", "gender": "MALE", "id": "67251b202f496742be0ea207", "jobTitle": "System- und Projektberater", "lastName": "Marahrens", "projectsDescription": "Elektronische Zutrittskontrolle\\nElektronische Schließsysteme\\nBesuchermanagement\\nCloudbasierte Zutrittskontrolle", "skills": [], "title": null, "full_text": "Bela Marahrens ist System- und Projektberater. Er ist Enthusiast für elektronische Zutrittskontrolle und Schließsysteme und überzeugt davon, dass inte

# Index documents

## Prepare data

In [None]:
# Show which fields the profile of the first record contains.
pooling_results[0]["profile_dict"].keys()

dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title', 'full_text'])

In [None]:
### Variant 1: records whose profile (and doc_raw) no longer contain "full_text"
import copy
import json

# Deep copy first so the loaded pooling results are never touched.
pooling_results_copy = copy.deepcopy(pooling_results)


def _without_fulltext(entry):
    """Return a shallow copy of `entry` with "full_text" dropped from its profile.

    When the entry has a dict under "profile_dict", that dict is replaced by a
    copy lacking "full_text" and "doc_raw" is overwritten with the JSON
    serialization of the reduced profile. Other entries are copied unchanged.
    """
    record = entry.copy()
    profile = record.get("profile_dict")
    if not isinstance(profile, dict):
        return record

    # 1) drop full_text (key order of the remaining fields is preserved)
    reduced_profile = dict(profile)
    reduced_profile.pop("full_text", None)

    # 2) overwrite doc_raw with the serialized reduced profile
    record["profile_dict"] = reduced_profile
    record["doc_raw"] = json.dumps(reduced_profile, ensure_ascii=False, default=str)
    return record


pooling_results_without_fulltext = [
    _without_fulltext(entry) for entry in pooling_results_copy
]

# Check
print(pooling_results_without_fulltext[0]["profile_dict"].keys())
print(pooling_results_without_fulltext[1]["doc_raw"])
print(len(pooling_results_without_fulltext))


dict_keys(['branches', 'companyLocationCity', 'companyLocationStreet', 'companyLocationZip', 'companyName', 'companyTypes', 'companyWebsite', 'description', 'employeeOfInstitutionNames', 'firstName', 'gender', 'id', 'jobTitle', 'lastName', 'projectsDescription', 'skills', 'title'])
{"branches": ["Technologie"], "companyLocationCity": "Siegen", "companyLocationStreet": "Sonnenstraße 33-35", "companyLocationZip": 57078.0, "companyName": "K-iS Systemhaus GmbH", "companyTypes": ["Dienstleistung"], "companyWebsite": "https://www.k-is.com/", "description": "Als Account Manager mit dem Hintergrund der Heilerziehungspflege bringe ich eine einzigartige Kombination aus Empathie und technischem Know-how mit. \nMeine Kernkompetenz ist es, komplexe Herausforderungen zu erkennen und in einfache Lösungen zu übersetzen.\nFlexibilität und Anpassungsfähigkeit sind die Skills, die ich auch gerne bei Ihnen anwende.\nIch stehe Ihnen als vertrauensvoller Partner zur Seite und unterstütze Sie bei der digital

In [None]:
## Variant 2: records that expose only the generated full text as document text
import copy

# Deep copy first so the loaded pooling results are never touched.
pooling_results_copy = copy.deepcopy(pooling_results)


def _fulltext_only(entry):
    """Return a shallow copy of `entry` with "doc_raw" replaced by "doc_raw_fulltext".

    When the entry has a dict under "profile_dict", "doc_raw" is removed and the
    profile's "full_text" value (None if absent) is stored under
    "doc_raw_fulltext". Other entries are copied unchanged.
    """
    record = entry.copy()
    profile = record.get("profile_dict")
    if isinstance(profile, dict):
        record.pop("doc_raw", None)
        record["doc_raw_fulltext"] = profile.get("full_text")
    return record


pooling_results_fulltext_only = [
    _fulltext_only(entry) for entry in pooling_results_copy
]

# Check
print(pooling_results_fulltext_only[0].keys())
print(pooling_results_fulltext_only[0])


dict_keys(['doc_id', 'profile_dict', 'doc_raw_fulltext'])
{'doc_id': '67251b202f496742be0ea207', 'profile_dict': {'branches': [], 'companyLocationCity': 'Wuppertal', 'companyLocationStreet': 'Schwelmer Straße 245', 'companyLocationZip': 42389.0, 'companyName': 'Salto Systems GmbH', 'companyTypes': [], 'companyWebsite': 'saltosystems.com/de', 'description': 'Ich bin ein Enthusiast für elektronische Zutrittskontrolle und Schließsysteme und es begeistert mich, wenn durch integrative Lösungen Probleme gelöst und Prozesse optimiert werden können.', 'employeeOfInstitutionNames': [], 'firstName': 'Bela', 'gender': 'MALE', 'id': '67251b202f496742be0ea207', 'jobTitle': 'System- und Projektberater', 'lastName': 'Marahrens', 'projectsDescription': 'Elektronische Zutrittskontrolle\nElektronische Schließsysteme\nBesuchermanagement\nCloudbasierte Zutrittskontrolle', 'skills': [], 'title': None, 'full_text': 'Bela Marahrens ist System- und Projektberater. Er ist Enthusiast für elektronische Zutrittsk

# Index documents

## Preprocessing for lexical methods

In [None]:
# German stop words that are removed before indexing (used by both the TF-IDF
# vectorizer and the BM25 tokenizer in this notebook).
# Source: https://snowballstem.org/algorithms/german/stop.txt
# All entries are lower-case; texts are lower-cased before they are compared against this list.
# NOTE(review): only the old spelling "daß" is listed; the modern spelling "dass" is not
# part of the list and is therefore not filtered — confirm whether that is intended.

GERMAN_STOPWORDS = [
    "aber",
    "alle", "allem", "allen", "aller", "alles",
    "als", "also", "am", "an",
    "ander", "andere", "anderem", "anderen", "anderer", "anderes",
    "anderm", "andern", "anderr", "anders",
    "auch", "auf", "aus", "bei",
    "bin", "bis", "bist",
    "da", "damit", "dann",
    "der", "den", "des", "dem", "die", "das",
    "daß",
    "derselbe", "derselben", "denselben", "desselben", "demselben",
    "dieselbe", "dieselben", "dasselbe",
    "dazu",
    "dein", "deine", "deinem", "deinen", "deiner", "deines",
    "denn",
    "derer", "dessen",
    "dich", "dir", "du",
    "dies", "diese", "diesem", "diesen", "dieser", "dieses",
    "doch", "dort",
    "durch",
    "ein", "eine", "einem", "einen", "einer", "eines",
    "einig", "einige", "einigem", "einigen", "einiger", "einiges",
    "einmal",
    "er", "ihn", "ihm",
    "es", "etwas",
    "euer", "eure", "eurem", "euren", "eurer", "eures",
    "für", "gegen",
    "gewesen",
    "hab", "habe", "haben", "hat", "hatte", "hatten",
    "hier", "hin", "hinter",
    "ich", "mich", "mir",
    "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres",
    "euch",
    "im", "in", "indem", "ins",
    "ist",
    "jede", "jedem", "jeden", "jeder", "jedes",
    "jene", "jenem", "jenen", "jener", "jenes",
    "jetzt",
    "kann",
    "kein", "keine", "keinem", "keinen", "keiner", "keines",
    "können", "könnte",
    "machen",
    "man",
    "manche", "manchem", "manchen", "mancher", "manches",
    "mein", "meine", "meinem", "meinen", "meiner", "meines",
    "mit",
    "muss", "musste",
    "nach",
    "nicht", "nichts",
    "noch", "nun", "nur",
    "ob", "oder", "ohne",
    "sehr",
    "sein", "seine", "seinem", "seinen", "seiner", "seines",
    "selbst", "sich",
    "sie", "ihnen",
    "sind",
    "so",
    "solche", "solchem", "solchen", "solcher", "solches",
    "soll", "sollte",
    "sondern", "sonst",
    "über",
    "um",
    "und",
    "uns", "unse", "unsem", "unsen", "unser", "unses",
    "unter",
    "viel",
    "vom", "von", "vor",
    "während",
    "war", "waren", "warst",
    "was",
    "weg",
    "weil",
    "weiter",
    "welche", "welchem", "welchen", "welcher", "welches",
    "wenn",
    "werde", "werden",
    "wie",
    "wieder",
    "will",
    "wir",
    "wird",
    "wirst",
    "wo",
    "wollen", "wollte",
    "würde", "würden",
    "zu", "zum", "zur",
    "zwar",
    "zwischen"
]


In [None]:
# Matches any run of one or more whitespace characters.
_whitespace = re.compile(r"\s+")

def normalize(text: str) -> str:
    """
    Minimal normalization (fair and reproducible).

    Lower-cases ``text``, collapses every run of whitespace into a single
    space and removes leading/trailing whitespace.
    """
    lowered = text.lower()
    single_spaced = _whitespace.sub(" ", lowered)
    return single_spaced.strip()


In [None]:
def bm25_tokenize(text: str) -> list[str]:
    """
    Tokenize a text for BM25 indexing.

    The text is passed through ``normalize`` first; tokens are runs of at least
    two word characters (the same pattern the TF-IDF vectorizer in this
    notebook uses as ``token_pattern``). Tokens contained in
    ``GERMAN_STOPWORDS`` are dropped.

    Args:
        text: The raw document or query text.

    Returns:
        The remaining tokens in their original order.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the stop-word list for every token.
    stopwords = frozenset(GERMAN_STOPWORDS)
    tokens = re.findall(r"(?u)\b\w\w+\b", normalize(text))
    return [t for t in tokens if t not in stopwords]

## Save index to files

In [None]:
from scipy.sparse import save_npz, load_npz
import joblib
import json

## TF-IDF
def save_tfidf_index(file_name, vectorizer, X):
    """
    Persist a TF-IDF index (matrix and vectorizer).

    Writes ``<file_name>_matrix.npz`` (via ``save_npz``) and
    ``<file_name>_vectorizer.joblib`` (via ``joblib.dump``) below
    PATH_LEXICAL_INDEX and prints both target paths.
    """
    matrix_path = f"{PATH_LEXICAL_INDEX}{file_name}_matrix.npz"
    vectorizer_path = f"{PATH_LEXICAL_INDEX}{file_name}_vectorizer.joblib"

    save_npz(matrix_path, X)
    joblib.dump(vectorizer, vectorizer_path)

    print("Saved index to:" + matrix_path)
    print("Saved vectorizer to:" + vectorizer_path)

def load_tfidf_index(file_name):
    """
    Load a TF-IDF index (matrix and vectorizer) written by ``save_tfidf_index``.

    Args:
        file_name: Base name that was used when saving, i.e. without the
            ``_matrix.npz`` / ``_vectorizer.joblib`` suffix.

    Returns:
        (vectorizer, X): the stored vectorizer and the stored sparse matrix.
    """
    # PATH_LEXICAL_INDEX already ends with "/", so the paths are built exactly
    # as in save_tfidf_index, without an additional separator.
    X = load_npz(f"{PATH_LEXICAL_INDEX}{file_name}_matrix.npz")
    # NOTE: joblib.load unpickles the file; only load index files from a trusted source.
    vectorizer = joblib.load(f"{PATH_LEXICAL_INDEX}{file_name}_vectorizer.joblib")
    return vectorizer, X

## BM25

def save_bm25_index(file_name, bm25):
    """
    Persist a BM25 index (BM25Okapi object).

    The object is dumped with joblib to ``<file_name>_bm25.joblib`` below
    PATH_LEXICAL_INDEX; the target path is printed afterwards.
    """
    index_path = f"{PATH_LEXICAL_INDEX}{file_name}_bm25.joblib"
    joblib.dump(bm25, index_path)
    print("Saved BM25 index to:" + index_path)


def load_bm25_index(file_name):
    """
    Load a BM25 index (BM25Okapi object).

    Reads ``<file_name>_bm25.joblib`` from PATH_LEXICAL_INDEX with joblib and
    returns the stored object.
    """
    index_path = f"{PATH_LEXICAL_INDEX}{file_name}_bm25.joblib"
    return joblib.load(index_path)


## Index documents TF-IDF

### Create Index

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def build_tfidf_index(
    expert_profiles: list[str],
    min_df: int | float = 2,
    max_df: int | float = 0.95,
    ngram_range: tuple[int, int] = (1, 1),
):
    """
    Build a TF-IDF index over expert profile texts.

    Args:
        expert_profiles: One text per expert, e.g. the JSON-serialized profile
            or the generated full text. Every entry must be a ``str`` because
            it is passed through ``normalize``.
        min_df: Forwarded to ``TfidfVectorizer`` (minimum document frequency).
        max_df: Forwarded to ``TfidfVectorizer`` (maximum document frequency).
        ngram_range: Forwarded to ``TfidfVectorizer``.

    Returns:
        (vectorizer, X): the fitted ``TfidfVectorizer`` and the matrix returned
        by ``fit_transform``, with one row per entry of ``expert_profiles`` in
        input order.
    """
    texts = [normalize(profile) for profile in expert_profiles]

    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words=GERMAN_STOPWORDS,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        sublinear_tf=False,
        norm="l2",
        # Same token pattern as bm25_tokenize: runs of at least two word characters.
        token_pattern=r"(?u)\b\w\w+\b",
    )

    X = vectorizer.fit_transform(texts)  # shape: (n_docs, n_terms)
    return vectorizer, X


In [None]:
## TF-IDF variant 1: only the structured original expert data, without the additional full_text

# One "doc_raw" string per record, taken from the copies whose full_text was removed.
profiles_without_fulltext = [profile.get("doc_raw") for profile in pooling_results_without_fulltext]

vectorizer, X = build_tfidf_index(profiles_without_fulltext)

# Print the matrix and the vectorizer for inspection.
print(X)
print(vectorizer)

save_tfidf_index("tfidf_index_without_fulltext", vectorizer, X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9208 stored elements and shape (259, 1283)>
  Coords	Values
  (0, 1259)	0.23467346472386896
  (0, 1071)	0.13761525954315837
  (0, 1093)	0.21628444651141882
  (0, 503)	0.07856790661172788
  (0, 272)	0.15341197796931672
  (0, 304)	0.06203028420545217
  (0, 367)	0.24772068078246992
  (0, 1278)	0.7431620423474097
  (0, 180)	0.24772068078246992
  (0, 721)	0.1515605821631423
  (0, 912)	0.22455325771455667
  (0, 933)	0.1978954282989687
  (0, 727)	0.054823650339749744
  (0, 1089)	0.23467346472386896
  (1, 503)	0.03864495943356079
  (1, 272)	0.07545828724373317
  (1, 721)	0.149095293536743
  (1, 727)	0.02696594366258826
  (1, 1105)	0.04597309474937009
  (1, 1015)	0.0773993017050524
  (1, 1030)	0.12184562466455762
  (1, 26)	0.10294444809482682
  (1, 27)	0.10294444809482682
  (1, 41)	0.11045033410318221
  (1, 608)	0.21276635771523222
  :	:
  (258, 727)	0.0646261647036658
  (258, 317)	0.0702308070082912
  (258, 1260)	0.07524193605388209

In [None]:
# TF-IDF variant 2: only the generated full text of each expert object

# One "doc_raw_fulltext" string per record.
profiles_fulltext_only = [profile.get("doc_raw_fulltext") for profile in pooling_results_fulltext_only]

vectorizer, X = build_tfidf_index(profiles_fulltext_only)

# Print the matrix and the vectorizer for inspection.
print(X)
print(vectorizer)

save_tfidf_index("tfidf_index_fulltext_only", vectorizer, X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9056 stored elements and shape (259, 1122)>
  Coords	Values
  (0, 925)	0.2219400456004056
  (0, 282)	0.25419830263255777
  (0, 1115)	0.7625949078976733
  (0, 211)	0.20307017489797208
  (0, 611)	0.15024953964662754
  (0, 762)	0.25419830263255777
  (0, 610)	0.24080991630283913
  (0, 783)	0.18250779667877967
  (0, 712)	0.1792969493375059
  (0, 774)	0.1762934022385349
  (0, 977)	0.14703869230535377
  (1, 611)	0.17438102041135684
  (1, 609)	0.14751246327993417
  (1, 16)	0.13974312010813897
  (1, 620)	0.10404685783281671
  (1, 516)	0.14751246327993417
  (1, 929)	0.2674334987162057
  (1, 427)	0.06444593381315995
  (1, 947)	0.13974312010813897
  (1, 546)	0.12879284593139348
  (1, 465)	0.12879284593139348
  (1, 554)	0.12879284593139348
  (1, 452)	0.12879284593139348
  (1, 57)	0.06094376385703841
  (1, 351)	0.14751246327993417
  :	:
  (258, 402)	0.10236140610719478
  (258, 763)	0.14302719381067405
  (258, 992)	0.06733398011311333
  (2

In [None]:
# TF-IDF variant 3: the structured expert data together with the full text

# "doc_raw" of the unmodified pooling results, i.e. the string that still includes full_text.
profiles_structured_fulltext = [profile.get("doc_raw") for profile in pooling_results]

vectorizer, X = build_tfidf_index(profiles_structured_fulltext)

# Print the matrix and the vectorizer for inspection.
print(X)
print(vectorizer)

save_tfidf_index("tfidf_index_structured_fulltext", vectorizer, X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12790 stored elements and shape (259, 1646)>
  Coords	Values
  (0, 1608)	0.1251709910228989
  (0, 1356)	0.07340173051588544
  (0, 1383)	0.11536258922383343
  (0, 652)	0.04190683742090879
  (0, 336)	0.0818274419726048
  (0, 375)	0.03308593988913308
  (0, 459)	0.26426032569889557
  (0, 1636)	0.7927809770966866
  (0, 210)	0.13213016284944779
  (0, 924)	0.14540101528177932
  (0, 1161)	0.23954607597944258
  (0, 1188)	0.1832716875433405
  (0, 1093)	0.1084034173972846
  (0, 931)	0.029242039156808796
  (0, 1378)	0.23072517844766685
  (0, 374)	0.13213016284944779
  (0, 366)	0.09859501559821912
  (0, 922)	0.11536258922383343
  (0, 1092)	0.08878661379915366
  (0, 1178)	0.09016930414149978
  (0, 1447)	0.07642948893942718
  (1, 652)	0.046521920158142334
  (1, 336)	0.04541943458963112
  (1, 924)	0.16141362221900352
  (1, 931)	0.016231191550565224
  :	:
  (258, 382)	0.1577136390841902
  (258, 554)	0.0948864369362687
  (258, 1352)	0.0910397

## Index documents BM25

In [None]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
from rank_bm25 import BM25Okapi
from typing import Iterable, Sequence

In [None]:
def build_bm25_index(
    expert_profiles: Sequence[str],
    k1: float = 2,
    b: float = 0.5,
) -> BM25Okapi:
    """
    Build a BM25 index over expert profile texts.

    Args:
        expert_profiles: One text per expert; each is tokenized with
            ``bm25_tokenize``.
        k1: Passed to ``BM25Okapi`` as ``k1``. Defaults to 2, the value that
            was previously hard-coded.
        b: Passed to ``BM25Okapi`` as ``b``. Defaults to 0.5, the value that
            was previously hard-coded.

    Returns:
        The ``BM25Okapi`` index. The tokenized documents are not returned.
    """
    tokenized_docs = [bm25_tokenize(profile) for profile in expert_profiles]
    return BM25Okapi(tokenized_docs, k1=k1, b=b)

In [None]:
# BM25 variant 1: only the structured original expert data, without the additional full_text

# One "doc_raw" string per record, taken from the copies whose full_text was removed.
profiles_without_fulltext = [profile.get("doc_raw") for profile in pooling_results_without_fulltext]

bm25_index = build_bm25_index(profiles_without_fulltext)

print(bm25_index)
# Persist this variant like the other BM25 variants; the call was commented out,
# so a run of the notebook never wrote "bm25_index_without_fulltext".
save_bm25_index("bm25_index_without_fulltext", bm25_index)

<rank_bm25.BM25Okapi object at 0x7e692ced7fb0>


In [None]:
# BM25 variant 2: only the generated full text of each expert object

# One "doc_raw_fulltext" string per record.
profiles_fulltext_only = [profile.get("doc_raw_fulltext") for profile in pooling_results_fulltext_only]

bm25_index = build_bm25_index(profiles_fulltext_only)

save_bm25_index("bm25_index_fulltext_only", bm25_index)

Saved BM25 index to:drive/MyDrive/Uni/Master/Masterthesis/Experiments/Indexing/lexical_index/bm25_index_fulltext_only_bm25.joblib


In [None]:
# BM25 variant 3: the structured expert data together with the full text

# "doc_raw" of the unmodified pooling results, i.e. the string that still includes full_text.
profiles_structured_fulltext = [profile.get("doc_raw") for profile in pooling_results]

bm25_index = build_bm25_index(profiles_structured_fulltext)

save_bm25_index("bm25_index_structured_fulltext", bm25_index)

Saved BM25 index to:drive/MyDrive/Uni/Master/Masterthesis/Experiments/Indexing/lexical_index/bm25_index_structured_fulltext_bm25.joblib
