# Erweitern der Suchanfragen

Originale Suchanfragen werden zu Freitext formuliert. Für beide Varianten (Original & Freitext) wird zusätzlich eine Synonym-Version erzeugt.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
RAW_DATA_PATH = "drive/MyDrive/Uni/Master/Masterthesis/Data/raw_data/"
PROD_DATA_PATH = "drive/MyDrive/Uni/Master/Masterthesis/Data/topics/"

In [3]:
import pandas as pd

In [4]:
import json

# Build file path for test collection.
file_path = PROD_DATA_PATH + "suchanfragen_testkollektion.json"

# Load raw query test collection from JSON.
with open(file_path, "r", encoding="utf-8") as f:
    raw_queries_testcollection = json.load(f)

# Basic structural validation.
print(type(raw_queries_testcollection))  # Expected: list or dict
print(len(raw_queries_testcollection))   # Number of entries
print(raw_queries_testcollection)        # Full content (use cautiously for large files)

<class 'list'>
11
[{'id': 'c0', 'cluster': 'cluster_0', 'topic': 'IT-Management und Wissensmanagement', 'single': [{'id': 'c0_1_single', 'keyword': 'IT-Systemadministration', 'variations': [{'id': 'c0_1_single_keyword', 'query': 'IT-Systemadministration', 'type': 'single_keyword'}]}, {'id': 'c0_2_single', 'keyword': 'Information Technology', 'variations': [{'id': 'c0_2_single_keyword', 'query': 'Information Technology', 'type': 'single_keyword'}]}, {'id': 'c0_3_single', 'keyword': 'Wissensmanagement', 'variations': [{'id': 'c0_3_single_keyword', 'query': 'Wissensmanagement', 'type': 'single_keyword'}]}], 'combination': []}, {'id': 'c1', 'cluster': 'cluster_1', 'topic': 'Mechanik-, Elektro- und Automatisierungstechnik', 'single': [{'id': 'c1_1_single', 'keyword': 'Konstruktion', 'variations': [{'id': 'c1_1_single_keyword', 'query': 'Konstruktion', 'type': 'single_keyword'}]}, {'id': 'c1_2_single', 'keyword': 'Metallhandwerk', 'variations': [{'id': 'c1_2_single_keyword', 'query': 'Metall

In [None]:
# Initialize global counters.
total_singles = 0
total_combinations = 0

# Iterate over clusters in test collection.
for cluster in raw_queries_testcollection:

    # Count variations for single-term queries.
    single_count = sum(
        len(s.get("variations", []))
        for s in cluster["single"]
    )

    # Count variations for combination queries.
    comb_count = sum(
        len(c.get("variations", []))
        for c in cluster["combination"]
    )

    # Aggregate totals.
    total_singles += single_count
    total_combinations += comb_count

    # Print per-cluster summary.
    print(
        cluster["id"],
        "| topic:", cluster["topic"],
        "| single_queries:", single_count,
        "| combination_queries:", comb_count,
        "| total:", single_count + comb_count,
    )

# Print overall totals across all clusters.
print("\n--- Gesamtzahlen über alle Cluster hinweg ---")
print("Total Single Variations:", total_singles)
print("Total Combination Variations:", total_combinations)
print("Total Queries:", total_singles + total_combinations)

c0 | topic: IT-Management und Wissensmanagement | single_queries: 3 | combination_queries: 0 | total: 3
c1 | topic: Mechanik-, Elektro- und Automatisierungstechnik | single_queries: 5 | combination_queries: 6 | total: 11
c3 | topic: Künstliche Intelligenz | single_queries: 1 | combination_queries: 0 | total: 1
c5 | topic: Verkauf, Kommunikation und Marketing | single_queries: 3 | combination_queries: 1 | total: 4
c6 | topic: Digitales Marketing und Markenkommunikation | single_queries: 2 | combination_queries: 3 | total: 5
c7 | topic: Personal- und Organisationsentwicklung | single_queries: 4 | combination_queries: 10 | total: 14
c8 | topic: Bau- und Handwerkstechnik | single_queries: 3 | combination_queries: 1 | total: 4
c10 | topic: Finanzmanagement und Unternehmensplanung | single_queries: 3 | combination_queries: 10 | total: 13
c11 | topic: Strategische Unternehmensführung und Veränderungsmanagement | single_queries: 3 | combination_queries: 5 | total: 8
c12 | topic: Digitales Mana

## Hinzufügen von Freitext-Anfragen

In [None]:
ENHANCED_DATA = PROD_DATA_PATH + "suchanfragen_testkollektion_freetext.json"

In [None]:
def make_freetext_variation(base_variation, freetext: str):
    # Extract original variation ID and normalize suffix.
    id = base_variation["id"]
    base_id = id.replace("_keyword", "")

    # Extract original type and normalize suffix.
    typ = base_variation["type"]
    base_type = typ.replace("_keyword", "")

    # Return freetext-based variation with adjusted ID and type.
    return {
        "id": base_id + "_freetext",
        "query": freetext,
        "type": base_type + "_free",
    }

In [None]:
from copy import deepcopy
from tqdm import tqdm

# Create deep copy to avoid mutating original test collection.
queries_with_freetext = deepcopy(raw_queries_testcollection)

print(queries_with_freetext)

# Iterate over clusters and extend with freetext variations.
for cluster in tqdm(queries_with_freetext, desc="Add Freetext"):

    # ---------- SINGLE ----------
    for single in cluster["single"]:
        keyword = single["keyword"]

        # Build simple freetext formulation for single keyword.
        freetext = f"Gesucht wird ein Experte mit Fähigkeiten im Bereich {keyword}"

        # Add exactly one freetext variation per keyword (based on keyword baseline).
        for var in single["variations"]:
            if var["type"] == "single_keyword":
                new_var = make_freetext_variation(var, freetext)
                single["variations"].append(new_var)
                break  # ensure only one freetext variation per keyword

    # ---------- COMBINATION ----------
    for comb in cluster["combination"]:
        keywords = comb["keywords"]  # e.g. ["Public Relations", "Social Media Marketing"]

        # Build freetext query for two-keyword combination.
        freetext = (
            f"Gesucht wird ein Experte mit Fähigkeiten in den Bereichen "
            f"{keywords[0]} und {keywords[1]}"
        )

        # Add exactly one freetext variation per combination (based on keyword baseline).
        for var in comb["variations"]:
            if var["type"] == "combination_keyword":
                new_var = make_freetext_variation(var, freetext)
                comb["variations"].append(new_var)
                break  # ensure only one freetext variation per combination

[{'id': 'c0', 'cluster': 'cluster_0', 'topic': 'IT-Management und Wissensmanagement', 'single': [{'id': 'c0_1_single', 'keyword': 'IT-Systemadministration', 'variations': [{'id': 'c0_1_single_keyword', 'query': 'IT-Systemadministration', 'type': 'single_keyword'}]}, {'id': 'c0_2_single', 'keyword': 'Information Technology', 'variations': [{'id': 'c0_2_single_keyword', 'query': 'Information Technology', 'type': 'single_keyword'}]}, {'id': 'c0_3_single', 'keyword': 'Wissensmanagement', 'variations': [{'id': 'c0_3_single_keyword', 'query': 'Wissensmanagement', 'type': 'single_keyword'}]}], 'combination': []}, {'id': 'c1', 'cluster': 'cluster_1', 'topic': 'Mechanik-, Elektro- und Automatisierungstechnik', 'single': [{'id': 'c1_1_single', 'keyword': 'Konstruktion', 'variations': [{'id': 'c1_1_single_keyword', 'query': 'Konstruktion', 'type': 'single_keyword'}]}, {'id': 'c1_2_single', 'keyword': 'Metallhandwerk', 'variations': [{'id': 'c1_2_single_keyword', 'query': 'Metallhandwerk', 'type':

Add Freetext: 100%|██████████| 11/11 [00:00<00:00, 24308.40it/s]


In [None]:
queries_with_freetext

[{'id': 'c0',
  'cluster': 'cluster_0',
  'topic': 'IT-Management und Wissensmanagement',
  'single': [{'id': 'c0_1_single',
    'keyword': 'IT-Systemadministration',
    'variations': [{'id': 'c0_1_single_keyword',
      'query': 'IT-Systemadministration',
      'type': 'single_keyword'},
     {'id': 'c0_1_single_freetext',
      'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich IT-Systemadministration',
      'type': 'single_free'}]},
   {'id': 'c0_2_single',
    'keyword': 'Information Technology',
    'variations': [{'id': 'c0_2_single_keyword',
      'query': 'Information Technology',
      'type': 'single_keyword'},
     {'id': 'c0_2_single_freetext',
      'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Information Technology',
      'type': 'single_free'}]},
   {'id': 'c0_3_single',
    'keyword': 'Wissensmanagement',
    'variations': [{'id': 'c0_3_single_keyword',
      'query': 'Wissensmanagement',
      'type': 'single_keyword'},
     {'id': 'c0_3_

## Hinzufügen von Synonymen Versionen der Anfragen


In [None]:
# install langchain
!pip install -U langchain
!pip install -U "langchain[openai]"

Collecting langchain-openai (from langchain[openai])
  Downloading langchain_openai-1.1.0-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_openai-1.1.0-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.3/84.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-1.1.0


In [None]:
# imports
import os
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# add api key for OPENAI
os.environ["OPENAI_API_KEY"] = input("Bitte gib deinen OpenAI API Key ein: ")
print("API Key erfolgreich gesetzt!")

In [None]:
model = ChatOpenAI(
    model="gpt-5-nano-2025-08-07",
    temperature=0.1,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
SYSTEM_PROMPT_SYNONYM_SINGLE = """
Du bist Experte für die Optimierung von Suchanfragen.
Deine Aufgabe ist es, aus einem einzelnen Stichwort eine alternative Suchanfrage zu erzeugen, indem du ein bedeutungsgleiches Synonym verwendest.

Regeln:
1. Das ursprüngliche Stichwort darf nicht in der Ausgabe vorkommen.
2. Verwende ein echtes Synonym mit gleicher oder sehr ähnlicher Bedeutung.
3. Nutze keine umfangreichen Umschreibungen oder Satzkonstruktionen – bleibe beim Charakter einer Stichwort-Suchanfrage.
4. Begriffe aus der Blacklist dürfen nicht enthalten sein.
5. Falls kein sinnvolles Synonym existiert, gib "NO_VALID_SYN" zurück.

"""


In [None]:
HUMAN_PROMPT_SYNONYM_SINGLE = """
Ersetze das folgende Stichwort durch ein bedeutungsgleiches Synonym, um daraus eine neue Suchanfrage zu erzeugen.
Verwende keines der Wörter aus der bereitgestellten Blacklist.
Die Ausgabe darf nur aus einem einzigen Wort bestehen – dem gewählten Synonym.

Stichwort:
{query}

Blacklist:
{blacklist}
"""

In [None]:
SYSTEM_PROMPT_SYNONYM_COMBINATION = """
Du bist Experte für die Optimierung von Suchanfragen.
Dir wird eine Kombination aus zwei Stichwörtern übergeben.
Erstelle darauf basierend eine alternative Suchanfrage, indem du beide Stichwörter durch bedeutungsgleiche Synonyme ersetzt.

Regeln:
1. Keines der ursprünglichen Stichwörter darf in der Ausgabe vorkommen.
2. Verwende echte Synonyme mit gleicher oder sehr ähnlicher Bedeutung.
3. Nutze keine umfangreichen Umschreibungen oder Satzkonstruktionen – bleibe beim Charakter einer Stichwort-Suchanfrage.
4. Begriffe aus der Blacklist dürfen nicht enthalten sein.
5. Falls kein sinnvolles Synonym existiert, gib "NO_VALID_SYN" zurück.

"""

In [None]:
HUMAN_PROMPT_SYNONYM_COMBINATION = """
Ersetze die folgende Stichwörter durch bedeutungsgleiche Synonyme, um daraus eine neue Suchanfrage zu erzeugen.
Verwende keines der Wörter aus der bereitgestellten Blacklist.
Die Ausgabe darf nur aus zwei Stichwörtern bestehen – den gewählten Synonymen.

Stichwörter:
{query}

Blacklist:
{blacklist}
"""

In [None]:
from typing import List, Literal

# Structured output model for single-keyword synonym generation.
class Synonym_Keyword_Query(BaseModel):
    """Ausgabe der Erstellung von Synonymen-Versionen der Anfragen."""

    # Single synonym for a given keyword.
    synonym_keyword: str = Field(
        description="Eine synonyme Version des relevanten Stichwortes"
    )


# Structured output model for multi-keyword (combination) synonym generation.
class Synonym_Keywords_Combination_Query(BaseModel):
    """Ausgabe der Erstellung von Synonymen-Versionen der Anfragen."""

    # Synonym list corresponding to the original keyword combination.
    synonym_keywords: List[str] = Field(
        description="Eine synonyme Version der beiden relevanten Stichwörter"
    )

In [None]:
chat_prompt_single = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT_SYNONYM_SINGLE),
    ("human", HUMAN_PROMPT_SYNONYM_SINGLE)
  ])

chat_prompt_combi = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT_SYNONYM_COMBINATION),
    ("human", HUMAN_PROMPT_SYNONYM_COMBINATION)
  ])

In [None]:
model_with_structure_single_keyword = model.with_structured_output(Synonym_Keyword_Query)

model_with_structure_combinations = model.with_structured_output(Synonym_Keywords_Combination_Query)

In [None]:
def make_synonym_variation(base_variation, synonym_query: str):
    # Create synonym-based variation derived from base variation.
    # ID and type are extended with "_synonym" suffix.
    return {
        "id": base_variation["id"] + "_synonym",
        "query": synonym_query,
        "type": base_variation["type"] + "_synonym",
    }

In [None]:
PROFILES_DATA = "drive/MyDrive/Uni/Master/Masterthesis/Data/enhanced_expert_profiles.json"

In [None]:
## load enhanced profile data
df_profiles = pd.read_json(PROFILES_DATA)
df_profiles.head()

Unnamed: 0,branches,companyLocationCity,companyLocationStreet,companyLocationZip,companyName,companyTypes,companyWebsite,description,employeeOfInstitutionNames,firstName,...,lastName,projectsDescription,skills,title,full_text,full_text_fields_used,validation_issues,full_text_fields_used_fixed,full_text_word_count,full_text_fields_used_sorted
0,"[Technologie, Medien & Werbung, Unternehmensdi...",Lennestadt,Hagener Straße 64,57368.0,lenne.Tech GmbH,[Dienstleistung],https://lenne.tech,Seit über 15 Jahren begleite ich digitale Proj...,[],Ege,...,Siebert,"lenne.Tech unterstützt Unternehmen, Selbststän...","[Softwareentwicklung, Web-Entwicklung, IT-Syst...",,Ege Siebert ist Softwareentwickler mit umfangr...,"[branches, description, jobTitle, projectsDesc...",OK,"[skills, branches, description, projectsDescri...",116.0,"[branches, description, jobTitle, projectsDesc..."
1,[Unternehmensdienstleistungen],Lüdenscheid,Sauerfelder Straße 5,58511.0,RegioKonneX,[Dienstleistung],www.regiokonnex.de,Ich bin Mitglied von Team RegioKonneX und steh...,[RegioKonneX],Hendrik,...,Nöh,Als Angebot der SWF Beratung GmbH unterstützen...,"[Financial Management, Cybersecurity, Finance,...",,Hendrik Nöh ist Administrator und Projektmanag...,"[branches, description, jobTitle, projectsDesc...",OK,"[skills, branches, description, projectsDescri...",77.0,"[branches, description, jobTitle, projectsDesc..."
2,"[Automobilindustrie, Maschinenbau und Industri...",Lüdenscheid,Sauerfelder Straße 5,58511.0,RegioKonneX,[Dienstleistung],,Als Relationship-Manager unterstütze ich Gründ...,[RegioKonneX],Deniz,...,Buchholz,,"[Benchmarking, Budget Planning, Budgeting, Bus...",,Deniz Buchholz ist Relationship-Manager (Gründ...,"[branches, description, jobTitle, skills]",OK,"[branches, jobTitle, skills, description]",61.0,"[branches, description, jobTitle, skills]"
3,"[Unternehmensdienstleistungen, Investor (Eigen...",Lüdenscheid,Sauerfelder Straße 4,58511.0,RegioKonneX,[Dienstleistung],www.regiokonnex.de,Der Themenwelt rund um Start-Ups bin ich schon...,[RegioKonneX],Klara,...,Bozsik,RegioKonneX möchte das die unternehmerische La...,"[Business Planning, Business Development, Coac...",,Klara Bozsik arbeitet als Firmenkundenbetreuun...,"[branches, description, jobTitle, projectsDesc...",OK,"[skills, branches, description, projectsDescri...",84.0,"[branches, description, jobTitle, projectsDesc..."
4,"[Logistik, Konsum und Handel]",Ludenscheid,Wefelshohler Str 48a,58511.0,Karl Koerschulte GmbH,[Dienstleistung],www.koerschulte.de,,[],Melvin,...,Molitor,Die Molitor-GROUP ist ein Produktionsverbindun...,"[Logistik, Supply Chain Management, Controllin...",,Melvin Molitor ist Geschäftsführer der Karl Ko...,"[branches, jobTitle, projectsDescription, skills]",OK,"[projectsDescription, branches, jobTitle, skills]",78.0,"[branches, jobTitle, projectsDescription, skills]"


In [None]:
def get_synonym_blacklist(keywords, df):
    """
    keywords: str oder List[str]
    df: DataFrame mit Spalten 'skills', 'description', 'projectsDescription'
    return: Set mit geblacklisteten Skills
    """

    # Normalize input to list.
    if isinstance(keywords, str):
        keywords = [keywords]

    # Lowercase keyword list for case-insensitive matching.
    keywords = [k.lower() for k in keywords if isinstance(k, str)]

    def cell_contains_keyword(value, kws):
        """Prüft, ob eines der Keywords in value vorkommt (case-insensitive)."""

        # Skip None/NaN values.
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return False

        # Handle list-type fields (e.g., skills).
        if isinstance(value, list):
            for item in value:
                if item is None:
                    continue
                text = str(item).lower()
                if any(k in text for k in kws):
                    return True
            return False

        # Fallback: treat value as string.
        text = str(value).lower()
        return any(k in text for k in kws)

    # Build mask: keyword appears in at least one relevant column.
    mask = df.apply(
        lambda row: (
            cell_contains_keyword(row.get("skills"), keywords) or
            cell_contains_keyword(row.get("description"), keywords) or
            cell_contains_keyword(row.get("projectsDescription"), keywords)
        ),
        axis=1
    )

    # Collect all skills from matched profiles into blacklist set.
    blacklist = set()
    for skills_list in df.loc[mask, "skills"]:
        if isinstance(skills_list, list):
            for skill in skills_list:
                if isinstance(skill, str):
                    blacklist.add(skill.strip())

    return list(blacklist)

In [None]:
unique_skills = sorted({skill for skills_list in df_profiles["skills"] for skill in skills_list})
unique_skills

['AI-gestützte Kundenbetreuung (Chatbots)',
 'Accounting',
 'Adobe Creative Cloud',
 'Ambulanzmanagement (LabCentre)',
 'Anlagenbau',
 'Antriebstechnik',
 'Asset Management',
 'Autodesk Inventor',
 'Automatisierte Produktions- & Montagelinien',
 'Automatisierungstechnik',
 'Bau- und Ausbauhandwerk',
 'Bauinformatik',
 'Bedrucken & Lackieren von Metalloberflächen',
 'Behavioral Finance',
 'Bekleidungs-, Textil- und Lederhandwerk',
 'Benchmarking',
 'Benutzeroberflächen-Design',
 'Beratung & Bildungscoaching',
 'Blockchain',
 'Brand Management',
 'Buchhaltung',
 'Budget Control',
 'Budget Planning',
 'Budgeting',
 'Business Analysis',
 'Business Continuity',
 'Business Development',
 'Business English',
 'Business Planning',
 'Business Process Analysis',
 'Business Process Management (BPM) & Automatisierung',
 'CAD-Systeme',
 'CRM-Plattformen',
 'Capacity Planning',
 'Change Management',
 'Change Management im digitalen Wandel',
 'Cloud-Computing',
 'Coaching',
 'Communication',
 'Compet

In [None]:
from copy import deepcopy
from tqdm import tqdm

# Create deep copy to avoid mutating freetext-enhanced baseline.
queries_with_synonyms = deepcopy(queries_with_freetext)

# Iterate over clusters and enrich with synonym-based variations.
for cluster in tqdm(queries_with_synonyms, desc="Add Synonyms"):

    # ---------- SINGLE ----------
    for single in cluster["single"]:
        keyword = single["keyword"]
        print("Original Keyword: ", keyword)

        # Generate synonym for single keyword via structured LLM call.
        formatted_messages = chat_prompt_single.format_messages(
            query=keyword,
            blacklist=unique_skills
        )
        response = model_with_structure_single_keyword.invoke(formatted_messages)
        print(response)

        synonym_keyword = response.synonym_keyword

        # Build freetext formulation based on synonym.
        freetext = (
            f"Gesucht wird ein Experte mit Fähigkeiten im Bereich "
            f"{synonym_keyword}"
        )

        # Extend baseline variations with synonym-based variants.
        for var in single["variations"]:
            print(var["type"])

            # Keyword-based variant.
            if var["type"] == "single_keyword":
                new_var = make_synonym_variation(var, synonym_keyword)
                single["variations"].append(new_var)

            # Freetext-based variant.
            if var["type"] == "single_free":
                new_var = make_synonym_variation(var, freetext)
                single["variations"].append(new_var)

    # ---------- COMBINATION ----------
    for comb in cluster["combination"]:
        keywords = comb["keywords"]
        print("Original Keywords: ", keywords)

        # Generate synonym pair for keyword combination.
        formatted_messages = chat_prompt_combi.format_messages(
            query=keywords,
            blacklist=unique_skills
        )
        response = model_with_structure_combinations.invoke(formatted_messages)
        print(response)

        synonym_keywords = response.synonym_keywords

        # Build freetext formulation for synonym combination.
        freetext = (
            f"Gesucht wird ein Experte mit Fähigkeiten in den Bereichen "
            f"{synonym_keywords[0]} und {synonym_keywords[1]}"
        )

        # Extend baseline variations with synonym-based variants.
        for var in comb["variations"]:

            if var["type"] == "combination_keyword":
                print("create variation")
                new_var = make_synonym_variation(var, freetext)
                comb["variations"].append(new_var)

            if var["type"] == "combination_free":
                print("create variation")
                new_var = make_synonym_variation(var, freetext)
                comb["variations"].append(new_var)

Add Synonyms:   0%|          | 0/11 [00:00<?, ?it/s]

Original Keyword:  IT-Systemadministration
synonym_keyword='Serveradministration'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Information Technology
synonym_keyword='Computing'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Wissensmanagement


Add Synonyms:   9%|▉         | 1/11 [00:42<07:07, 42.78s/it]

synonym_keyword='Wissensverwaltung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Konstruktion
synonym_keyword='Aufbau'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Metallhandwerk
synonym_keyword='Metallbearbeitung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Mechanik
synonym_keyword='Maschinenbau'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Automatisierungstechnik
synonym_keyword='Automatisierungstechnologie'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Elektrotechnik
synonym_keyword='Elektroingenieurwesen'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Konstruktion', 'Metallhandwerk']
synonym_keywords=['Errichtung', 'Metallverarbeitung']
create variation
create variation
Original Keywords:  ['Mechanik', 'Elektrotechnik']
s

Add Synonyms:  18%|█▊        | 2/11 [04:59<25:19, 168.86s/it]

synonym_keywords=['Automatisierung', 'Elektronik']
create variation
create variation
Original Keyword:  KI


Add Synonyms:  27%|██▋       | 3/11 [05:11<12:56, 97.11s/it] 

synonym_keyword='AI'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Public Relations
synonym_keyword='Öffentlichkeitsarbeit'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Sales
synonym_keyword='Verkauf'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Social Media Marketing
synonym_keyword='SoMeWerbung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Public Relations', 'Social Media Marketing']


Add Synonyms:  36%|███▋      | 4/11 [06:45<11:09, 95.69s/it]

synonym_keywords=['Pressearbeit', 'Social-Media-Werbung']
create variation
create variation
Original Keyword:  Marketing
synonym_keyword='Absatzförderung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Brand Management
synonym_keyword='Markenführung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Marketing', 'Social Media Marketing']
synonym_keywords=['Absatzförderung', 'Werbung in sozialen Netzwerken']
create variation
create variation
Original Keywords:  ['Marketing', 'Brand Management']
synonym_keywords=['Vermarktung', 'Markenführung']
create variation
create variation
Original Keywords:  ['Social Media Marketing', 'Brand Management']


Add Synonyms:  45%|████▌     | 5/11 [08:59<10:57, 109.54s/it]

synonym_keywords=['Soziale Netzwerke-Werbung', 'Markenführung']
create variation
create variation
Original Keyword:  Employee Development
synonym_keyword='Mitarbeiterentwicklung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Personalentwicklung und Mentoring
synonym_keyword='Mitarbeiterentwicklung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Human Resources
synonym_keyword='Personalwesen'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Coaching
synonym_keyword='Mentoring'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Business Development', 'Employee Development']
synonym_keywords=['Geschäftsentwicklung', 'Mitarbeiterentwicklung']
create variation
create variation
Original Keywords:  ['Business Development', 'Personalentwicklung und Mentoring']
synonym_keywords=['Unternehmensentwicklung', 'Mitarbeiterentwicklung un

Add Synonyms:  55%|█████▍    | 6/11 [13:40<14:00, 168.09s/it]

synonym_keywords=['Personalwesen', 'Mentoring']
create variation
create variation
Original Keyword:  Bau- und Ausbauhandwerk
synonym_keyword='Bauberufe'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Holzhandwerk
synonym_keyword='Schreinerei'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Anlagenbau
synonym_keyword='Industrieanlagenbau'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Bau- und Ausbauhandwerk', 'Anlagenbau']


Add Synonyms:  64%|██████▎   | 7/11 [14:45<08:57, 134.39s/it]

synonym_keywords=['Baugewerbe', 'Anlagentechnik']
create variation
create variation
Original Keyword:  Finance
synonym_keyword='Finances'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Financial Management
synonym_keyword='Finanzmanagement'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Budget Planning
synonym_keyword='Budgetierung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Business Planning', 'Finance']
synonym_keywords=['Unternehmensplanung', 'Finanzen']
create variation
create variation
Original Keywords:  ['Business Planning', 'Financial Management']
synonym_keywords=['Unternehmensplanung', 'Finanzsteuerung']
create variation
create variation
Original Keywords:  ['Business Planning', 'Budget Planning']
synonym_keywords=['Corporate Strategy', 'Cost Forecasting']
create variation
create variation
Original Keywords:  ['Business Planning', 'Business Deve

Add Synonyms:  73%|███████▎  | 8/11 [19:32<09:08, 182.77s/it]

synonym_keywords=['Budget forecasting', 'Corporate growth']
create variation
create variation
Original Keyword:  Business Planning
synonym_keyword='Unternehmensplanung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Business Development
synonym_keyword='Geschäftsentwicklung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Controlling
synonym_keyword='Steuerung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Change Management', 'Business Planning']
synonym_keywords=['Veränderungsmanagement', 'Unternehmensplanung']
create variation
create variation
Original Keywords:  ['Change Management', 'Business Development']
synonym_keywords=['Veränderungsmanagement', 'Geschäftsentwicklung']
create variation
create variation
Original Keywords:  ['Change Management', 'Controlling']
synonym_keywords=['Veränderungsmanagement', 'Kostenkontrolle']
create variation
create variati

Add Synonyms:  82%|████████▏ | 9/11 [22:02<05:45, 172.52s/it]

synonym_keywords=['Geschäftsentwicklung', 'Finanzsteuerung']
create variation
create variation
Original Keyword:  Cloud-Computing
synonym_keyword='Wolkenrechenleistung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  IT-Management
synonym_keyword='Informationsmanagement'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Datenbankverwaltung
synonym_keyword='Datenbankadministration'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Softwareentwicklung
synonym_keyword='Anwendungsentwicklung'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Data Analysis
synonym_keyword='Datenanalyse'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Cloud-Computing', 'IT-Management']
synonym_keywords=['Cloud-Dienste', 'IT-Führung']
create variation
create variation
Original Keywords:  ['Cloud-Computing', 'Di

Add Synonyms:  91%|█████████ | 10/11 [25:54<03:10, 190.89s/it]

synonym_keywords=['Softwaretechnik', 'Datenanalyse']
create variation
create variation
Original Keyword:  Change Management
synonym_keyword='Veränderungsmanagement'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Digital Ethics
synonym_keyword='Digitalethik'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keyword:  Business Process Analysis
synonym_keyword='Prozessanalyse'
single_keyword
single_free
single_keyword_synonym
single_free_synonym
Original Keywords:  ['Change Management', 'Digital Ethics']
synonym_keywords=['Veränderungsmanagement', 'Ethik der Digitalisierung']
create variation
create variation
Original Keywords:  ['Change Management', 'Business Process Analysis']
synonym_keywords=['Veränderungsmanagement', 'Geschäftsprozessanalyse']
create variation
create variation
Original Keywords:  ['Digital Ethics', 'Business Development']
synonym_keywords=['Technologieethik', 'Geschäftsentwicklung']
create variati

Add Synonyms: 100%|██████████| 11/11 [28:26<00:00, 155.17s/it]

synonym_keywords=['Geschäftsentwicklung', 'Geschäftsprozessanalyse']
create variation
create variation





In [None]:
queries_with_synonyms

[{'id': 'c0',
  'cluster': 'cluster_0',
  'topic': 'IT-Management und Wissensmanagement',
  'single': [{'id': 'c0_1_single',
    'keyword': 'IT-Systemadministration',
    'variations': [{'id': 'c0_1_single_keyword',
      'query': 'IT-Systemadministration',
      'type': 'single_keyword'},
     {'id': 'c0_1_single_freetext',
      'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich IT-Systemadministration',
      'type': 'single_free'},
     {'id': 'c0_1_single_keyword_synonym',
      'query': 'Netzwerkadministration',
      'type': 'single_keyword_synonym'},
     {'id': 'c0_1_single_freetext_synonym',
      'query': 'Gesucht wird ein Experte mit Fähigkeiten im Bereich Netzwerkadministration',
      'type': 'single_free_synonym'}]},
   {'id': 'c0_2_single',
    'keyword': 'Information Technology',
    'variations': [{'id': 'c0_2_single_keyword',
      'query': 'Information Technology',
      'type': 'single_keyword'},
     {'id': 'c0_2_single_freetext',
      'query': 'Gesuch

In [None]:
import json

# Build output file path for final test collection.
file_path = PROD_DATA_PATH + "suchanfragen_testkollektion_final.json"

# Persist enriched query collection to JSON.
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(queries_with_synonyms, f)

# Confirm successful write operation.
print(f"Saved data to {file_path}")


Saved data to drive/MyDrive/Uni/Master/Masterthesis/Data/topics/suchanfragen_testkollektion_final.json
