Dieses Skript erzeugt dsa_ner_train.spacy und dsa_ner_dev.spacy.

In [15]:
import random
import spacy
from spacy.tokens import DocBin, Span
import os

# -----------------------------
# 1️⃣ Paths to the name lists
# -----------------------------
# Each file is read by load_list(), which keeps one entry per non-blank line.
# Names that are labelled HUMAN in the generated sentences.
HUMAN_FILE = "dsa_names.txt"
# Names that are labelled DEMON.
DEMON_FILE = "dsa_dnames.txt"
# Names that are labelled GOD.
GOD_FILE   = "dsa_gnames.txt"
# Names that are labelled CITY.
CITY_FILE  = "dsa_stadtnames.txt"

def load_list(path):
    """Read a name list: one entry per line, blank lines ignored.

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        The non-empty lines in file order, each with surrounding
        whitespace removed.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise stay glued to the first name in the list.
    with open(path, encoding="utf-8-sig") as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]

# Read all four name lists once, in the same order as the path constants.
HUMANS, DEMONS, GODS, CITIES = (
    load_list(list_path)
    for list_path in (HUMAN_FILE, DEMON_FILE, GOD_FILE, CITY_FILE)
)

# -----------------------------
# 2️⃣ Sentence templates
# -----------------------------
# Placeholders are filled with str.format() in generate_spacy_data():
# {HUMAN}/{HUMAN2}, {DEMON}, {GOD}, {CITY}/{CITY2}.
# The German umlauts must be real characters ("Dämon", "für"); mis-decoded
# text here would be copied into every generated training sentence.
TEMPLATES = [
    "{HUMAN} reiste nach {CITY}.",
    "In {CITY} sprach {HUMAN} ein Gebet zu {GOD}.",
    "{HUMAN} warnte vor dem Dämon {DEMON}.",
    "Zwischen {CITY} und {CITY2} sah {HUMAN} ein seltsames Licht.",
    "{DEMON} erschien vor {HUMAN} in einer Rauchwolke.",
    "Im Tempel des {GOD} bat {HUMAN2} um Rat.",
    "{HUMAN} wurde in {CITY} Zeuge eines Rituals für {GOD}.",
]

# -----------------------------
# 3️⃣ Build an annotated NER document
# -----------------------------
def generate_ner_doc(nlp, text, entities_dict):
    """Run ``nlp`` on ``text`` and set its entities from ``entities_dict``.

    Every occurrence of each key of ``entities_dict`` is located by
    character offset and turned into a span carrying the mapped label.
    Matching on offsets instead of comparing single tokens also labels
    names that the tokenizer splits into several tokens (for example names
    containing a space), which were previously left unlabelled.

    Args:
        nlp: spaCy pipeline used to create the ``Doc``.
        text: The sentence to annotate.
        entities_dict: Mapping of entity surface text to entity label.

    Returns:
        The ``Doc`` for ``text`` with ``doc.ents`` replaced by the matched,
        non-overlapping spans.
    """
    doc = nlp(text)
    spans = []

    for ent_text, label in entities_dict.items():
        if not ent_text:
            # str.find("") matches at every offset; there is nothing to label.
            continue
        start = doc.text.find(ent_text)
        while start != -1:
            # char_span() returns None unless both offsets fall on token
            # boundaries, so a name embedded in a longer word is skipped.
            span = doc.char_span(start, start + len(ent_text), label=label)
            if span is not None:
                spans.append(span)
            # Advance by one character so no aligned occurrence is missed;
            # any resulting overlaps are resolved by filter_spans() below.
            start = doc.text.find(ent_text, start + 1)

    # Remove overlaps (the longest span wins) before assigning doc.ents.
    spans = spacy.util.filter_spans(spans)
    doc.ents = spans
    return doc

# -----------------------------
# 4️⃣ Generate training data
# -----------------------------
def generate_spacy_data(n_samples, out_file):
    """Generate ``n_samples`` annotated sentences and save them as a DocBin.

    For every sample a template and one name per placeholder are drawn at
    random, the template is filled in, and the inserted names are labelled
    through ``generate_ner_doc``.

    Args:
        n_samples: Number of sentences to generate.
        out_file: Path of the ``.spacy`` file to write.
    """
    nlp = spacy.load("de_core_news_lg")
    db = DocBin()

    for _ in range(n_samples):
        template = random.choice(TEMPLATES)
        human1 = random.choice(HUMANS)
        human2 = random.choice(HUMANS)
        demon  = random.choice(DEMONS)
        god    = random.choice(GODS)
        city1  = random.choice(CITIES)
        city2  = random.choice(CITIES)

        if city1 == city2:
            # Re-draw only if a different city exists; a list that merely
            # repeats one name would otherwise leave nothing to choose from.
            other_cities = [c for c in CITIES if c != city1]
            if other_cities:
                city2 = random.choice(other_cities)

        # (placeholder, inserted name, entity label)
        slots = [
            ("HUMAN", human1, "HUMAN"),
            ("HUMAN2", human2, "HUMAN"),
            ("DEMON", demon, "DEMON"),
            ("GOD", god, "GOD"),
            ("CITY", city1, "CITY"),
            ("CITY2", city2, "CITY"),
        ]
        text = template.format(**{field: name for field, name, _ in slots})

        # Only label names whose placeholder occurs in this template. A name
        # drawn for an unused placeholder must not override the label of an
        # inserted name that happens to be spelled the same.
        ents = {
            name: label
            for field, name, label in slots
            if "{" + field + "}" in template
        }

        doc = generate_ner_doc(nlp, text, ents)
        db.add(doc)

    db.to_disk(out_file)
    print(f"✅ Gespeichert: {out_file}")

# -----------------------------
# 5️⃣ Build the training config
# -----------------------------
# spaCy training config (tok2vec + ner), written verbatim to config.cfg by
# the __main__ block. Settings that are not listed are presumably filled in
# from spaCy's defaults when training starts -- TODO confirm for the
# installed spaCy version.
#
# - paths.vectors is null: no pretrained vectors are used, and a registry
#   reference does not belong in [paths].
# - [components.ner] has no "labels" entry: the ner factory does not accept
#   one; the labels are taken from the training data at initialization.
# - patience is counted in training steps, not epochs. It has to be larger
#   than eval_frequency, otherwise training stops before the second
#   evaluation can show an improvement.
# - The optimizer block names its registered function explicitly.
CONFIG_TEXT = """
[paths]
train = "dsa_ner_train.spacy"
dev = "dsa_ner_dev.spacy"
vectors = null

[nlp]
lang = "de"
pipeline = ["tok2vec","ner"]
batch_size = 128
disabled = []

[components]

[components.tok2vec]
factory = "tok2vec"

[components.ner]
factory = "ner"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
max_epochs = 20
patience = 1600
seed = 42
accumulate_gradient = 1
eval_frequency = 100
dropout = 0.2
frozen_components = []
gpu_allocator = null

[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
L2 = 0.0
beta1 = 0.9
beta2 = 0.999
eps = 1e-08
grad_clip = 1.0
L2_is_weight_decay = true
use_averages = false

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = true
tolerance = 0.2
size = 2000
"""

# -----------------------------
# 6️⃣ Run everything
# -----------------------------
if __name__ == "__main__":
    # Build the training and development corpora.
    generate_spacy_data(1800, "dsa_ner_train.spacy")
    generate_spacy_data(200, "dsa_ner_dev.spacy")

    # Write the training config next to the corpora.
    with open("config.cfg", "w", encoding="utf8") as f:
        f.write(CONFIG_TEXT)
    print("✅ config.cfg erstellt")

    # Tell the user how to start the training run.
    print("\n🎯 Jetzt kann das Training gestartet werden mit:")
    print("python -m spacy train config.cfg --output ./dsa_ner_model")


✅ Gespeichert: dsa_ner_train.spacy
✅ Gespeichert: dsa_ner_dev.spacy
✅ config.cfg erstellt

🎯 Jetzt kann das Training gestartet werden mit:
python -m spacy train config.cfg --output ./dsa_ner_model
