In [10]:
import spacy
from spacy.training import Example
from data.dsa_train_data_fixed import TRAIN_DATA

# 1Ô∏è‚É£ Deutsches Basismodell laden
nlp = spacy.load("de_core_news_sm")

# 2Ô∏è‚É£ POS-Tagger-Komponente holen
tagger = nlp.get_pipe("tagger")

"""# 3Ô∏è‚É£ Falls du eigene Tags nutzt, hier hinzuf√ºgen (optional)
for sent, ann in TRAIN_DATA:
    for tag in ann["pos"]:
        if tag not in tagger.labels:
            tagger.add_label(tag)"""

# 4Ô∏è‚É£ Fine-Tuning
print("Starte Fine-Tuning auf DSA-Daten ...")
for epoch in range(10):
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses)
    print(f"Epoche {epoch+1} - Loss: {losses}")

# 5Ô∏è‚É£ Modell speichern
output_dir = "de_dsa_tagger"
nlp.to_disk(output_dir)
print(f"Modell gespeichert unter: {output_dir}")


Starte Fine-Tuning auf DSA-Daten ...
Epoche 1 - Loss: {'tok2vec': 161.5016918182373, 'tagger': 0.0, 'morphologizer': 53.240132331848145, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0}
Epoche 2 - Loss: {'tok2vec': 30.115575637901202, 'tagger': 0.0, 'morphologizer': 8.026043093879707, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0}
Epoche 3 - Loss: {'tok2vec': 4.158587197240422, 'tagger': 0.0, 'morphologizer': 0.7085235720361887, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0}
Epoche 4 - Loss: {'tok2vec': 0.0064903936869644685, 'tagger': 0.0, 'morphologizer': 0.0011465696719241691, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0}
Epoche 5 - Loss: {'tok2vec': 0.0003373495041086201, 'tagger': 0.0, 'morphologizer': 5.9781852585920336e-05, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0}
Epoche 6 - Loss: {'tok2vec': 5.274267037289407e-06, 'tagger': 0.0, 'morphologizer': 1.4538145260327776e-06, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0}
Epoche 7 - Loss: {'tok2vec': 1.5635855214832933e-06, 'tagger': 0

In [8]:
import spacy
from data.dsa_train_data import TRAIN_DATA

nlp = spacy.blank("de")

for text, ann in TRAIN_DATA:
    doc = nlp.make_doc(text)
    if len(doc) != len(ann["pos"]):
        print(f"‚ö†Ô∏è  Tokenanzahl passt nicht: '{text}'")
        print(f"  Tokens: {[t.text for t in doc]}")
        print(f"  POS:    {ann['pos']}")


‚ö†Ô∏è  Tokenanzahl passt nicht: 'Der Perainegeweihte heilt die Wunden des Bauern.'
  Tokens: ['Der', 'Perainegeweihte', 'heilt', 'die', 'Wunden', 'des', 'Bauern', '.']
  POS:    ['DET', 'NOUN', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT']


In [9]:
import spacy
from data.dsa_train_data import TRAIN_DATA
from pprint import pprint

# ‚öôÔ∏è Lade ein deutsches Tokenizer-Modell (nur zum Tokenisieren!)
nlp = spacy.blank("de")

print("üîç √úberpr√ºfe DSA-Trainingsdaten auf Token/POS-L√§ngen...\n")

fixed_data = []
errors = 0

for text, ann in TRAIN_DATA:
    doc = nlp.make_doc(text)
    tokens = [t.text for t in doc]
    pos = ann["pos"]

    if len(tokens) != len(pos):
        errors += 1
        print(f"‚ö†Ô∏è  Mismatch in Satz: {text}")
        print(f"   Tokens ({len(tokens)}): {tokens}")
        print(f"   POS    ({len(pos)}): {pos}")

        # Vorschlag: gleiche L√§nge auff√ºllen
        # Wenn mehr Tokens ‚Üí k√ºrze POS
        # Wenn weniger Tokens ‚Üí f√ºge 'X' hinzu (unbekannt)
        if len(tokens) > len(pos):
            corrected_pos = pos + ["X"] * (len(tokens) - len(pos))
        else:
            corrected_pos = pos[:len(tokens)]

        print(f"üëâ Vorschlag korrigiert ({len(corrected_pos)}): {corrected_pos}\n")

        fixed_data.append((text, {"pos": corrected_pos}))
    else:
        fixed_data.append((text, ann))

print(f"\n‚úÖ √úberpr√ºfung abgeschlossen. {errors} fehlerhafte S√§tze gefunden.\n")

# Optional: korrigierte Datei speichern
save_choice = input("üíæ Korrigierte Daten als neue Datei speichern? (j/n): ").strip().lower()
if save_choice == "j":
    with open("data/dsa_train_data_fixed.py", "w", encoding="utf8") as f:
        f.write("TRAIN_DATA = [\n")
        for text, ann in fixed_data:
            f.write(f'    ({text!r}, {ann!r}),\n')
        f.write("]\n")
    print("üìÅ Gespeichert als: data/dsa_train_data_fixed.py")
else:
    print("‚ùå Keine Datei gespeichert. Nur √ºberpr√ºft.")


üîç √úberpr√ºfe DSA-Trainingsdaten auf Token/POS-L√§ngen...

‚ö†Ô∏è  Mismatch in Satz: Der Perainegeweihte heilt die Wunden des Bauern.
   Tokens (8): ['Der', 'Perainegeweihte', 'heilt', 'die', 'Wunden', 'des', 'Bauern', '.']
   POS    (9): ['DET', 'NOUN', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT']
üëâ Vorschlag korrigiert (8): ['DET', 'NOUN', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN']


‚úÖ √úberpr√ºfung abgeschlossen. 1 fehlerhafte S√§tze gefunden.

üìÅ Gespeichert als: data/dsa_train_data_fixed.py


### Wenn du eigene neue Tags brauchst (z. B. ‚ÄûARTEFAKT‚Äú, ‚ÄûGOTT‚Äú usw.)

Dann kannst du nicht das fertige Modell de_core_news_sm nehmen,
sondern musst eine leere Pipeline mit neuem Tagger aufbauen, z. B.:

In [4]:
import spacy
from spacy.pipeline import Tagger
from spacy.training import Example
from data.dsa_train_data import TRAIN_DATA

# 1Ô∏è‚É£ Leere deutsche Pipeline erstellen
nlp = spacy.blank("de")

# 2Ô∏è‚É£ Neuen Tagger hinzuf√ºgen
tagger = nlp.add_pipe("tagger")

# 3Ô∏è‚É£ Alle POS-Tags hinzuf√ºgen
for _, ann in TRAIN_DATA:
    for tag in ann["pos"]:
        tagger.add_label(tag)

# 4Ô∏è‚É£ Training starten
nlp.begin_training()
for epoch in range(20):
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses)
    print(f"Epoche {epoch+1} - Loss: {losses}")

# 5Ô∏è‚É£ Modell speichern
#nlp.to_disk("de_dsa_tagger_custom")
print("‚úÖ Neues DSA-Tagger-Modell gespeichert!")


ValueError: [E971] Found incompatible lengths in `Doc.from_array`: 9 for the array and 8 for the Doc itself.

## Evaluate DSA POS Tagger

In [2]:
import spacy
from data.dsa_train_data import TRAIN_DATA

# Lade beide Modelle
nlp_base = spacy.load("de_core_news_sm")
nlp_dsa  = spacy.load("de_dsa_tagger")

print("üîç Vergleiche Basismodell vs. DSA-Modell\n")

# Hilfsfunktion f√ºr Genauigkeit
def evaluate(model, data):
    correct = 0
    total = 0
    for text, annotations in data:
        doc = model(text)
        predicted_tags = [token.pos_ for token in doc]
        gold_tags = annotations["pos"]
        total += len(gold_tags)
        correct += sum(p == g for p, g in zip(predicted_tags, gold_tags))
    return correct / total if total > 0 else 0

# 1Ô∏è‚É£ Genauigkeit berechnen
acc_base = evaluate(nlp_base, TRAIN_DATA)
acc_dsa  = evaluate(nlp_dsa,  TRAIN_DATA)

print(f"Basismodell: {acc_base*100:.2f}% korrekt")
print(f"DSA-Modell : {acc_dsa*100:.2f}% korrekt\n")

# 2Ô∏è‚É£ Zeige Unterschiede
print("‚öîÔ∏è  Unterschiede zwischen den Modellen:")
for text, annotations in TRAIN_DATA:
    doc_base = nlp_base(text)
    doc_dsa  = nlp_dsa(text)
    gold_tags = annotations["pos"]
    print(f"\nüìú Satz: {text}")
    for token, gold, pred_b, pred_d in zip(doc_dsa, gold_tags,
                                           [t.pos_ for t in doc_base],
                                           [t.pos_ for t in doc_dsa]):
        mark = "‚úÖ" if pred_d == gold else "‚ùå"
        diff = "" if pred_b == pred_d else f"(Base:{pred_b}‚ÜíDSA:{pred_d})"
        print(f"  {token.text:12} Gold:{gold:6} DSA:{pred_d:6} {mark} {diff}")


OSError: [E050] Can't find model 'de_dsa_tagger'. It doesn't seem to be a Python package or a valid path to a data directory.