# Vorgehen
0. Satzbeleg auswählen
1. Worttokenization
2. Modelle
    - Spacy, Morph. Tagging, https://universaldependencies.org/u/feat/index.html
    - SFST Analysis je Wort, https://www.cis.uni-muenchen.de/~schmid/tools/SMOR/dspin/ch01s03.html#Tags
3. Match 2b (SFST analysis) with 2a (spaCy morph. tagging), see conversion table https://universaldependencies.org/tagset-conversion/de-smor-uposf.html (nur das letzte Morphem muss matchen)
4. Replace word and run SFST in generation mode



In [1]:
import sys
sys.path.append('..')

In [2]:
import sfst_transduce
import pathlib
import conllu
import io

## CoNLL-U File

In [3]:
%%time
# Parse the whole treebank eagerly. parse_incr() yields sentences lazily, so
# the list must be built while the file is still open; the context manager
# then closes the handle instead of leaking it for the rest of the session.
with io.open("de_hdt-ud-dev.conllu", "r", encoding="utf-8") as iowrapper:
    dat = list(conllu.parse_incr(iowrapper))
print(len(dat))

18434
CPU times: user 12.7 s, sys: 251 ms, total: 12.9 s
Wall time: 13 s


In [22]:
# Surface text of the example sentence (index 5002), read from the parsed
# sentence's metadata under the 'text' key.
senttext = dat[5002].metadata.get('text')
senttext

'DV-Material vom Computer wandelt die studio-Box in Echtzeit in analoge Signale um , etwa zur Ausgabe auf dem Fernseher oder zur Aufnahme auf einem VHS-Recorder .'

In [23]:
# Materialise the tokens of the example sentence as a plain list so they can
# be indexed and iterated repeatedly.
tokens = list(dat[5002])
tokens[0]

{'id': 1,
 'form': 'DV-Material',
 'lemma': 'Material',
 'upos': 'NOUN',
 'xpos': 'NN',
 'feats': {'Case': 'Acc', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3'},
 'head': 5,
 'deprel': 'obj',
 'deps': None,
 'misc': None}

## FST

In [8]:
# Location of the compiled SMOR transducer, below the current user's home directory
FSTPATH = f"{pathlib.Path.home()}/flexion_data/smor/SMOR/lib/smor.a"

In [9]:
# Create a transducer instance from the compiled file at FSTPATH; the `fst`
# object is what the cells below call analyse() and generate() on.
fst = sfst_transduce.Transducer(FSTPATH)

## Analyse each token

In [46]:
# Pair every token with the list of analyses the transducer returns for its
# surface form: [[token, [analysis, ...]], ...]
morphemes = [
    [token, fst.analyse(token.get("form"))]
    for token in tokens
]

In [97]:
# Inspect one entry: the token paired with the analyses returned for its form.
morphemes[10]

[{'id': 10,
  'form': 'in',
  'lemma': 'in',
  'upos': 'ADP',
  'xpos': 'APPR',
  'feats': {'AdpType': 'Prep', 'Case': 'Acc'},
  'head': 12,
  'deprel': 'case',
  'deps': None,
  'misc': None},
 ['in<+PREP><Acc>', 'in<+PREP><Dat>']]

see table 
https://universaldependencies.org/tagset-conversion/de-smor-uposf.html

In [132]:
# UD feature values whose SMOR tag is spelled differently. Values that are not
# listed are passed through unchanged because both tagsets share the label
# (e.g. Masc/Fem/Neut, Nom/Gen/Dat/Acc, Pres/Past, Ind).
_UD2SMOR_NUMBER = {'Sing': 'Sg', 'Plur': 'Pl'}
_UD2SMOR_MOOD = {'Sub': 'Subj'}
_UD2SMOR_DEGREE = {'Cmp': 'Comp'}


def token2sfst(t, last=True):
    """Translate a token's UD annotation into a list of SMOR/SFST tags.

    Only NOUN, VERB and ADJ tokens are handled; every other part of speech
    yields an empty list.

    Args:
        t: Mapping with a ``'upos'`` entry and a ``'feats'`` entry (a dict of
            UD features, or ``None`` when the token has no features).
        last: If true, the part-of-speech tag is written with a leading
            ``+`` (``<+NN>``); otherwise without it (``<NN>``).

    Returns:
        The tags in the order they are concatenated for generation, e.g.
        ``['<+NN>', '<Neut>', '<Acc>', '<Pl>']``.
    """
    upos = t.get('upos')
    # 'feats' is None for tokens without morphological features.
    feats = t.get('feats') or {}
    prefix = '+' if last else ''

    def feature_tag(name, mapping=None):
        """Return ``['<value>']`` for a present feature, else ``[]``."""
        value = feats.get(name)
        if not value:
            return []
        if mapping:
            # Unknown values fall through unchanged instead of raising.
            value = mapping.get(value, value)
        return [f"<{value}>"]

    tags = []
    if upos == "NOUN":
        tags.append(f"<{prefix}NN>")
        tags += feature_tag('Gender')
        tags += feature_tag('Case')
        tags += feature_tag('Number', _UD2SMOR_NUMBER)

    elif upos == "VERB":
        tags.append(f"<{prefix}V>")
        tags += feature_tag('Person')
        tags += feature_tag('Number', _UD2SMOR_NUMBER)
        tags += feature_tag('Tense')
        tags += feature_tag('Mood', _UD2SMOR_MOOD)

    elif upos == "ADJ":
        tags.append(f"<{prefix}ADJ>")
        tags += feature_tag('Degree', _UD2SMOR_DEGREE)
        if feats.get('Gender'):
            # NOTE(review): only neuter plurals are mapped to <NoGend> here;
            # confirm whether other plural genders need the same treatment.
            if feats.get('Number') == "Plur" and feats.get('Gender') == "Neut":
                tags.append("<NoGend>")
            else:
                tags += feature_tag('Gender')
        # Case is hard-coded for now; it should come from the following word.
        tags.append("<Nom>")
        tags += feature_tag('Number', _UD2SMOR_NUMBER)
        # NOTE(review): inflection class is always emitted as <St>.
        tags.append("<St>")
    return tags

In [133]:
# morphemes[11]

## Adjektive

In [135]:
# Take the tags of the corpus adjective (token 11) and use them to inflect a
# different lemma, "digital", through the transducer's generation mode.
print(morphemes[11][0].get("form"))
tags = token2sfst(morphemes[11][0])
print(tags)
fst.generate("digital" + "".join(tags))

analoge
['<+ADJ>', '<Pos>', '<NoGend>', '<Nom>', '<Pl>', '<St>']


['digitale']

In [136]:
fst.analyse("digital")  # the first 2 tags always match

['digital<+ADJ><Pos><Adv>', 'digital<+ADJ><Pos><Pred>']

## Verben

In [127]:
# Take the tags of the corpus verb (token 5) and use them to inflect a
# different lemma, "schreiben", through the transducer's generation mode.
print(morphemes[5][0].get("form"))
tags = token2sfst(morphemes[5][0])
print(tags)
fst.generate("schreiben" + "".join(tags))

wandelt
['<+V>', '<3>', '<Sg>', '<Pres>', '<Ind>']


['schreibt']

In [119]:
fst.analyse("schreiben")  # only <V> matches

['schreiben<+V><3><Pl><Pres><Subj>',
 'schreiben<+V><3><Pl><Pres><Ind>',
 'schreiben<+V><1><Pl><Pres><Subj>',
 'schreiben<+V><1><Pl><Pres><Ind>',
 'schreiben<+V><Inf>']

## Nomen

In [128]:
# Take the tags of the corpus noun (token 12) and use them to inflect a
# different lemma, "Krankenhaus", through the transducer's generation mode.
print(morphemes[12][0].get("form"))
tags = token2sfst(morphemes[12][0])
print(tags)
fst.generate("Krankenhaus" + "".join(tags))

Signale
['<+NN>', '<Neut>', '<Acc>', '<Pl>']


['Krankenhäuser']

In [121]:
fst.analyse("Schüssel")  # take <+NN><{Gender}> from the new word, and <{Case}><{Number}> from the old word

['Schüssel<+NN><Fem><Acc><Sg>',
 'Schüssel<+NN><Fem><Dat><Sg>',
 'Schüssel<+NN><Fem><Gen><Sg>',
 'Schüssel<+NN><Fem><Nom><Sg>']

In [129]:
# Keep the replacement noun's own POS and gender tags and append only the
# remaining tags (from index 2 on) computed for the original corpus noun.
fst.generate("Schüssel<+NN><Fem>" + "".join(tags[2:]))

['Schüsseln']