In [17]:
from os import path
from typing import *
import itertools
import pandas as pd
import re
import spacy
import subprocess
import sys
sys.path.insert(0,'../..')
from shared import add_to_dict, log

In [3]:
iterations = []
for i in range(0, 7):
    df = pd.read_csv(path.join("query_results", "{}_iterations.csv".format(i)))
    previous = set(itertools.chain(*iterations))
    values = set(df["word"].values.tolist()).difference(previous)
    values = list(filter(lambda x: re.match(r"^[A-ZäÖÜ][a-zäöüß\-]+$", x), values))
    iterations.append(values)

print(list(map(len, iterations)))
words = sorted(list(itertools.chain(*iterations)))


[1776, 1862, 577, 90, 0, 0, 0]


In [4]:
open("openthesaurus_persons.csv", "w").write("\n".join(words))

48635

In [5]:
nlp = spacy.load("de_core_news_sm")

def grammatical_gender(s: str) -> str:
    return nlp(s)[0].morph.get("Gender")

print(*list(map(
    lambda a: (a, grammatical_gender(a)),
    [
        "Baum",
        "Mädchen",
        "Fachkraft",
        "Manager",
        "Managerin",
        "Beamte",
        "Beamtinnen",
        "Leiter", # does not recognize gender of the second meaning
        "Butter", # recognized incorrectly as 'Masc'
        "Teller", # not recognized
        "Kabbulmoffdi" # not a word, but recognized as 'Masc'
    ]
)), sep="\n")

('Baum', ['Masc'])
('Mädchen', ['Neut'])
('Fachkraft', ['Fem'])
('Manager', ['Masc'])
('Managerin', ['Fem'])
('Beamte', ['Masc'])
('Beamtinnen', ['Fem'])
('Leiter', ['Masc'])
('Butter', ['Masc'])
('Teller', [])
('Kabbulmoffdi', ['Masc'])


The grammatical gender detection of the chosen model is not very good in general, but since it is trained on news texts, it is hopefully good enough on person words.

In [6]:
genders = {}
for word in words:
    for gender in grammatical_gender(word):
        add_to_dict(gender, [word], genders)


In [7]:
pd.DataFrame.from_dict(genders, orient="index")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2589,2590,2591,2592,2593,2594,2595,2596,2597,2598
Masc,Aas,Aasgeier,Abdecker,Abenteurer,Abgeordneter,Abgesandter,Abgeschobener,Abkömmling,Abnicker,Abschiebehäftling,...,Zögerer,Zögling,Zöllner,Ökonom,Ösi,Österreicher,Übeltäter,Überbringer,Übermittler,Übungsleiter
Neut,Abkomme,Abstinent,Abzuschiebender,Adelige,Adept,Adonis,Afghane,Allesbesserwisser,Alter,Amtsleiter,...,,,,,,,,,,
Fem,Ablegat,Ahne,Ahnfrau,Ahnherrin,Akrobat,Almerin,Alterchen,Angie,Anlerntätigkeit,Anthropophage,...,,,,,,,,,,


In [34]:
def grammatical_gender_rft_batch(tokens: List[str]) -> Dict[str, str]:
    temp_file = path.join(rftagger_path, "test/temp.txt")
    open(temp_file, "w").write("\n".join(tokens))
    result = subprocess.run(["src/rft-annotate", "lib/german.par", temp_file], cwd=rftagger_path, capture_output=True)
    result = result.stdout.decode("UTF-8")
    dic = {}
    for line in result.split("\n"):
        matches = re.findall(r"^.*\t", line)
        if len(matches) > 0 and len(matches[0]) > 1:
            word = matches[0][:-1]
            if re.match(".*Masc.*", line):
                add_to_dict("Masc", [word], dic)
            if re.match(".*Fem.*", line):
                add_to_dict("Fem", [word], dic)
            if re.match(".*Neut.*", line):
                add_to_dict("Neut", [word], dic)
    return dic

genders = grammatical_gender_rft_batch(words)

In [37]:
print(*list(map(
    lambda a: (a, list(grammatical_gender_rft_batch([a]).keys())[0]),
    [
        "Baum",
        "Mädchen",
        "Fachkraft",
        "Manager",
        "Managerin",
        "Beamte",
        "Beamtinnen",
        "Leiter", # does not recognize gender of the second meaning
        "Butter",
        "Teller",
        "Kabbulmoffdi" # not a word, but recognized as 'Neut'
    ]
)), sep="\n")

('Baum', 'Masc')
('Mädchen', 'Neut')
('Fachkraft', 'Fem')
('Manager', 'Masc')
('Managerin', 'Fem')
('Beamte', 'Masc')
('Beamtinnen', 'Fem')
('Leiter', 'Masc')
('Butter', 'Fem')
('Teller', 'Masc')
('Kabbulmoffdi', 'Neut')


In [32]:

pd.DataFrame.from_dict(genders, orient="index")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3480,3481,3482,3483,3484,3485,3486,3487,3488,3489
Masc,Aasgeier,Abdecker,Abenteurer,Abgeordneter,Abkömmling,Abnicker,Absolutist,Abstinenzler,Abteilungsleiter,Abtrünniger,...,Zögling,Zöllner,Ökonom,Ösi,Österreicher,Übelmann,Übeltäter,Überbringer,Übermittler,Übungsleiter
Fem,Abgesandter,Abkomme,Adoptivtochter,Ahnfrau,Ahnherrin,Amtsperson,Angetraute,Angie,Ansprechperson,Arbeitskraft,...,,,,,,,,,,
Neut,Adoptivkind,Alter,Alterchen,Anerkennungsjahr,Arschloch,Assassine,Barbier,Berufsanerkennungsjahr,Berufspraktikum,Betthupferl,...,,,,,,,,,,


This is still far from perfect, but better than the Spacy model. Future work could use [deep-german](https://github.com/aakhundov/deep-german), but I have Mac-specific trouble installing it at the moment.

In [38]:
open("openthesaurus_persons_male_sg.csv", "w").write("\n".join(genders["Masc"]))

40027