In [54]:
from os import path
from typing import *
import itertools
import pandas as pd
import re
import spacy
import subprocess
import sys

sys.path.insert(0, "..")
from helpers import add_to_dict, log
from helpers_csv import csvs_to_dict, dict_to_csvs

We download a MySQL dump from [OpenThesaurus](https://www.openthesaurus.de/about/download) and run the queries given in `queries.sql` against it. The results are saved in the `query_results` folder.

In [55]:
iterations = []
for i in range(0, 7):
    df = pd.read_csv(path.join("query_results", "{}_iterations.csv".format(i)))
    previous = set(itertools.chain(*iterations))
    values = set(df["word"].values.tolist()).difference(previous)
    values = list(filter(lambda x: re.match(r"^[A-ZäÖÜ][a-zäöüß\-]+$", x), values))
    iterations.append(values)

print(list(map(len, iterations)))
words = sorted(list(itertools.chain(*iterations)))

[1776, 1862, 577, 90, 0, 0, 0]


In [56]:
open("openthesaurus_persons.csv", "w").write("\n".join(words))

48635

In [57]:
nlp = spacy.load("de_core_news_sm")


def grammatical_gender(s: str) -> str:
    return nlp(s)[0].morph.get("Gender")


print(
    *list(
        map(
            lambda a: (a, grammatical_gender(a)),
            [
                "Baum",
                "Mädchen",
                "Fachkraft",
                "Manager",
                "Managerin",
                "Beamte",
                "Beamtinnen",
                "Leiter",  # does not recognize gender of the second meaning
                "Butter",  # recognized incorrectly as 'Masc'
                "Teller",  # not recognized
                "Kabbulmoffdi",  # not a word, but recognized as 'Masc'
            ],
        )
    ),
    sep="\n"
)

('Baum', ['Masc'])
('Mädchen', ['Neut'])
('Fachkraft', ['Fem'])
('Manager', ['Masc'])
('Managerin', ['Fem'])
('Beamte', ['Masc'])
('Beamtinnen', ['Fem'])
('Leiter', ['Masc'])
('Butter', ['Masc'])
('Teller', [])
('Kabbulmoffdi', ['Masc'])


The grammatical gender detection of the chosen model is not very good in general, but since it is trained on news texts, it is hopefully good enough on person words.

In [58]:
genders = {}
for word in words:
    for gender in grammatical_gender(word):
        add_to_dict(gender, [word], genders)

In [59]:
pd.DataFrame.from_dict(genders, orient="index")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2589,2590,2591,2592,2593,2594,2595,2596,2597,2598
Masc,Aas,Aasgeier,Abdecker,Abenteurer,Abgeordneter,Abgesandter,Abgeschobener,Abkömmling,Abnicker,Abschiebehäftling,...,Zögerer,Zögling,Zöllner,Ökonom,Ösi,Österreicher,Übeltäter,Überbringer,Übermittler,Übungsleiter
Neut,Abkomme,Abstinent,Abzuschiebender,Adelige,Adept,Adonis,Afghane,Allesbesserwisser,Alter,Amtsleiter,...,,,,,,,,,,
Fem,Ablegat,Ahne,Ahnfrau,Ahnherrin,Akrobat,Almerin,Alterchen,Angie,Anlerntätigkeit,Anthropophage,...,,,,,,,,,,


In [60]:
def grammatical_gender_rft_batch(tokens: List[str]) -> Dict[str, str]:
    rftagger_path = "../lib/RFTagger"
    temp_file = "test/temp.txt"
    open(path.join(rftagger_path, temp_file), "w").write("\n".join(tokens))
    result = subprocess.run(
        ["src/rft-annotate", "lib/german.par", temp_file],
        cwd=rftagger_path,
        capture_output=True,
    )
    result = result.stdout.decode("UTF-8")
    dic = {}
    for line in result.split("\n"):
        matches = re.findall(r"^.*\t", line)
        if len(matches) > 0 and len(matches[0]) > 1:
            word = matches[0][:-1]
            # spacy_genders = grammatical_gender(word)
            rft_genders = re.findall(r"Masc|Fem|Neut", line)
            if len(rft_genders) == 1:
                rft_gender = rft_genders[0]
                # spacy_gender = spacy_genders[0]
                # if rft_gender == spacy_gender:
                add_to_dict(rft_gender, [word], dic)
    return dic


genders = grammatical_gender_rft_batch(words)

In [61]:
print(
    *list(
        map(
            lambda a: (a, list(grammatical_gender_rft_batch([a]).keys())),
            [
                "Baum",
                "Mädchen",
                "Fachkraft",
                "Manager",
                "Managerin",
                "Beamte",
                "Beamtinnen",
                "Leiter",  # does not recognize gender of the second meaning
                "Butter",
                "Teller",
                "Kabbulmoffdi",  # not a word, but recognized as 'Neut'
            ],
        )
    ),
    sep="\n"
)

('Baum', ['Masc'])
('Mädchen', ['Neut'])
('Fachkraft', ['Fem'])
('Manager', ['Masc'])
('Managerin', ['Fem'])
('Beamte', ['Masc'])
('Beamtinnen', ['Fem'])
('Leiter', ['Masc'])
('Butter', ['Fem'])
('Teller', ['Masc'])
('Kabbulmoffdi', ['Neut'])


In [62]:
pd.DataFrame.from_dict(genders, orient="index")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3477,3478,3479,3480,3481,3482,3483,3484,3485,3486
Masc,Aasgeier,Abdecker,Abenteurer,Abgeordneter,Abkömmling,Abnicker,Absolutist,Abstinenzler,Abteilungsleiter,Abtrünniger,...,Zögling,Zöllner,Ökonom,Ösi,Österreicher,Übelmann,Übeltäter,Überbringer,Übermittler,Übungsleiter
Fem,Abgesandter,Abkomme,Adoptivtochter,Ahnfrau,Ahnherrin,Amtsperson,Angetraute,Angie,Ansprechperson,Arbeitskraft,...,,,,,,,,,,
Neut,Adoptivkind,Alter,Alterchen,Anerkennungsjahr,Arschloch,Assassine,Barbier,Berufsanerkennungsjahr,Berufspraktikum,Betthupferl,...,,,,,,,,,,


This is still far from perfect, but better than the Spacy model. Future work could use [deep-german](https://github.com/aakhundov/deep-german), but I have Mac-specific trouble installing it at the moment.

In [63]:
open("openthesaurus_persons_male_sg.csv", "w").write("\n".join(genders["Masc"]))

39970

Next, we use opentheasurus once more to retrieve synonyms for the male-person words that we have found above. We create a new table `male_persons` with the single column `male_person`and import `openthesaurus_persons_male_sg.csv`. Then we run, and save the result in `query_results/synonyms.csv`:

```sql
select mp.male_person, t2.word as synonym from 
male_persons mp
join term t1 on mp.male_person = t1.word
join term t2 on t1.synset_id = t2.synset_id;
```

In [64]:
df = pd.read_csv(path.join("query_results", "synonyms.csv"))
df.head()

Unnamed: 0,male_person,synonym
0,Urmensch,Mensch der Altsteinzeit
1,Urmensch,Urmensch
2,Auftraggeber,Auftraggeber
3,Auftraggeber,Kunde
4,Auftraggeber,Mandant


In [65]:
synonyms = df.to_records()
synonyms[:10]

rec.array([(0, 'Urmensch', 'Mensch der Altsteinzeit'),
           (1, 'Urmensch', 'Urmensch'),
           (2, 'Auftraggeber', 'Auftraggeber'),
           (3, 'Auftraggeber', 'Kunde'), (4, 'Auftraggeber', 'Mandant'),
           (5, 'Auftraggeber', 'Adressat'), (6, 'Kunde', 'Auftraggeber'),
           (7, 'Kunde', 'Kunde'), (8, 'Kunde', 'Mandant'),
           (9, 'Kunde', 'Adressat')],
          dtype=[('index', '<i8'), ('male_person', 'O'), ('synonym', 'O')])

In [66]:
synonyms_by_gender = grammatical_gender_rft_batch([s for _, _, s in synonyms])

In [None]:
synonyms_nonmale = {}
for _, male, synonym in synonyms:
    if synonym in synonyms_by_gender["Fem"] or synonym in synonyms_by_gender["Neut"]:
        add_to_dict(male, [synonym], synonyms_nonmale)

# for a, b in synonyms_nonmale.items():
#     print(a, b)