In [59]:
import io
import re
import sys
from pathlib import Path
from typing import *

import pandas as pd
import requests

sys.path.insert(0, "..")
from shared import add_to_dict, csvs_to_dict, dict_to_csvs, log

In [60]:
# Download the spreadsheet. The timeout keeps the notebook from hanging on a
# stalled connection, and raise_for_status() prevents an HTTP error page from
# being saved and parsed as if it were the Excel file.
response = requests.get("https://geschicktgendern.de/download/1642/", timeout=60)
response.raise_for_status()
excel = response.content

In [61]:
# Path.write_bytes opens, writes and closes the file in one call (a bare
# open(...).write(...) leaves closing to the garbage collector) and returns
# the number of bytes written, so the cell still displays the file size.
Path("geschicktgendern_raw.xlsx").write_bytes(excel)

110002

In [62]:
# Parse the downloaded workbook: skip the first three rows and read sheet
# columns 1 and 2 as the ungendered term and its gendered alternatives.
# The bytes are wrapped in BytesIO because handing raw bytes to read_excel is
# deprecated in recent pandas releases.
df = pd.read_excel(
    io.BytesIO(excel),
    header=None,
    names=["ungendered", "gendered"],
    skiprows=3,
    usecols=[1, 2],
)
# Rows stay in spreadsheet order. A bare `df.sort_values(...)` returns a new
# frame and leaves `df` untouched, so that no-op call is not kept here.
df.head()

Unnamed: 0,ungendered,gendered
0,"<div id=""A""><b>A</b><div>",
1,Abbrecherquote,Abbruchquote
2,Abenteurer (sg.),Waghals; abenteuerliebende Person; abenteuerlu...
3,Abgänger,absolvierende Person; Abschluss innehabende Pe...
4,Abiturient,"Abitur ablegende Person; Person, die Abitur macht"


In [63]:
# Remember the unfiltered row count for the summary printed further down,
# and keep a CSV copy of the raw table (without the index column).
dflen = df.shape[0]
df.to_csv("geschicktgendern_raw.csv", index=False)

We drop rows like the first one, where there is merely some HTML description but no value.

In [64]:
# Keep only the rows that actually carry a value in the "gendered" column.
has_suggestion = df["gendered"].notna()
df = df.loc[has_suggestion]
df.head()

Unnamed: 0,ungendered,gendered
1,Abbrecherquote,Abbruchquote
2,Abenteurer (sg.),Waghals; abenteuerliebende Person; abenteuerlu...
3,Abgänger,absolvierende Person; Abschluss innehabende Pe...
4,Abiturient,"Abitur ablegende Person; Person, die Abitur macht"
5,Abkömmling,abstammende Person; nachkommende Person; Kind;...


Let's look at a more complicated row:

In [65]:
# .loc selects by index label, so this shows the row labelled 13.
df.loc[13]

ungendered            Absolventenvorsprechen [Schauspielschule]
gendered      <a href="https://geschicktgendern.de/kontakt">...
Name: 13, dtype: object

We want to get rid of rows like this one, which contain some kind of special annotation such as HTML or square brackets.

In [66]:
def is_simple_ungendered(a: str) -> bool:
    """Tell whether an ungendered entry is plain enough to become a rule.

    An entry qualifies when it consists only of (German) letters, spaces,
    dots and hyphens, optionally followed by a parenthesised note ending in
    ``sg.`` or ``pl.`` and trailing spaces, and when it contains no
    ellipsis (``...``).
    """
    simple_pattern = r"^[A-ZÄÖÜa-zäöüß .\-]+(\(.*(sg|pl)\.\))? *$"
    if re.match(simple_pattern, a) is None:
        return False
    # Dots are allowed by the pattern, so ellipses must be excluded separately.
    return "..." not in a

In [67]:
def is_simple_gendered(a: str) -> bool:
    """Tell whether a gendered suggestion is free of special markup.

    Only (German) letters, spaces, dots, commas, slashes, parentheses and
    hyphens are accepted; the empty string is rejected.
    """
    allowed_chars_only = r"^[A-ZÄÖÜa-zäöüß .,/()\-]+$"
    found = re.match(allowed_chars_only, a)
    return found is not None

Some rules apply only to singular or only to plural words. We want to assign each rule to singular, plural, or both:

In [68]:
def number(key: str) -> str:
    """Classify an ungendered entry by grammatical number.

    Returns ``"sg"`` or ``"pl"`` when ``key`` contains a parenthesised
    annotation ending in ``sg.`` or ``pl.`` (e.g. ``"Abenteurer (sg.)"``),
    and ``"any"`` otherwise.
    """
    # Raw string: "\(" is not a valid escape in an ordinary string literal and
    # triggers a warning on current Python versions.
    annotation = re.search(r"\(.*(sg|pl)\.\)", key)
    # Group 1 is the captured "sg" / "pl" marker.
    return annotation[1] if annotation else "any"

Moreover, for each suggestion entry we want to extract all the suggestions that are not too complicated:

In [69]:
def get_suggestions(entry: str) -> List[str]:
    """Split a ``"; "``-separated suggestion entry and keep the simple parts.

    Only parts accepted by ``is_simple_gendered`` are returned, in their
    original order.
    """
    candidates = entry.split("; ")
    return list(filter(is_simple_gendered, candidates))

We want to remove the annotations like `(pl.)` from the ungendered words:

In [70]:
def clean(a: str) -> str:
    """Strip a parenthesised annotation such as ``(sg.)`` from an entry.

    The match is greedy: it runs from an opening ``(`` to the last ``)`` and
    also removes a single space directly in front of the parenthesis.
    Entries without parentheses are returned unchanged.
    """
    # Raw string so that "\(" reaches the regex engine verbatim; in an ordinary
    # string literal it is an invalid escape sequence and triggers a warning.
    return re.sub(r" ?\(.*\)", "", a)


assert clean("Baum (sg.)") == "Baum"
assert clean("Baum") == "Baum"

In [71]:
# One rule dictionary per grammatical number ("any" = no sg./pl. annotation).
# NOTE(review): the inner values receive lists of suggestions, so the
# Dict[str, str] annotation looks too narrow; confirm against add_to_dict.
dic: Dict[str, Dict[str, str]] =  {"any": {}, "sg": {}, "pl": {}}
i = 0  # number of rows that were turned into a rule
records = df.to_records()
for record in records:
    # Each record is (index label, ungendered, gendered).
    _, ungendered, gendered = record
    suggestions = get_suggestions(gendered)
    if not is_simple_ungendered(ungendered) or len(suggestions) == 0:
        # Entry is annotated or has no usable suggestion: skip it.
        continue
    add_to_dict(clean(ungendered), suggestions, dic[number(ungendered)])
    i += 1

In [72]:
# Report how many rows survive each filtering stage.
for label, count in (
    ("#rules raw:", dflen),
    ("#rules after dropping NAs:", len(df)),
    ("#rules after filtering too complicated:", i),
):
    print(label, count)

#rules raw: 1819
#rules after dropping NAs: 1792
#rules after filtering too complicated: 1483


We save this as CSV:

In [73]:
# Persist the rule dictionaries via the shared helper.
# NOTE(review): presumably writes one CSV per number key using the
# "geschicktgendern" name; confirm in shared.dict_to_csvs.
dict_to_csvs(dic, "geschicktgendern")

We also provide a function that parses the CSV files back into a dictionary, so the rules can easily be reused in other scripts:

In [74]:
# Load the dictionaries back through the shared helper (this rebinds `dic`)
# and peek at the first five singular-only rules.
dic = csvs_to_dict("geschicktgendern")
list(dic["sg"].items())[:5]

[('Abenteurer',
  ['Waghals',
   'abenteuerliebende Person',
   'abenteuerlustige Person',
   'abenteuermutige Person',
   'Abenteuermensch']),
 ('Abteilungsleiter', ['Abteilungsleitung']),
 ('Alkoholiker', ['alkoholsüchtige Person', 'Person mit Alkoholsucht']),
 ('Alleinerziehende',
  ['Ein-Eltern-Haushalt', 'Ein-Eltern-Familie', 'alleinerziehende Person']),
 ('Amtsinhaber',
  ['Amt innehabende Person',
   'eine mit einem Amt betraute Person',
   'amtierende Person im Bereich XY/ in Position XY',
   'amtsverantwortliche Person für den Bereich XY'])]