In [1]:
import io
import re
import sys
from typing import *

import cache_magic
import pandas as pd
import requests

sys.path.insert(0, "../..")
from shared import add_to_dict, log

%cache magic is now registered in ipython


In [2]:
# Download the source spreadsheet as raw bytes. The %cache magic (registered by
# cache_magic) pickles `excel` and loads the cached value on later runs.
%cache excel = requests.get("https://geschicktgendern.de/download/1642/").content

loading cached value for variable 'excel'. Time since pickling  2:31:22.442596


In [3]:
# Keep a copy of the raw download on disk. The context manager closes the file
# handle deterministically instead of leaving that to garbage collection.
with open("geschicktgendern_raw.xlsx", "wb") as raw_file:
    raw_file.write(excel)

110002

In [4]:
# Parse the downloaded workbook. The first three rows are skipped and only
# spreadsheet columns 1 and 2 are kept, named "ungendered" and "gendered".
# The bytes are wrapped in BytesIO because handing raw bytes to read_excel is
# deprecated in recent pandas versions.
df = pd.read_excel(
    io.BytesIO(excel),
    header=None,
    names=["ungendered", "gendered"],
    skiprows=3,
    usecols=[1, 2],
)
# NOTE: a former `df.sort_values(by="ungendered")` statement was removed here.
# Its result was never assigned, so it had no effect on `df`.
df.head()

Unnamed: 0,ungendered,gendered
0,"<div id=""A""><b>A</b><div>",
1,Abbrecherquote,Abbruchquote
2,Abenteurer (sg.),Waghals; abenteuerliebende Person; abenteuerlu...
3,Abgänger,absolvierende Person; Abschluss innehabende Pe...
4,Abiturient,"Abitur ablegende Person; Person, die Abitur macht"


In [5]:
# Record the row count of the unfiltered sheet (printed in the statistics cell
# further down), then store an unmodified CSV snapshot of it.
dflen = df.shape[0]
df.to_csv("geschicktgendern_raw.csv", index=False)

We drop rows like the first one, where there is merely some HTML description but no value.

In [6]:
# Keep only the rows that actually carry a value in the "gendered" column.
df = df.dropna(subset=["gendered"])
df.head()

Unnamed: 0,ungendered,gendered
1,Abbrecherquote,Abbruchquote
2,Abenteurer (sg.),Waghals; abenteuerliebende Person; abenteuerlu...
3,Abgänger,absolvierende Person; Abschluss innehabende Pe...
4,Abiturient,"Abitur ablegende Person; Person, die Abitur macht"
5,Abkömmling,abstammende Person; nachkommende Person; Kind;...


Let's look at a more complicated row:

In [7]:
# Label-based lookup: show the row whose index label is 13.
df.loc[13]

ungendered            Absolventenvorsprechen [Schauspielschule]
gendered      <a href="https://geschicktgendern.de/kontakt">...
Name: 13, dtype: object

We want to get rid of such rows that contain any kind of special annotation, such as HTML or square brackets.

In [8]:
def is_simple_ungendered(a: str) -> bool:
    """Tell whether an ungendered entry is plain enough to serve as a rule key.

    An entry qualifies when it consists only of letters (including German
    umlauts and ß), spaces, dots and hyphens, optionally followed by a
    parenthesised annotation ending in ``sg.`` or ``pl.``, and when it does
    not contain an ellipsis (``...``).
    """
    if "..." in a:
        return False
    return re.match(r"^[A-ZÄÖÜa-zäöüß .\-]+(\(.*(sg|pl)\.\))? *$", a) is not None

In [9]:
def is_simple_gendered(a: str) -> bool:
    """Tell whether a single suggestion is free of special annotations.

    Only letters (including German umlauts and ß), spaces, dots, commas,
    slashes, parentheses and hyphens are allowed; the empty string is rejected.
    """
    simple_pattern = r"^[A-ZÄÖÜa-zäöüß .,/()\-]+$"
    return re.match(simple_pattern, a) is not None

Some rules relate only to singular or plural words. We want to assign each rule to singular, plural, or both:

In [10]:
def number(key: str) -> List[str]:
    """Return the grammatical numbers an ungendered entry applies to.

    An entry with a parenthesised annotation ending in ``sg.`` or ``pl.``
    (for example ``"Abenteurer (sg.)"``) applies to that number only. An entry
    without such an annotation applies to both.

    Args:
        key: The raw ungendered entry, including any annotation.

    Returns:
        ``["sg"]``, ``["pl"]`` or ``["sg", "pl"]``.
    """
    # Raw string: "\(" and "\." are invalid escape sequences in a plain literal.
    annotation = re.search(r"\(.*(sg|pl)\.\)", key)
    if annotation is None:
        return ["sg", "pl"]
    # Group 1 can only ever be "sg" or "pl".
    return [annotation[1]]

Moreover, for each suggestion entry we want to extract all the suggestions that are not too complicated:

In [11]:
def get_suggestions(entry: str) -> List[str]:
    """Split a "; "-separated suggestion entry and keep only the simple parts.

    A part is kept when ``is_simple_gendered`` accepts it; the original order
    is preserved.
    """
    simple_suggestions = []
    for candidate in entry.split("; "):
        if is_simple_gendered(candidate):
            simple_suggestions.append(candidate)
    return simple_suggestions

We want to remove the annotations like `(pl.)` from the ungendered words:

In [12]:
def clean(a: str) -> str:
    """Strip a parenthesised annotation such as ``(sg.)`` from an entry.

    Everything from the first ``(`` to the last ``)`` is removed, together
    with one directly preceding space if there is one. Entries without
    parentheses are returned unchanged.
    """
    # Raw string: "\(" and "\)" are invalid escape sequences in a plain literal.
    return re.sub(r" ?\(.*\)", "", a)

assert clean("Baum (sg.)") == "Baum"

In [13]:
records = df.to_records()

# Rule dictionary: grammatical number -> cleaned ungendered word -> suggestions.
# The values handed to add_to_dict are suggestion lists, not single strings, so
# the annotation uses List[str] (matching read_geschicktgendern_csv).
# NOTE(review): add_to_dict lives in `shared`; confirm it stores lists.
dic: Dict[str, Dict[str, List[str]]] = {"sg": {}, "pl": {}}
i = 0  # number of rows that were simple enough to become a rule
# Each record is (index label, ungendered, gendered); the index is not needed.
for (_, ungendered, gendered) in records:
    suggestions = get_suggestions(gendered)
    if not is_simple_ungendered(ungendered) or not suggestions:
        continue
    # Register the rule for every grammatical number it applies to.
    for n in number(ungendered):
        add_to_dict(clean(ungendered), suggestions, dic[n])
    i += 1

In [14]:
# Report how many rules survive each filtering stage.
for label, count in (
    ("#rules raw:", dflen),
    ("#rules after dropping NAs:", len(df)),
    ("#rules after filtering too complicated:", i),
):
    print(label, count)

#rules raw: 1819
#rules after dropping NAs: 1792
#rules after filtering too complicated: 1483


In [15]:
# Preview the singular rules as a table: one row per ungendered word, one
# column per suggestion. A dedicated name keeps this preview from overwriting
# `df`, the filtered source table that earlier cells read.
sg_preview = pd.DataFrame.from_dict(dic["sg"], orient="index")
sg_preview.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Abbrecherquote,Abbruchquote,,,,,,,,,,,
Abenteurer,Waghals,abenteuerliebende Person,abenteuerlustige Person,abenteuermutige Person,Abenteuermensch,,,,,,,
Abgänger,absolvierende Person,Abschluss innehabende Person,,,,,,,,,,
Abiturient,Abitur ablegende Person,"Person, die Abitur macht",,,,,,,,,,
Abkömmling,abstammende Person,nachkommende Person,Kind,Kindeskind,Person gleicher Abstammung,,,,,,,


We save this as CSV:

In [16]:
# Write one CSV per grammatical number. No header row is written; the
# ungendered word (the frame's index) becomes the first column.
for n in ("sg", "pl"):
    out_path = "geschicktgendern_{}.csv".format(n)
    df = pd.DataFrame.from_dict(dic[n], orient="index")
    df.to_csv(out_path, header=False)

We also provide a function that parses the CSV files back into a dictionary, so that they can easily be reused in other scripts:

In [17]:
def read_geschicktgendern_csv() -> Dict[str, Dict[str, List[str]]]:
    """Load the exported rule CSVs back into a nested dictionary.

    Reads ``geschicktgendern_sg.csv`` and ``geschicktgendern_pl.csv`` from the
    current working directory. Each row holds the ungendered word in the first
    column, followed by its suggestions.

    Returns:
        ``{"sg": {...}, "pl": {...}}``, where each inner dictionary maps an
        ungendered word to its list of suggestions. A file without any rows
        yields an empty inner dictionary.
    """
    dic: Dict[str, Dict[str, List[str]]] = {"sg": {}, "pl": {}}
    for n in ("sg", "pl"):
        try:
            df = pd.read_csv("geschicktgendern_{}.csv".format(n), header=None)
        except pd.errors.EmptyDataError:
            # Nothing to parse means there are no rules for this number.
            continue
        rows = df.set_index(0).T.to_dict("list")
        # Rows with fewer suggestions are padded with NaN (a float) when read
        # back, so only genuine strings are kept.
        dic[n] = {
            key: [val for val in vals if isinstance(val, str)]
            for key, vals in rows.items()
        }
    return dic

In [18]:
# Round-trip check: show the first five singular rules as read back from the CSV.
list(read_geschicktgendern_csv()["sg"].items())[:5]

[('Abbrecherquote', ['Abbruchquote']),
 ('Abenteurer',
  ['Waghals',
   'abenteuerliebende Person',
   'abenteuerlustige Person',
   'abenteuermutige Person',
   'Abenteuermensch']),
 ('Abgänger', ['absolvierende Person', 'Abschluss innehabende Person']),
 ('Abiturient', ['Abitur ablegende Person', 'Person, die Abitur macht']),
 ('Abkömmling',
  ['abstammende Person',
   'nachkommende Person',
   'Kind',
   'Kindeskind',
   'Person gleicher Abstammung'])]