# Creating the rule set

In [60]:
import re
import subprocess
import sys
from os import path
from typing import *
from xml.sax.saxutils import escape

sys.path.insert(0, "../..")
from copy_files import copy_files, languagetool_path
from shared import add_to_dict, csvs_to_dict, dict_to_csvs, log

In [61]:
def strip_spaces(a):
    """Normalise a phrase by dropping quotes, punctuation and surplus spaces.

    Single and double quotes are turned into spaces, the punctuation
    characters ``. , : ; ! ?`` are removed, runs of spaces are collapsed to a
    single space, and leading/trailing spaces are stripped.

    The previous implementation stored the result of the first substitution
    in an unused variable, so quotes and repeated spaces were never touched.

    Args:
        a: The raw phrase.

    Returns:
        The cleaned phrase.
    """
    without_quotes = re.sub("\"|'", " ", a)
    without_punctuation = re.sub("[.,:;!?]", "", without_quotes)
    # Collapse last: removing quotes/punctuation can leave adjacent spaces.
    collapsed = re.sub("  +", " ", without_punctuation)
    return collapsed.strip(" ")

In [62]:
# One merged dictionary per grammatical number, plus "any", which collects the
# entries of every number regardless of where they came from.
unified_dic: Dict[str, Dict[str, List[str]]] = {
    bucket: {} for bucket in ("sg", "pl", "any")
}

# Source CSV base names mapped to the grammatical numbers each one provides.
files = {
    "dereko/dereko_unified": ["sg"],
    "geschicktgendern/geschicktgendern": ["sg", "pl"],
    "openthesaurus/openthesaurus_persons_male": ["sg"],
    "vienna_catalog/vienna_catalog": ["sg", "pl"],
}
dic = {}

for file, numbers in files.items():
    loaded = csvs_to_dict(file, numbers=numbers)
    dic[file] = loaded
    for n in numbers:
        entries = loaded[n]
        print(file, n, len(entries))
        for key, vals in entries.items():
            # Register under the specific number first, then under "any".
            for target in (n, "any"):
                add_to_dict(key, vals, unified_dic[target])


sorted(unified_dic["sg"].items())[:10]

dereko/dereko_unified sg 86
geschicktgendern/geschicktgendern sg 1185
geschicktgendern/geschicktgendern pl 1241
openthesaurus/openthesaurus_persons_male sg 3490
vienna_catalog/vienna_catalog sg 33
vienna_catalog/vienna_catalog pl 449


[('Aasgeier', []),
 ('Abbrecherquote', ['Abbruchquote']),
 ('Abdecker', []),
 ('Abenteurer',
  ['Waghals',
   'abenteuerliebende Person',
   'abenteuerlustige Person',
   'abenteuermutige Person',
   'Abenteuermensch']),
 ('Abgeordneter', []),
 ('Abgänger', ['absolvierende Person', 'Abschluss innehabende Person']),
 ('Abiturient', ['Abitur ablegende Person', 'Person, die Abitur macht']),
 ('Abkömmling',
  ['abstammende Person',
   'nachkommende Person',
   'Kind',
   'Kindeskind',
   'Person gleicher Abstammung']),
 ('Abnicker', []),
 ('Abonnent',
  ['Bezugsperson',
   'medienbeziehende Person',
   'ein Abonnement beziehende Person'])]

## Custom rules

We also add some hand-written custom rules, inspired in part by the _retext-equality_ data set.

In [63]:
# Read the hand-written rules; the context manager closes the file handle and
# the explicit encoding keeps non-ASCII characters independent of the platform default.
with open(
    path.join("retext-equality", "custom_rules_disability.xml"), encoding="utf-8"
) as custom_rules_file:
    custom_xml = custom_rules_file.read()

## Conversion to proper LanguageTool XML format

The LanguageTool rule format is described in the [development overview](https://web.archive.org/web/20210910183442/https://dev.languagetool.org/development-overview) and in the [tips and tricks](https://dev.languagetool.org/tips-and-tricks) page.

We devise a function to convert a _geschickt gendern_ entry to an XML LanguageTool entry.

In [64]:
def startupper(s: str) -> str:
    """Return ``s`` with its first character capitalised and the rest unchanged.

    Unlike ``str.capitalize`` this does not lower-case the remainder, so
    nouns inside a phrase keep their capital letters. An empty string is
    returned unchanged (slicing instead of indexing avoids an ``IndexError``).
    """
    return s[:1].capitalize() + s[1:]


assert startupper("absolvierende Person") == "Absolvierende Person"
assert startupper("") == ""

In [65]:
def id(pattern, number):
    """Build a rule id from a pattern and its grammatical number.

    The pattern and number are joined with an underscore, whitespace is
    replaced by underscores, and every character that is not a German letter
    or an underscore is dropped (digits and hyphens included).

    NOTE(review): this name shadows the built-in ``id``; it is kept because
    other cells call it under this name.

    Args:
        pattern: The word or phrase the rule matches.
        number: The number tag appended to the id (e.g. "sg", "pl", "any").

    Returns:
        The sanitised id string.
    """
    # Raw strings: "\s" in a plain string literal is an invalid escape sequence.
    id_ = re.sub(r"\s", "_", pattern + "_" + number)
    return re.sub(r"[^A-ZÄÖÜa-zäöüß_]", "", id_)

In [66]:
def postag_attributes(number):
    """Return the ``<token>`` POS-tag attributes for a grammatical number.

    Args:
        number: "sg", "pl" or "any".

    Returns:
        An attribute string (with a trailing space) restricting the token to
        singular or plural readings, or an empty string for "any".

    Raises:
        ValueError: If ``number`` is not one of the supported values.
            Previously the function fell through and returned ``None``, which
            callers would format into the XML as the literal text "None".
    """
    attributes_by_number = {
        "any": "",
        "sg": 'postag=".*:SIN:.*" postag_regexp="yes" ',
        "pl": 'postag=".*:PLU:.*" postag_regexp="yes" ',
    }
    try:
        return attributes_by_number[number]
    except (KeyError, TypeError):
        raise ValueError(
            "Unknown grammatical number: {!r}".format(number)
        ) from None

In [67]:
# Template for one LanguageTool <rule> element, filled in with str.format().
# Placeholders: {id}, {pattern}, {antipatterns}, {replaced_tokens},
# {suggestions} and {corrections}. The German <message> tells the user that the
# generic masculine does not evoke all genders equally and offers the
# suggestions as neutral alternatives.
xml_template = """
    <rule id="{id}" name="{pattern}">
        {antipatterns}
        <pattern>{replaced_tokens}</pattern>
        <message>Mit dem generischen Maskulinum werden nicht alle Geschlechter gleichermaßen assoziiert. Vielleicht passt einer der folgenden neutralen Begriffe besser: {suggestions}</message>
        <short>Generisches Maskulinum</short>
        <example correction="{corrections}"><marker>{pattern}</marker></example>
    </rule>
    """

In [68]:
def _tokens_xml(words: Iterable[str], attributes: str, separator: str) -> str:
    """Render each word as an inflected ``<token>`` element and join them."""
    return separator.join(
        '<token inflected="yes" {}>{}</token>'.format(attributes, escape(word))
        for word in words
    )


def _antipattern_xml(words: Iterable[str], attributes: str) -> str:
    """Wrap the tokens for ``words`` in a single ``<antipattern>`` element."""
    return "<antipattern>\n\t\t{}\n\t\t</antipattern>".format(
        _tokens_xml(words, attributes, "\n\t\t")
    )


def rule_to_xml(pattern: str, number: str, suggestions: List[str]) -> str:
    """Convert one dictionary entry into a LanguageTool ``<rule>`` element.

    The rule matches every space-separated token of ``pattern`` (inflected,
    restricted to ``number``) and offers ``suggestions`` as replacements.
    Each suggestion also becomes an antipattern, so text that already uses a
    suggested phrase is not flagged. For ``number == "any"`` an additional
    antipattern excludes tokens tagged as singular or plural, so the "any"
    rule stays silent whenever the number is known.

    All text taken from the dictionaries is XML-escaped before it is
    interpolated; previously a ``&``, ``<`` or ``"`` in a pattern or
    suggestion would have produced a malformed rule file.

    Args:
        pattern: The word or phrase to flag.
        number: "sg", "pl" or "any".
        suggestions: Neutral alternatives for ``pattern``.

    Returns:
        The filled-in ``xml_template`` as a string.
    """
    # Values placed inside attribute values also need their quotes escaped.
    quote_entity = {'"': "&quot;"}
    attributes = postag_attributes(number)

    replaced_tokens = _tokens_xml(pattern.split(" "), attributes, "")
    suggestions_ = ",\n\t\t".join(
        "<suggestion>{}</suggestion>".format(escape(s)) for s in suggestions
    )
    antipatterns = "\n\t\t".join(
        _antipattern_xml(re.findall(r"\w+|[.,:;*_·/]", s), attributes)
        for s in suggestions
    )
    # Avoid that the "any number" rule is activated when the number is known:
    if number == "any":
        antipatterns += _antipattern_xml(
            pattern.split(" "), 'postag=".*:(SIN|PLU):.*" postag_regexp="yes"'
        )
    corrections = "|".join(startupper(s) for s in suggestions)

    return xml_template.format(
        id=id(pattern, number),
        pattern=escape(pattern, quote_entity),
        antipatterns=antipatterns,
        replaced_tokens=replaced_tokens,
        suggestions=suggestions_,
        corrections=escape(corrections, quote_entity),
    )

In [69]:
# Spot check: render the plural rule for "Angreifer" to inspect the generated XML.
print(rule_to_xml("Angreifer", "pl", unified_dic["pl"]["Angreifer"]))


    <rule id="Angreifer_pl" name="Angreifer">
        <antipattern>
		<token inflected="yes" postag=".*:PLU:.*" postag_regexp="yes" >Angreifende</token>
		</antipattern>
        <pattern><token inflected="yes" postag=".*:PLU:.*" postag_regexp="yes" >Angreifer</token></pattern>
        <message>Mit dem generischen Maskulinum werden nicht alle Geschlechter gleichermaßen assoziiert. Vielleicht passt einer der folgenden neutralen Begriffe besser: <suggestion>Angreifende</suggestion></message>
        <short>Generisches Maskulinum</short>
        <example correction="Angreifende"><marker>Angreifer</marker></example>
    </rule>
    


In [70]:
# Spot check: the "any" variant carries an extra antipattern for tokens whose
# number (SIN/PLU) is known.
print(rule_to_xml("Angreifer", "any", unified_dic["any"]["Angreifer"]))


    <rule id="Angreifer_any" name="Angreifer">
        <antipattern>
		<token inflected="yes" >angreifende</token>
		<token inflected="yes" >Person</token>
		</antipattern>
		<antipattern>
		<token inflected="yes" >Angreifende</token>
		</antipattern><antipattern>
		<token inflected="yes" postag=".*:(SIN|PLU):.*" postag_regexp="yes">Angreifer</token>
		</antipattern>
        <pattern><token inflected="yes" >Angreifer</token></pattern>
        <message>Mit dem generischen Maskulinum werden nicht alle Geschlechter gleichermaßen assoziiert. Vielleicht passt einer der folgenden neutralen Begriffe besser: <suggestion>angreifende Person</suggestion>,
		<suggestion>Angreifende</suggestion></message>
        <short>Generisches Maskulinum</short>
        <example correction="Angreifende Person|Angreifende"><marker>Angreifer</marker></example>
    </rule>
    


In [71]:
# Assemble the rule file: the hand-written rules first, then one section of
# generated rules per grammatical number ("any" last, as before).
xml = custom_xml
for number in ["sg", "pl", "any"]:
    xml += "\n\n" + "".join(
        rule_to_xml(key, number, val) for key, val in unified_dic[number].items()
    )

# Write with an explicit encoding (the rules contain umlauts) and close the
# handle deterministically instead of leaking it.
with open("grammar_openminded.xml", "w", encoding="utf-8") as grammar_file:
    chars_written = grammar_file.write(xml)

# Keep the number of characters written as the cell output.
chars_written

7616603

## Injecting the rules into the existing LanguageTool rule file

In [72]:
# copy_files comes from the project-local copy_files module; presumably it
# installs the generated grammar file into the LanguageTool checkout at
# languagetool_path — TODO confirm against that module.
copy_files()

## Validating and using the rules

Running the LanguageTool rule validation:

In [73]:
# result = subprocess.run(["./testrules.sh", "de"], cwd=languagetool_path, capture_output=True)

In [None]:
# stdout = result.stdout.decode("UTF-8")
# stderr = result.stderr.decode("UTF-8")
# open("rule_validation_stdout.log", "w").write(stdout)
# open("rule_validation_stderr.log", "w").write(stderr)

Starting LanguageTool:

In [None]:
# subprocess.run(["java", "-jar", path.join(languagetool_path, "languagetool.jar")])