# Creating the rule set

In [5]:
from os import path
from typing import *
import re
import sys

sys.path.insert(0, "../..")
from shared import add_to_dict, csvs_to_dict, dict_to_csvs, log

In [6]:
def strip_spaces(a):
    """Normalise spacing, quotes and punctuation in the string *a*.

    Two passes are applied in order:

    1. Every run of two or more spaces and every single or double quote
       character is replaced by one space.
    2. One leading space, one trailing space and each of the punctuation
       characters ``.,:;!?`` are removed.

    Returns the cleaned string.
    """
    s = re.sub("  +|\"|'", " ", a)
    # The second pass must run on the result of the first one; running it on
    # the raw input would throw the space/quote normalisation away.
    return re.sub("^ | $|[.,:;!?]", "", s)

In [20]:
# Merged lexicon: one mapping per grammatical number ("sg" / "pl").
unified_dic: Dict[str, Dict[str, List[str]]] = {"sg": {}, "pl": {}}

# Source files to merge, each with the grammatical numbers requested from it.
files = {
    "dereko/dereko_unified": ["sg"],
    "geschicktgendern/geschicktgendern": ["sg", "pl"],
    "openthesaurus/openthesaurus_persons_male": ["sg"],
    "vienna_catalog/vienna_catalog": ["sg", "pl"],
}

for file, numbers in files.items():
    dic = csvs_to_dict(file, numbers=numbers)
    # print(sorted(list(dic["sg"].items()))[:10])
    for n in ("sg", "pl"):
        entries = dic[n]
        # Report how many entries this source contributes for this number.
        print(file, n, len(entries))
        merged = unified_dic[n]
        for word, alternatives in entries.items():
            add_to_dict(word, alternatives, merged)

# Preview the first ten singular entries in alphabetical order.
sorted(unified_dic["sg"].items())[:10]

dereko/dereko_unified sg 86
dereko/dereko_unified pl 0
geschicktgendern/geschicktgendern sg 1185
geschicktgendern/geschicktgendern pl 1241
openthesaurus/openthesaurus_persons_male sg 3490
openthesaurus/openthesaurus_persons_male pl 0
vienna_catalog/vienna_catalog sg 33
vienna_catalog/vienna_catalog pl 449


[('Aasgeier', []),
 ('Abbrecherquote', ['Abbruchquote']),
 ('Abdecker', []),
 ('Abenteurer',
  ['Waghals',
   'abenteuerliebende Person',
   'abenteuerlustige Person',
   'abenteuermutige Person',
   'Abenteuermensch']),
 ('Abgeordneter', []),
 ('Abgänger', ['absolvierende Person', 'Abschluss innehabende Person']),
 ('Abiturient', ['Abitur ablegende Person', 'Person, die Abitur macht']),
 ('Abkömmling',
  ['abstammende Person',
   'nachkommende Person',
   'Kind',
   'Kindeskind',
   'Person gleicher Abstammung']),
 ('Abnicker', []),
 ('Abonnent',
  ['Bezugsperson',
   'medienbeziehende Person',
   'ein Abonnement beziehende Person'])]

## Custom rules

We add some custom rules that we have written ourselves, inspired in part by the _retext-equality_ data set. 

In [None]:
# Load the hand-written rules; the context manager makes sure the file handle
# is closed as soon as the content has been read.
with open(path.join("retext-equality", "custom_rules_disability.xml")) as custom_rules_file:
    custom_xml = custom_rules_file.read()

NameError: name 'data_dir' is not defined

## Conversion to proper LanguageTool XML format

The LanguageTool rule format is described [over here](https://web.archive.org/web/20210910183442/https://dev.languagetool.org/development-overview) and [here](https://dev.languagetool.org/tips-and-tricks).

We devise a function to convert a _geschickt gendern_ entry to an XML LanguageTool entry.

In [None]:
def startupper(s: str) -> str:
    """Return *s* with its first character capitalized and the rest unchanged.

    An empty string is returned as-is.
    """
    # Slicing (instead of indexing with s[0]) avoids an IndexError when s is empty.
    return s[:1].capitalize() + s[1:]

assert startupper("absolvierende Person") == "Absolvierende Person"

In [None]:
def rule_to_xml(pattern: str, numerus: str, suggestions: List[str]) -> str:
    """Build the XML text of one LanguageTool ``<rule>`` element.

    Args:
        pattern: The term to flag; it is split on single spaces and each part
            becomes one inflected ``<token>`` of the rule's ``<pattern>``.
        numerus: ``"sg"`` or ``"pl"``; selects the ``SIN`` or ``PLU`` part-of-speech
            tag filter applied to every pattern token.
        suggestions: Alternative wordings. Each one is emitted as a
            ``<suggestion>``, as an ``<antipattern>`` (so the alternative itself
            is not flagged) and, with its first letter capitalized, as part of
            the example's ``correction`` attribute.

    Returns:
        The rule as an XML string (starting with a newline, indented by four
        spaces).

    Raises:
        ValueError: If *numerus* is neither ``"sg"`` nor ``"pl"``.
    """
    from xml.sax.saxutils import escape

    postag_by_numerus = {"sg": "SIN", "pl": "PLU"}
    if numerus not in postag_by_numerus:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "numerus must be 'sg' or 'pl', got {!r}".format(numerus))
    postag_attributes = 'postag=".*:{}:.*" postag_regexp="yes" '.format(
        postag_by_numerus[numerus])

    # The rule id may only consist of (German) letters and underscores.
    rule_id = re.sub(r"\s", "_", pattern + "_" + numerus)
    rule_id = re.sub("[^A-ZÄÖÜa-zäöüß_]", "", rule_id)

    # Text taken from the data is XML-escaped so that characters such as
    # "&" or "<" cannot produce a malformed rule file.
    quote_entity = {'"': "&quot;"}
    replaced_tokens = "".join([
        '<token inflected="yes" {}>{}</token>'.format(postag_attributes, escape(token))
        for token in pattern.split(" ")])
    suggestions_ = ",\n\t\t".join(
        ["<suggestion>{}</suggestion>".format(escape(s)) for s in suggestions])
    # The tokenizer regex only yields word characters and a fixed set of
    # punctuation marks, none of which need escaping.
    antipatterns = "\n\t\t".join(
        ["<antipattern>\n\t\t{}\n\t\t</antipattern>".format("\n\t\t".join(
            ['<token inflected="yes">{}</token>'.format(token)
             for token in re.findall(r"\w+|[.,:;*_·/]", s)]
        )) for s in suggestions])
    corrections = escape("|".join([startupper(s) for s in suggestions]), quote_entity)
    return """
    <rule id="{id}" name="{name}">
        {antipatterns}
        <pattern>{replaced_tokens}</pattern>
        <message>
        Mit dem generischen Maskulinum werden nicht alle Geschlechter gleichermaßen assoziiert. Vielleicht passt einer der folgenden neutralen Begriffe besser: 
        {suggestions}
        </message>
        <short>Generisches Maskulinum</short>
        <example correction="{corrections}"><marker>{marker}</marker></example>
    </rule>
    """.format(
        id=rule_id,
        name=escape(pattern, quote_entity),
        marker=escape(pattern),
        antipatterns=antipatterns,
        replaced_tokens=replaced_tokens,
        suggestions=suggestions_,
        corrections=corrections,
    )

In [None]:
# print(rule_to_xml("Wiener", "pl", data["sg"]["pl"]))

In [None]:
# Assemble the final rule text: the hand-written rules first, then the
# generated singular rules, then the generated plural rules, each section
# separated by a blank line.
# NOTE(review): `data` is not defined in the visible cells; presumably it
# refers to the merged dictionary built earlier, so confirm before running.
sections = [custom_xml]
for numerus in ("sg", "pl"):
    generated_rules = [
        rule_to_xml(key, numerus, val)
        for key, val in data[numerus].items()
    ]
    sections.append("".join(generated_rules))
xml = "\n\n".join(sections)

## Injecting the rules into the existing LanguageTool rule file

In [None]:
custom_filename = "grammar_custom.xml"
# NOTE(review): `data_dir` and `copy_files` are not defined or imported in the
# visible cells, so confirm where they come from before running this cell.
# The context manager flushes and closes the file before the copy step runs.
with open(path.join(data_dir, custom_filename), "w") as grammar_file:
    grammar_file.write(xml)
copy_files.copy_files()

## Validating and using the rules

Running the LanguageTool rule validation:

In [None]:
# subprocess.run(["./testrules.sh", "de"], cwd=languagetool_path)

Starting LanguageTool:

In [None]:
# subprocess.run(["java", "-jar", path.join(languagetool_path, "languagetool.jar")])