Badwords, informals, and words_to_watch for zh
Also adds pre-processing to regex matches so that traditional Chinese can be
converted to simplified Chinese and the simplified-script regexes still match.
halfak committed May 18, 2019
1 parent e469800 commit ec65506
Showing 8 changed files with 328 additions and 21 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,6 +3,7 @@ deltas >= 0.4.6, < 0.4.999
docopt >= 0.6.2, < 0.6.999
flake8 >= 3.3.0, < 3.3.999
gensim >= 2.3.3, < 3.3.999
hanziconv >= 0.3.2, < 0.3.999
mmh3 >= 2.3.1, < 2.3.999
more-itertools == 2.2
mwapi >= 0.5.0, < 0.5.999
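The new hanziconv dependency supplies the traditional-to-simplified conversion that the rest of this commit builds on. A minimal illustrative sketch (the sample strings are not from the codebase):

    from hanziconv import HanziConv

    # Fold traditional-script text into simplified script before matching.
    HanziConv.toSimplified("他媽的")   # -> '他妈的'
    HanziConv.toTraditional("他妈的")  # -> '他媽的'
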
5 changes: 5 additions & 0 deletions revscoring/languages/__init__.py
@@ -36,6 +36,11 @@
.. automodule:: revscoring.languages.catalan
:members:
chinese
+++++++
.. automodule:: revscoring.languages.chinese
:members:
croatian
++++++++
.. automodule:: revscoring.languages.croatian
108 changes: 108 additions & 0 deletions revscoring/languages/chinese.py
@@ -0,0 +1,108 @@
from hanziconv import HanziConv

from .features import RegexMatches

name = "chinese"

badword_regexes = list(map(HanziConv.toSimplified, [
r"王八蛋", # son of a bitch
r"他媽的", # "his mother's"
r"去你媽", # "to your mother"
r"去你的", # "to yours"
r"婊子", r"妓女", # prostitute
r"日了?狗", # lonely dog
r"屁眼", r"混蛋", # asshole
r"渾蛋", # zh-hant of previous
r"混帳", # variant of above
r"王八", # bitch
r"白癡", # idiot
r"腦殘", # brain dead
r"智障", # mentally retarded
r"婊", r"妓", # prostitute
r"屎", # shit
r"屌", # dick
r"妈逼", # (this is verbal but definitely bad)
r"艹", r"肏", # fuck (in any context)
r"放屁",

    # Homophone variants of "fuck" written with 操 ("operation") and
    # 草 ("grass"). "肏" is the actual character; "艹" is a radical rather
    # than a standalone word, but it is used this way too.
r"操你", r"草你", r"日你", # fuck you
r"操他", r"草他", r"日他", # fuck his
r"操她", r"草她", r"日她", # fuck her

# Discrimination (racial slurs)
r"小日本", # little Japanese
r"台湾狗", # Taiwanese dogs
r"共产中国", # communist Chinese
r"流氓国家", # rogue country
r"人渣", # human slag
r"我去", # this is verbal and bad
r"鬼子"
]))

badwords = RegexMatches(name + ".badwords", badword_regexes, wrapping=None,
text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""

# Formatted from https://resources.allsetlearning.com/chinese/grammar/Formal_and_informal_function_words
informal_regexes = list(map(HanziConv.toSimplified, [
# Hello
r"你好", # nǐ hǎo; The standard "hello" greeting.
r"您好", # nín hǎo; The same "hello" greeting as above
r"你怎么样", # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

# Good afternoon
r"午安", # wǔ'an; note: seldom used in the Mainland.
r"下午好", # xìawǔ hǎo! Seldom used in the Republic of China

# Good evening / Good night
r"晚安", # wǎn'an; Literally "Peace at night", Good night.
r"晚上好", # wǎnshang hǎo; Good evening!

# Good-bye
r"再見", # zàijian; Literally "See you again".
r"明天見", # míngtian jiàn; Literally "See you tomorrow".
r"拜拜", # bāibāi/báibái; From English "Bye-Bye".
r"回頭見", # huítóujiàn: roughly equivalent to "see you soon"
r"回見", # huíjiàn; usually used in Beijing or written Chinese.
r"再會", # zàihuì: Literally "[we'll] hello again".
r"666+", r"233+", # No one knows why. But this belongs
]))

informals = RegexMatches(name + ".informals", informal_regexes, wrapping=None,
text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""

words_to_watch_regexes = list(map(HanziConv.toSimplified, [
# Advertising language
r"本台", # this channel
r"本公司", # this company
r"代刷", r"代练", r"代抢", # someone who plays games for you
r"强势回归", # "mightly" return
r"超值", # very cost-effective
r"一条龙", # a proverb? "one line of dragon"
r"一夜情", # selling one's body (advertising)
r"世界一流", r"国际一流", # world first-class
r"用户第一", r"用户满意", r"用户至上", # customer-first
r"核心价值", r"核心团队", r"核心宗旨", # core value
r"服务小姐", # service lady
r"服务范围", # service area
r"服务项目", # service items
r"服务理念", # service philosophy
]))

words_to_watch = RegexMatches(name + ".words_to_watch", words_to_watch_regexes,
wrapping=None,
text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
advertising language regexes.
"""
47 changes: 40 additions & 7 deletions revscoring/languages/features/regex_matches/datasources.py
@@ -1,14 +1,23 @@
from ....datasources import Datasource
from ....datasources.meta import extractors, frequencies, mappers
from ....dependencies import DependentSet


class Revision(DependentSet):

def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping,
text_preprocess):
super().__init__(name)
if text_preprocess is not None:
self.text = Datasource(
name + ".preprocessed_text",
depends_on=[wikitext_revision.text],
process=text_preprocess)
else:
self.text = wikitext_revision.text

self.matches = extractors.regex(
regexes, wikitext_revision.text,
regexes, self.text,
name=name + ".matches",
exclusions=exclusions,
wrapping=wrapping
@@ -22,29 +31,44 @@ def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
self.parent = Revision(name + ".parent", regexes,
wikitext_revision.parent,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)

if hasattr(wikitext_revision, 'diff'):
self.diff = Diff(name + ".diff", regexes,
wikitext_revision.diff, self,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)


class Diff(DependentSet):

def __init__(self, name, regexes, wikitext_diff,
revision, exclusions, wrapping):
revision, exclusions, wrapping, text_preprocess):
super().__init__(name)

if text_preprocess is not None:
segments_added = PreprocessedSegments(
name + ".preprocessed_segments_added",
wikitext_diff.segments_added,
text_preprocess)
            segments_removed = PreprocessedSegments(
name + ".preprocessed_segments_removed",
wikitext_diff.segments_removed,
text_preprocess)
else:
segments_added = wikitext_diff.segments_added
segments_removed = wikitext_diff.segments_removed

self.matches_added = extractors.regex(
regexes, wikitext_diff.segments_added,
regexes, segments_added,
name=name + ".matches_added",
exclusions=exclusions,
wrapping=wrapping
)
self.matches_removed = extractors.regex(
regexes, wikitext_diff.segments_removed,
regexes, segments_removed,
name=name + ".matches_removed",
exclusions=exclusions,
wrapping=wrapping
@@ -60,3 +84,12 @@ def __init__(self, name, regexes, wikitext_diff,
self.match_delta,
name=name + ".match_prop_delta"
)


class PreprocessedSegments(Datasource):

    def __init__(self, name, segments, text_preprocess):
        self.text_preprocess = text_preprocess
        # Register the raw segments datasource as a dependency so that
        # process() receives its value when this datasource is solved.
        super().__init__(name, self.process, depends_on=[segments])

    def process(self, segments):
        return [self.text_preprocess(segment) for segment in segments]
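
The diff-level datasources get the same treatment via PreprocessedSegments, so traditional-script additions should still trip the simplified regexes. A rough sketch, under the assumption that diff features can be solved from a parent/revision text cache as in the existing regex_matches tests; the sample strings are illustrative:

    from revscoring.datasources import revision_oriented
    from revscoring.dependencies import solve
    from revscoring.languages import chinese

    cache = {
        revision_oriented.revision.parent.text: "這是一個句子。",
        revision_oriented.revision.text: "這是一個句子。你這個白癡。",
    }
    # The added segment contains traditional "白癡"; it is folded to "白痴"
    # before the (also simplified) badword regexes run.
    solve(chinese.badwords.revision.diff.matches_added, cache=cache)
    # expected: 1 (one badword hit in the added text)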
7 changes: 4 additions & 3 deletions revscoring/languages/features/regex_matches/regex_matches.py
@@ -1,6 +1,6 @@
from . import datasources, features
from ....dependencies import DependentSet
from ....features import wikitext
from . import datasources, features


class RegexMatches(DependentSet):
@@ -17,7 +17,7 @@ class RegexMatches(DependentSet):
"""

def __init__(self, name, regexes, exclusions=None,
wrapping=(r'\b', r'\b')):
wrapping=(r'\b', r'\b'), text_preprocess=None):
super().__init__(name)
self._regexes = regexes
self._exclusions = exclusions
@@ -28,7 +28,8 @@ def __init__(self, name, regexes, exclusions=None,
name + ".revision", regexes,
wikitext.revision.datasources,
exclusions=exclusions,
wrapping=wrapping
wrapping=wrapping,
text_preprocess=text_preprocess
)
)
"""
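The new text_preprocess argument accepts any str-to-str callable, so it is not tied to hanziconv. A hypothetical sketch (the "english.shouting" name and the str.lower normaliser are invented for illustration):

    from revscoring.datasources import revision_oriented
    from revscoring.dependencies import solve
    from revscoring.languages.features import RegexMatches

    # Case-fold the text before matching so the regexes only need lower case.
    shouting = RegexMatches("english.shouting", [r"omg", r"wtf"],
                            text_preprocess=str.lower)

    cache = {revision_oriented.revision.text: "OMG, why?"}
    solve(shouting.revision.datasources.matches, cache=cache)
    # expected: ['omg']
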
19 changes: 19 additions & 0 deletions tests/languages/features/regex_matches/tests/test_regexes.py
@@ -67,3 +67,22 @@ def test_pickling():
badwords.revision.diff.matches_added)
assert (pickle.loads(pickle.dumps(badwords.revision.diff.matches_removed)) ==
badwords.revision.diff.matches_removed)


def replace_foo_bar(text):
return text.replace("foo", "bar")

my_barbaz = RegexMatches(
"my_barbaz",
[r"barbaz"],
text_preprocess=replace_foo_bar
)


def test_text_preprocessing():

cache = {r_text: "Hi foobaz. Derp"}
assert (solve(my_barbaz.revision.datasources.matches, cache=cache) ==
['barbaz'])
    assert (pickle.loads(pickle.dumps(my_barbaz.revision.matches)) ==
            my_barbaz.revision.matches)