Skip to content

Commit

Permalink
Badwords and informals for zh
Browse files Browse the repository at this point in the history
Also adds pre-processing to regex matches so that traditional
Chinese can be converted to simplified and matches still work.
  • Loading branch information
halfak committed May 18, 2019
1 parent e469800 commit e36e0fc
Show file tree
Hide file tree
Showing 8 changed files with 303 additions and 13 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ deltas >= 0.4.6, < 0.4.999
docopt >= 0.6.2, < 0.6.999
flake8 >= 3.3.0, < 3.3.999
gensim >= 2.3.3, < 3.3.999
hanziconv >= 0.3.2, < 0.3.999
mmh3 >= 2.3.1, < 2.3.999
more-itertools == 2.2
mwapi >= 0.5.0, < 0.5.999
Expand Down
5 changes: 5 additions & 0 deletions revscoring/languages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@
.. automodule:: revscoring.languages.catalan
:members:
chinese
+++++++
.. automodule:: revscoring.languages.chinese
:members:
croatian
+++++
.. automodule:: revscoring.languages.croatian
Expand Down
106 changes: 106 additions & 0 deletions revscoring/languages/chinese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from hanziconv import HanziConv

from .features import RegexMatches

name = "chinese"

# NOTE: every regex below is matched against text that has first been folded
# from traditional to simplified characters (see `text_preprocess` on the
# RegexMatches below), so a simplified pattern also catches its traditional
# spelling.  Order matters: longer patterns (e.g. 王八蛋) appear before their
# prefixes (e.g. 王八) so the longer form wins in alternation.
badword_regexes = [
    r"王八蛋",  # son of a bitch
    r"他媽的",  # "his mother's"
    r"去你媽",  # "go to your mother" (insult)
    r"去你的",  # "to yours" / "screw you"
    r"婊子", r"妓女",  # prostitute
    r"狗日",  # "dog days" (inappropriate)
    r"屁眼", r"混蛋",  # asshole
    r"渾蛋",  # zh-hant of previous
    r"混帳",  # variant of above
    r"王八",  # bitch
    r"白癡",  # idiot
    r"腦殘",  # brain dead
    r"智障",  # mentally retarded
    r"婊", r"妓",  # prostitute
    r"屎",  # shit
    r"妈逼",  # (this is verbal but definitely bad)
    r"艹", r"肏",  # fuck (in any context)
    r"放屁",  # "fart"; also used as "nonsense!"

    # Variants (homonyms) of the use of "fuck" that use 操 ("operation") and
    # 草 ("grass"); "肏" is the actual character.  "艹" is not a real
    # character but it's used this way.
    r"操你", r"草你",  # fuck you
    r"操他", r"草他",  # fuck his
    r"操她", r"草她",  # fuck her

    # Discrimination (racial slurs)
    r"小日本",  # little Japanese
    r"台湾狗",  # Taiwanese dogs
    r"共产中国",  # communist China
    r"流氓国家",  # rogue country
    r"人渣",  # human slag
    r"我去",  # this is verbal and bad
    r"鬼子"  # devil (derogatory; usually a suffix)
]

# wrapping=None because \b word boundaries are meaningless in Chinese text
# (words are not separated by spaces); text_preprocess folds traditional
# characters to simplified before matching.
badwords = RegexMatches(name + ".badwords", badword_regexes, wrapping=None,
                        text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""

# Formatted from https://resources.allsetlearning.com/chinese/grammar/Formal_and_informal_function_words
informal_regexes = [
    # Hello
    r"你好",  # nǐ hǎo; The standard "hello" greeting.
    r"您好",  # nín hǎo; The same "hello" greeting as above (polite form)
    r"你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    r"午安",  # wǔ'an; note: seldom used in the Mainland.
    r"下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    r"晚安",  # wǎn'an; Literally "Peace at night", Good night.
    r"晚上好",  # wǎnshang hǎo; Good evening!

    # Good-bye
    r"再見",  # zàijian; Literally "See you again".
    r"明天見",  # míngtian jiàn; Literally "See you tomorrow".
    r"拜拜",  # bāibāi/báibái; From English "Bye-Bye".
    r"回頭見",  # huítóujiàn: roughly equivalent to "see you soon"
    r"回見",  # huíjiàn; usually used in Beijing or written Chinese.
    r"再會"  # zàihuì: Literally "[we'll] meet again".
]

# Same matching setup as `badwords`: no \b wrapping (Chinese has no word
# boundaries) and traditional->simplified folding before matching.
informals = RegexMatches(name + ".informals", informal_regexes, wrapping=None,
                         text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""

# Promotional / advertising phrases that are a red flag in encyclopedic text.
words_to_watch_regexes = [
    # Advertising language
    r"本台",  # this channel
    r"本公司",  # this company
    r"代刷", r"代练", r"代抢",  # someone who plays games for you (for pay)
    r"强势回归",  # "mighty" return
    r"超值",  # very cost-effective
    r"一条龙",  # "one-stop" (full-package) service; lit. "one dragon"
    r"一夜情",  # selling one's body (advertising)
    r"世界一流", r"国际一流",  # world first-class
    r"用户第一", r"用户满意", r"用户至上",  # customer-first
    r"核心价值", r"核心团队", r"核心宗旨",  # core value
    r"服务小姐",  # service lady
    r"服务范围",  # service area
    r"服务项目",  # service items
    r"服务理念",  # service philosophy
]

# Same matching setup as `badwords`: no \b wrapping and traditional->
# simplified folding before matching.
words_to_watch = RegexMatches(name + ".words_to_watch", words_to_watch_regexes,
                              wrapping=None,
                              text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
advertising language regexes.
"""
50 changes: 43 additions & 7 deletions revscoring/languages/features/regex_matches/datasources.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
from ....datasources import Datasource
from ....datasources.meta import extractors, frequencies, mappers
from ....dependencies import DependentSet


class Revision(DependentSet):

def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping,
text_preprocess):
super().__init__(name)

if text_preprocess is not None:
text = Datasource(
name + ".preprocessed_text",
depends_on=[wikitext_revision.text],
process=text_preprocess)
else:
text = wikitext_revision.text

self.matches = extractors.regex(
regexes, wikitext_revision.text,
regexes, text,
name=name + ".matches",
exclusions=exclusions,
wrapping=wrapping
Expand All @@ -22,29 +32,46 @@ def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
self.parent = Revision(name + ".parent", regexes,
wikitext_revision.parent,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)

if hasattr(wikitext_revision, 'diff'):
self.diff = Diff(name + ".diff", regexes,
wikitext_revision.diff, self,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)


class Diff(DependentSet):

def __init__(self, name, regexes, wikitext_diff,
revision, exclusions, wrapping):
revision, exclusions, wrapping, text_preprocess):
super().__init__(name)

if text_preprocess is not None:
    self.text_preprocess = text_preprocess
    segments_added = PreprocessedSegments(
        name + ".preprocessed_segments_added",
        wikitext_diff.segments_added,
        text_preprocess)
    # Bug fix: this was constructed as a bare `Datasource(name, segments,
    # text_preprocess)`, which passes the segments datasource as the
    # `process` argument and never applies preprocessing.  It must mirror
    # `segments_added` above.
    segments_removed = PreprocessedSegments(
        name + ".preprocessed_segments_removed",
        wikitext_diff.segments_removed,
        text_preprocess)
else:
    # No preprocessing requested -- match against the raw diff segments.
    self.text_preprocess = None
    segments_added = wikitext_diff.segments_added
    segments_removed = wikitext_diff.segments_removed

self.matches_added = extractors.regex(
regexes, wikitext_diff.segments_added,
regexes, segments_added,
name=name + ".matches_added",
exclusions=exclusions,
wrapping=wrapping
)
self.matches_removed = extractors.regex(
regexes, wikitext_diff.segments_removed,
regexes, segments_removed,
name=name + ".matches_removed",
exclusions=exclusions,
wrapping=wrapping
Expand All @@ -60,3 +87,12 @@ def __init__(self, name, regexes, wikitext_diff,
self.match_delta,
name=name + ".match_prop_delta"
)


class PreprocessedSegments(Datasource):
    """
    A :class:`~revscoring.datasources.Datasource` that applies a
    `text_preprocess` function (e.g. ``HanziConv.toSimplified``) to each
    text segment produced by an upstream segments datasource.

    :Parameters:
        name : `str`
            Name of the datasource
        segments : :class:`~revscoring.datasources.Datasource`
            A datasource that generates a list of text segments
        text_preprocess : `callable`
            A function applied to each segment before regex matching
    """

    def __init__(self, name, segments, text_preprocess):
        self.text_preprocess = text_preprocess
        # Register name, process function, and the dependency on `segments`
        # with the Datasource machinery.  Without this super().__init__()
        # call the datasource has no name and no dependencies, so the
        # dependency solver can never feed it the segments to preprocess.
        super().__init__(name, self.process, depends_on=[segments])

    def process(self, segments):
        """Return *segments* with the preprocessing function applied."""
        return [self.text_preprocess(segment) for segment in segments]
7 changes: 4 additions & 3 deletions revscoring/languages/features/regex_matches/regex_matches.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from . import datasources, features
from ....dependencies import DependentSet
from ....features import wikitext
from . import datasources, features


class RegexMatches(DependentSet):
Expand All @@ -17,7 +17,7 @@ class RegexMatches(DependentSet):
"""

def __init__(self, name, regexes, exclusions=None,
wrapping=(r'\b', r'\b')):
wrapping=(r'\b', r'\b'), text_preprocess=None):
super().__init__(name)
self._regexes = regexes
self._exclusions = exclusions
Expand All @@ -28,7 +28,8 @@ def __init__(self, name, regexes, exclusions=None,
name + ".revision", regexes,
wikitext.revision.datasources,
exclusions=exclusions,
wrapping=wrapping
wrapping=wrapping,
text_preprocess=text_preprocess
)
)
"""
Expand Down
19 changes: 19 additions & 0 deletions tests/languages/features/regex_matches/tests/test_regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,22 @@ def test_pickling():
badwords.revision.diff.matches_added)
assert (pickle.loads(pickle.dumps(badwords.revision.diff.matches_removed)) ==
badwords.revision.diff.matches_removed)


def replace_foo_bar(text):
    """Test preprocessor: rewrite every occurrence of 'foo' as 'bar'."""
    return "bar".join(text.split("foo"))

# A RegexMatches set whose preprocessor rewrites "foo" -> "bar", so the
# pattern r"barbaz" should match input text containing "foobaz".
my_barbaz = RegexMatches(
    "my_barbaz",
    [r"barbaz"],
    text_preprocess=replace_foo_bar
)


def test_text_preprocessing():
    """
    Preprocessing ("foo" -> "bar") should let r"barbaz" match "foobaz",
    and a RegexMatches configured with a text_preprocess function should
    survive a pickle round-trip.
    """
    cache = {r_text: "Hi foobaz. Derp"}
    assert (solve(my_barbaz.revision.datasources.matches, cache=cache) ==
            ['barbaz'])
    # Pickle the preprocessing-enabled feature, not `badwords` -- the
    # original assertion was copy-pasted from test_pickling above and
    # never exercised my_barbaz's text_preprocess through pickle.
    assert (pickle.loads(pickle.dumps(my_barbaz.revision.matches)) ==
            my_barbaz.revision.matches)
122 changes: 122 additions & 0 deletions tests/languages/test_chinese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import pickle

from hanziconv import HanziConv
from revscoring.datasources import revision_oriented
from revscoring.languages import chinese

from .util import compare_extraction

# Terms that chinese.badwords must extract.  The first two entries verify
# that traditional-character input is folded to simplified before matching
# (chinese.py sets text_preprocess=HanziConv.toSimplified).
BAD = [
    HanziConv.toTraditional("王八蛋"),  # son of a bitch (traditional form)
    HanziConv.toSimplified("王八蛋"),  # son of a bitch (simplified form)
    "他媽的",  # "his mother's"
    "去你媽",  # "go to your mother" (insult)
    "去你的",  # "to yours" / "screw you"
    "婊子", "妓女",  # prostitute
    "狗日",  # "dog days" (inappropriate)
    "屁眼", "混蛋", "渾蛋",  # asshole
    "混帳",  # variant of above
    "王八",  # bitch
    "白癡",  # idiot
    "腦殘",  # brain dead
    "智障",  # mentally retarded
    "婊", "妓",  # prostitute
    "屎",  # shit
    "妈逼",  # (this is verbal but definitely bad)
    "艹", "肏",  # fuck (in any context)
    "放屁",  # fart

    # Variants (homonyms) of the use of "fuck" that use 操 ("operation") and
    # 草 ("grass"); "肏" is the actual character.  "艹" is not a real
    # character but it's used this way.
    "操你", "草你",  # fuck you
    "操他", "草他",  # fuck his
    "操她", "草她",  # fuck her

    # Discrimination (racial slurs)
    "小日本",  # little Japanese
    "台湾狗",  # Taiwanese dogs
    "共产中国",  # communist China
    "流氓国家",  # rogue country
    "人渣",  # human slag
    "我去",  # this is verbal and bad
    "鬼子"  # devil (derogatory), usually a suffix
]

# Terms that chinese.informals must extract (casual greetings/farewells).
INFORMAL = [
    # Hello
    "你好",  # nǐ hǎo; The standard "hello" greeting.
    "您好",  # nín hǎo; The same "hello" greeting as above (polite form)
    "你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    "午安",  # wǔ'an; note: seldom used in the Mainland.
    "下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    "晚安",  # wǎn'an; Literally "Peace at night", Good night.
    "晚上好",  # wǎnshang hǎo; Good evening!

    # Good-bye
    "再見",  # zàijian; Literally "See you again".
    "明天見",  # míngtian jiàn; Literally "See you tomorrow".
    "拜拜",  # bāibāi/báibái; From English "Bye-Bye".
    "回頭見",  # huítóujiàn: roughly equivalent to "see you soon"
    "回見",  # huíjiàn; usually used in Beijing or written Chinese.
    "再會"  # zàihuì: Literally "[we'll] meet again".
]

# Terms that chinese.words_to_watch must extract (promotional language).
WORDS_TO_WATCH = [
    # Advertising language
    "本台",  # this channel
    "本公司",  # this company
    "代刷", "代练", "代抢",  # someone who plays games for you (for pay)
    "强势回归",  # "mighty" return
    "超值",  # very cost-effective
    "一条龙",  # "one-stop" (full-package) service; lit. "one dragon"
    "一夜情",  # selling one's body (advertising)
    "世界一流", "国际一流",  # world first-class
    "用户第一", "用户满意", "用户至上",  # customer-first
    "核心价值", "核心团队", "核心宗旨",  # core value
    "服务小姐",  # service lady
    "服务范围",  # service area
    "服务项目",  # service items
    "服务理念",  # service philosophy
]

# Benign encyclopedic text (an excerpt about the 2005 Atlantic hurricane
# season) used as the negative control: none of the regex sets above should
# match anything in it.
OTHER = [
    """2005年大西洋颶風季是有纪录以来最活跃的大西洋颶風季,至今仍保持着多项纪录。
全季对大范围地区造成毁灭性打击,共导致3,913人死亡,损失数额更创下新纪录,高达1592亿美元。
本季单大型飓风就有7场之多,其中5场在登陆时仍有大型飓风强度,分别是颶風丹尼斯、艾米莉、
卡特里娜、丽塔和威尔玛,大部分人员伤亡和财产损失都是这5场飓风引起。
墨西哥的金塔納羅奧州和尤卡坦州,
以及美国的佛罗里达州和路易斯安那州都曾两度受大型飓风袭击;古巴、巴哈马、海地,
美国的密西西比州和得克萨斯州,还有墨西哥的塔毛利帕斯州都曾直接受1场大型飓风冲击,
还有至少1场在附近掠过。美國墨西哥灣沿岸地區是本季受灾最严重的所在,
飓风卡特里娜产生高达10米的风暴潮,引发毁灭性洪灾,密西西比州沿海地区的大部分建筑物被毁,
风暴之后又令新奥尔良防洪堤决口,整个城市因此受到重创。此外,飓风斯坦同溫帶氣旋共同影响,
在中美洲多地引发致命的泥石流,其中又以危地马拉灾情最为严重。"""
]

# The revision-text datasource key -- presumably used by compare_extraction
# to seed solve() caches; verify against tests/languages/util.py.
r_text = revision_oriented.revision.text


def test_badwords():
    """BAD terms are extracted, OTHER text is not; badwords must pickle."""
    matches_datasource = chinese.badwords.revision.datasources.matches
    compare_extraction(matches_datasource, BAD, OTHER)
    round_tripped = pickle.loads(pickle.dumps(chinese.badwords))
    assert chinese.badwords == round_tripped


def test_informals():
    """INFORMAL terms are extracted, OTHER text is not; informals must pickle."""
    matches_datasource = chinese.informals.revision.datasources.matches
    compare_extraction(matches_datasource, INFORMAL, OTHER)

    round_tripped = pickle.loads(pickle.dumps(chinese.informals))
    assert chinese.informals == round_tripped


def test_words_to_watch():
    """WORDS_TO_WATCH terms are extracted, OTHER text is not; must pickle."""
    matches_datasource = chinese.words_to_watch.revision.datasources.matches
    compare_extraction(matches_datasource, WORDS_TO_WATCH, OTHER)

    round_tripped = pickle.loads(pickle.dumps(chinese.words_to_watch))
    assert chinese.words_to_watch == round_tripped
Loading

0 comments on commit e36e0fc

Please sign in to comment.