Skip to content

Commit

Permalink
Badwords and informals for zh
Browse files Browse the repository at this point in the history
Also adds pre-processing to regex matches so that traditional
Chinese can be converted to simplified and matches still work.
  • Loading branch information
halfak committed May 18, 2019
1 parent e469800 commit e36e0fc
Show file tree
Hide file tree
Showing 8 changed files with 303 additions and 13 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ deltas >= 0.4.6, < 0.4.999
docopt >= 0.6.2, < 0.6.999
flake8 >= 3.3.0, < 3.3.999
gensim >= 2.3.3, < 3.3.999
hanziconv >= 0.3.2, < 0.3.999
mmh3 >= 2.3.1, < 2.3.999
more-itertools == 2.2
mwapi >= 0.5.0, < 0.5.999
Expand Down
5 changes: 5 additions & 0 deletions revscoring/languages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@
.. automodule:: revscoring.languages.catalan
:members:
chinese
+++++++
.. automodule:: revscoring.languages.chinese
:members:
croatian
+++++
.. automodule:: revscoring.languages.croatian
Expand Down
106 changes: 106 additions & 0 deletions revscoring/languages/chinese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from hanziconv import HanziConv

from .features import RegexMatches

name = "chinese"

# NOTE: every regex below is matched against text that has first been folded
# from traditional to simplified characters (see `text_preprocess` on the
# RegexMatches below), so a simplified pattern also catches its traditional
# spelling.  Order matters: longer patterns (e.g. 王八蛋) appear before their
# prefixes (e.g. 王八) so the longer form wins in alternation.
badword_regexes = [
    r"王八蛋",  # son of a bitch
    r"他媽的",  # "his mother's"
    r"去你媽",  # "go to your mother" (insult)
    r"去你的",  # "to yours" / "screw you"
    r"婊子", r"妓女",  # prostitute
    r"狗日",  # "dog days" (inappropriate)
    r"屁眼", r"混蛋",  # asshole
    r"渾蛋",  # zh-hant of previous
    r"混帳",  # variant of above
    r"王八",  # bitch
    r"白癡",  # idiot
    r"腦殘",  # brain dead
    r"智障",  # mentally retarded
    r"婊", r"妓",  # prostitute
    r"屎",  # shit
    r"妈逼",  # (this is verbal but definitely bad)
    r"艹", r"肏",  # fuck (in any context)
    r"放屁",  # "fart"; also used as "nonsense!"

    # Variants (homonyms) of the use of "fuck" that use 操 ("operation") and
    # 草 ("grass"); "肏" is the actual character.  "艹" is not a real
    # character but it's used this way.
    r"操你", r"草你",  # fuck you
    r"操他", r"草他",  # fuck his
    r"操她", r"草她",  # fuck her

    # Discrimination (racial slurs)
    r"小日本",  # little Japanese
    r"台湾狗",  # Taiwanese dogs
    r"共产中国",  # communist China
    r"流氓国家",  # rogue country
    r"人渣",  # human slag
    r"我去",  # this is verbal and bad
    r"鬼子"  # devil (derogatory; usually a suffix)
]

# wrapping=None because \b word boundaries are meaningless in Chinese text
# (words are not separated by spaces); text_preprocess folds traditional
# characters to simplified before matching.
badwords = RegexMatches(name + ".badwords", badword_regexes, wrapping=None,
                        text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""

# Formatted from https://resources.allsetlearning.com/chinese/grammar/Formal_and_informal_function_words
informal_regexes = [
    # Hello
    r"你好",  # nǐ hǎo; The standard "hello" greeting.
    r"您好",  # nín hǎo; The same "hello" greeting as above (polite form)
    r"你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    r"午安",  # wǔ'an; note: seldom used in the Mainland.
    r"下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    r"晚安",  # wǎn'an; Literally "Peace at night", Good night.
    r"晚上好",  # wǎnshang hǎo; Good evening!

    # Good-bye
    r"再見",  # zàijian; Literally "See you again".
    r"明天見",  # míngtian jiàn; Literally "See you tomorrow".
    r"拜拜",  # bāibāi/báibái; From English "Bye-Bye".
    r"回頭見",  # huítóujiàn: roughly equivalent to "see you soon"
    r"回見",  # huíjiàn; usually used in Beijing or written Chinese.
    r"再會"  # zàihuì: Literally "[we'll] meet again".
]

# Same matching setup as `badwords`: no \b wrapping (Chinese has no word
# boundaries) and traditional->simplified folding before matching.
informals = RegexMatches(name + ".informals", informal_regexes, wrapping=None,
                         text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""

# Promotional / advertising phrases that are a red flag in encyclopedic text.
words_to_watch_regexes = [
    # Advertising language
    r"本台",  # this channel
    r"本公司",  # this company
    r"代刷", r"代练", r"代抢",  # someone who plays games for you (for pay)
    r"强势回归",  # "mighty" return
    r"超值",  # very cost-effective
    r"一条龙",  # "one-stop" (full-package) service; lit. "one dragon"
    r"一夜情",  # selling one's body (advertising)
    r"世界一流", r"国际一流",  # world first-class
    r"用户第一", r"用户满意", r"用户至上",  # customer-first
    r"核心价值", r"核心团队", r"核心宗旨",  # core value
    r"服务小姐",  # service lady
    r"服务范围",  # service area
    r"服务项目",  # service items
    r"服务理念",  # service philosophy
]

# Same matching setup as `badwords`: no \b wrapping and traditional->
# simplified folding before matching.
words_to_watch = RegexMatches(name + ".words_to_watch", words_to_watch_regexes,
                              wrapping=None,
                              text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
advertising language regexes.
"""
50 changes: 43 additions & 7 deletions revscoring/languages/features/regex_matches/datasources.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
from ....datasources import Datasource
from ....datasources.meta import extractors, frequencies, mappers
from ....dependencies import DependentSet


class Revision(DependentSet):

def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping,
text_preprocess):
super().__init__(name)

if text_preprocess is not None:
text = Datasource(
name + ".preprocessed_text",
depends_on=[wikitext_revision.text],
process=text_preprocess)
else:
text = wikitext_revision.text

self.matches = extractors.regex(
regexes, wikitext_revision.text,
regexes, text,
name=name + ".matches",
exclusions=exclusions,
wrapping=wrapping
Expand All @@ -22,29 +32,46 @@ def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
self.parent = Revision(name + ".parent", regexes,
wikitext_revision.parent,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)

if hasattr(wikitext_revision, 'diff'):
self.diff = Diff(name + ".diff", regexes,
wikitext_revision.diff, self,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)


class Diff(DependentSet):

def __init__(self, name, regexes, wikitext_diff,
revision, exclusions, wrapping):
revision, exclusions, wrapping, text_preprocess):
super().__init__(name)

if text_preprocess is not None:
    self.text_preprocess = text_preprocess
    segments_added = PreprocessedSegments(
        name + ".preprocessed_segments_added",
        wikitext_diff.segments_added,
        text_preprocess)
    # Bug fix: this was constructed as a bare `Datasource(name, segments,
    # text_preprocess)`, which passes the segments datasource as the
    # `process` argument and never applies preprocessing.  It must mirror
    # `segments_added` above.
    segments_removed = PreprocessedSegments(
        name + ".preprocessed_segments_removed",
        wikitext_diff.segments_removed,
        text_preprocess)
else:
    # No preprocessing requested -- match against the raw diff segments.
    self.text_preprocess = None
    segments_added = wikitext_diff.segments_added
    segments_removed = wikitext_diff.segments_removed

self.matches_added = extractors.regex(
regexes, wikitext_diff.segments_added,
regexes, segments_added,
name=name + ".matches_added",
exclusions=exclusions,
wrapping=wrapping
)
self.matches_removed = extractors.regex(
regexes, wikitext_diff.segments_removed,
regexes, segments_removed,
name=name + ".matches_removed",
exclusions=exclusions,
wrapping=wrapping
Expand All @@ -60,3 +87,12 @@ def __init__(self, name, regexes, wikitext_diff,
self.match_delta,
name=name + ".match_prop_delta"
)


class PreprocessedSegments(Datasource):
    """
    A :class:`~revscoring.datasources.Datasource` that applies a
    `text_preprocess` function (e.g. ``HanziConv.toSimplified``) to each
    text segment produced by an upstream segments datasource.

    :Parameters:
        name : `str`
            Name of the datasource
        segments : :class:`~revscoring.datasources.Datasource`
            A datasource that generates a list of text segments
        text_preprocess : `callable`
            A function applied to each segment before regex matching
    """

    def __init__(self, name, segments, text_preprocess):
        self.text_preprocess = text_preprocess
        # Register name, process function, and the dependency on `segments`
        # with the Datasource machinery.  Without this super().__init__()
        # call the datasource has no name and no dependencies, so the
        # dependency solver can never feed it the segments to preprocess.
        super().__init__(name, self.process, depends_on=[segments])

    def process(self, segments):
        """Return *segments* with the preprocessing function applied."""
        return [self.text_preprocess(segment) for segment in segments]
7 changes: 4 additions & 3 deletions revscoring/languages/features/regex_matches/regex_matches.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from . import datasources, features
from ....dependencies import DependentSet
from ....features import wikitext
from . import datasources, features


class RegexMatches(DependentSet):
Expand All @@ -17,7 +17,7 @@ class RegexMatches(DependentSet):
"""

def __init__(self, name, regexes, exclusions=None,
wrapping=(r'\b', r'\b')):
wrapping=(r'\b', r'\b'), text_preprocess=None):
super().__init__(name)
self._regexes = regexes
self._exclusions = exclusions
Expand All @@ -28,7 +28,8 @@ def __init__(self, name, regexes, exclusions=None,
name + ".revision", regexes,
wikitext.revision.datasources,
exclusions=exclusions,
wrapping=wrapping
wrapping=wrapping,
text_preprocess=text_preprocess
)
)
"""
Expand Down
19 changes: 19 additions & 0 deletions tests/languages/features/regex_matches/tests/test_regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,22 @@ def test_pickling():
badwords.revision.diff.matches_added)
assert (pickle.loads(pickle.dumps(badwords.revision.diff.matches_removed)) ==
badwords.revision.diff.matches_removed)


def replace_foo_bar(text):
    """Test preprocessor: rewrite every occurrence of 'foo' as 'bar'."""
    return "bar".join(text.split("foo"))

# A RegexMatches set whose preprocessor rewrites "foo" -> "bar", so the
# pattern r"barbaz" should match input text containing "foobaz".
my_barbaz = RegexMatches(
    "my_barbaz",
    [r"barbaz"],
    text_preprocess=replace_foo_bar
)


def test_text_preprocessing():
    """
    Preprocessing ("foo" -> "bar") should let r"barbaz" match "foobaz",
    and a RegexMatches configured with a text_preprocess function should
    survive a pickle round-trip.
    """
    cache = {r_text: "Hi foobaz. Derp"}
    assert (solve(my_barbaz.revision.datasources.matches, cache=cache) ==
            ['barbaz'])
    # Pickle the preprocessing-enabled feature, not `badwords` -- the
    # original assertion was copy-pasted from test_pickling above and
    # never exercised my_barbaz's text_preprocess through pickle.
    assert (pickle.loads(pickle.dumps(my_barbaz.revision.matches)) ==
            my_barbaz.revision.matches)
122 changes: 122 additions & 0 deletions tests/languages/test_chinese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import pickle

from hanziconv import HanziConv
from revscoring.datasources import revision_oriented
from revscoring.languages import chinese

from .util import compare_extraction

# Terms that chinese.badwords must extract.  The first two entries verify
# that traditional-character input is folded to simplified before matching
# (chinese.py sets text_preprocess=HanziConv.toSimplified).
BAD = [
    HanziConv.toTraditional("王八蛋"),  # son of a bitch (traditional form)
    HanziConv.toSimplified("王八蛋"),  # son of a bitch (simplified form)
    "他媽的",  # "his mother's"
    "去你媽",  # "go to your mother" (insult)
    "去你的",  # "to yours" / "screw you"
    "婊子", "妓女",  # prostitute
    "狗日",  # "dog days" (inappropriate)
    "屁眼", "混蛋", "渾蛋",  # asshole
    "混帳",  # variant of above
    "王八",  # bitch
    "白癡",  # idiot
    "腦殘",  # brain dead
    "智障",  # mentally retarded
    "婊", "妓",  # prostitute
    "屎",  # shit
    "妈逼",  # (this is verbal but definitely bad)
    "艹", "肏",  # fuck (in any context)
    "放屁",  # fart

    # Variants (homonyms) of the use of "fuck" that use 操 ("operation") and
    # 草 ("grass"); "肏" is the actual character.  "艹" is not a real
    # character but it's used this way.
    "操你", "草你",  # fuck you
    "操他", "草他",  # fuck his
    "操她", "草她",  # fuck her

    # Discrimination (racial slurs)
    "小日本",  # little Japanese
    "台湾狗",  # Taiwanese dogs
    "共产中国",  # communist China
    "流氓国家",  # rogue country
    "人渣",  # human slag
    "我去",  # this is verbal and bad
    "鬼子"  # devil (derogatory), usually a suffix
]

# Terms that chinese.informals must extract (casual greetings/farewells).
INFORMAL = [
    # Hello
    "你好",  # nǐ hǎo; The standard "hello" greeting.
    "您好",  # nín hǎo; The same "hello" greeting as above (polite form)
    "你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    "午安",  # wǔ'an; note: seldom used in the Mainland.
    "下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    "晚安",  # wǎn'an; Literally "Peace at night", Good night.
    "晚上好",  # wǎnshang hǎo; Good evening!

    # Good-bye
    "再見",  # zàijian; Literally "See you again".
    "明天見",  # míngtian jiàn; Literally "See you tomorrow".
    "拜拜",  # bāibāi/báibái; From English "Bye-Bye".
    "回頭見",  # huítóujiàn: roughly equivalent to "see you soon"
    "回見",  # huíjiàn; usually used in Beijing or written Chinese.
    "再會"  # zàihuì: Literally "[we'll] meet again".
]

# Terms that chinese.words_to_watch must extract (promotional language).
WORDS_TO_WATCH = [
    # Advertising language
    "本台",  # this channel
    "本公司",  # this company
    "代刷", "代练", "代抢",  # someone who plays games for you (for pay)
    "强势回归",  # "mighty" return
    "超值",  # very cost-effective
    "一条龙",  # "one-stop" (full-package) service; lit. "one dragon"
    "一夜情",  # selling one's body (advertising)
    "世界一流", "国际一流",  # world first-class
    "用户第一", "用户满意", "用户至上",  # customer-first
    "核心价值", "核心团队", "核心宗旨",  # core value
    "服务小姐",  # service lady
    "服务范围",  # service area
    "服务项目",  # service items
    "服务理念",  # service philosophy
]

# Benign encyclopedic text (an excerpt about the 2005 Atlantic hurricane
# season) used as the negative control: none of the regex sets above should
# match anything in it.
OTHER = [
    """2005年大西洋颶風季是有纪录以来最活跃的大西洋颶風季,至今仍保持着多项纪录。
全季对大范围地区造成毁灭性打击,共导致3,913人死亡,损失数额更创下新纪录,高达1592亿美元。
本季单大型飓风就有7场之多,其中5场在登陆时仍有大型飓风强度,分别是颶風丹尼斯、艾米莉、
卡特里娜、丽塔和威尔玛,大部分人员伤亡和财产损失都是这5场飓风引起。
墨西哥的金塔納羅奧州和尤卡坦州,
以及美国的佛罗里达州和路易斯安那州都曾两度受大型飓风袭击;古巴、巴哈马、海地,
美国的密西西比州和得克萨斯州,还有墨西哥的塔毛利帕斯州都曾直接受1场大型飓风冲击,
还有至少1场在附近掠过。美國墨西哥灣沿岸地區是本季受灾最严重的所在,
飓风卡特里娜产生高达10米的风暴潮,引发毁灭性洪灾,密西西比州沿海地区的大部分建筑物被毁,
风暴之后又令新奥尔良防洪堤决口,整个城市因此受到重创。此外,飓风斯坦同溫帶氣旋共同影响,
在中美洲多地引发致命的泥石流,其中又以危地马拉灾情最为严重。"""
]

# The revision-text datasource key -- presumably used by compare_extraction
# to seed solve() caches; verify against tests/languages/util.py.
r_text = revision_oriented.revision.text


def test_badwords():
    """BAD terms are extracted, OTHER text is not; badwords must pickle."""
    matches_datasource = chinese.badwords.revision.datasources.matches
    compare_extraction(matches_datasource, BAD, OTHER)
    round_tripped = pickle.loads(pickle.dumps(chinese.badwords))
    assert chinese.badwords == round_tripped


def test_informals():
    """INFORMAL terms are extracted, OTHER text is not; informals must pickle."""
    matches_datasource = chinese.informals.revision.datasources.matches
    compare_extraction(matches_datasource, INFORMAL, OTHER)

    round_tripped = pickle.loads(pickle.dumps(chinese.informals))
    assert chinese.informals == round_tripped


def test_words_to_watch():
    """WORDS_TO_WATCH terms are extracted, OTHER text is not; must pickle."""
    matches_datasource = chinese.words_to_watch.revision.datasources.matches
    compare_extraction(matches_datasource, WORDS_TO_WATCH, OTHER)

    round_tripped = pickle.loads(pickle.dumps(chinese.words_to_watch))
    assert chinese.words_to_watch == round_tripped
Loading

0 comments on commit e36e0fc

Please sign in to comment.