Badwords, informals, and words_to_watch for zh
Also adds pre-processing to regex matches so that traditional Chinese can be
converted to simplified Chinese and the simplified-script regexes still match.
halfak committed May 18, 2019
1 parent e469800 commit ec65506
Showing 8 changed files with 328 additions and 21 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,6 +3,7 @@ deltas >= 0.4.6, < 0.4.999
docopt >= 0.6.2, < 0.6.999
flake8 >= 3.3.0, < 3.3.999
gensim >= 2.3.3, < 3.3.999
hanziconv >= 0.3.2, < 0.3.999
mmh3 >= 2.3.1, < 2.3.999
more-itertools == 2.2
mwapi >= 0.5.0, < 0.5.999
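The new hanziconv dependency supplies the traditional-to-simplified conversion that the rest of this commit builds on. A minimal illustrative sketch (the sample strings are not from the codebase):

    from hanziconv import HanziConv

    # Fold traditional-script text into simplified script before matching.
    HanziConv.toSimplified("他媽的")   # -> '他妈的'
    HanziConv.toTraditional("他妈的")  # -> '他媽的'
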
5 changes: 5 additions & 0 deletions revscoring/languages/__init__.py
@@ -36,6 +36,11 @@
.. automodule:: revscoring.languages.catalan
:members:
chinese
+++++++
.. automodule:: revscoring.languages.chinese
:members:
croatian
++++++++
.. automodule:: revscoring.languages.croatian
108 changes: 108 additions & 0 deletions revscoring/languages/chinese.py
@@ -0,0 +1,108 @@
from hanziconv import HanziConv

from .features import RegexMatches

name = "chinese"

badword_regexes = list(map(HanziConv.toSimplified, [
r"王八蛋", # son of a bitch
r"他媽的", # "his mother's"
r"去你媽", # "to your mother"
r"去你的", # "to yours"
r"婊子", r"妓女", # prostitute
r"日了?狗", # lonely dog
r"屁眼", r"混蛋", # asshole
r"渾蛋", # zh-hant of previous
r"混帳", # variant of above
r"王八", # bitch
r"白癡", # idiot
r"腦殘", # brain dead
r"智障", # mentally retarded
r"婊", r"妓", # prostitute
r"屎", # shit
r"屌", # dick
r"妈逼", # (this is verbal but definitely bad)
r"艹", r"肏", # fuck (in any context)
r"放屁",

    # Homophone variants of "fuck" written with 操 ("operation") and
    # 草 ("grass"). "肏" is the actual character; "艹" is a radical rather
    # than a standalone word, but it is used this way too.
r"操你", r"草你", r"日你", # fuck you
r"操他", r"草他", r"日他", # fuck his
r"操她", r"草她", r"日她", # fuck her

# Discrimination (racial slurs)
r"小日本", # little Japanese
r"台湾狗", # Taiwanese dogs
r"共产中国", # communist Chinese
r"流氓国家", # rogue country
r"人渣", # human slag
r"我去", # this is verbal and bad
r"鬼子"
]))

badwords = RegexMatches(name + ".badwords", badword_regexes, wrapping=None,
text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""

# Formatted from https://resources.allsetlearning.com/chinese/grammar/Formal_and_informal_function_words
informal_regexes = list(map(HanziConv.toSimplified, [
# Hello
r"你好", # nǐ hǎo; The standard "hello" greeting.
r"您好", # nín hǎo; The same "hello" greeting as above
r"你怎么样", # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

# Good afternoon
r"午安", # wǔ'an; note: seldom used in the Mainland.
r"下午好", # xìawǔ hǎo! Seldom used in the Republic of China

# Good evening / Good night
r"晚安", # wǎn'an; Literally "Peace at night", Good night.
r"晚上好", # wǎnshang hǎo; Good evening!

# Good-bye
r"再見", # zàijian; Literally "See you again".
r"明天見", # míngtian jiàn; Literally "See you tomorrow".
r"拜拜", # bāibāi/báibái; From English "Bye-Bye".
r"回頭見", # huítóujiàn: roughly equivalent to "see you soon"
r"回見", # huíjiàn; usually used in Beijing or written Chinese.
r"再會", # zàihuì: Literally "[we'll] hello again".
r"666+", r"233+", # No one knows why. But this belongs
]))

informals = RegexMatches(name + ".informals", informal_regexes, wrapping=None,
text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""

words_to_watch_regexes = list(map(HanziConv.toSimplified, [
# Advertising language
r"本台", # this channel
r"本公司", # this company
r"代刷", r"代练", r"代抢", # someone who plays games for you
r"强势回归", # "mightly" return
r"超值", # very cost-effective
r"一条龙", # a proverb? "one line of dragon"
r"一夜情", # selling one's body (advertising)
r"世界一流", r"国际一流", # world first-class
r"用户第一", r"用户满意", r"用户至上", # customer-first
r"核心价值", r"核心团队", r"核心宗旨", # core value
r"服务小姐", # service lady
r"服务范围", # service area
r"服务项目", # service items
r"服务理念", # service philosophy
]))

words_to_watch = RegexMatches(name + ".words_to_watch", words_to_watch_regexes,
wrapping=None,
text_preprocess=HanziConv.toSimplified)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
advertising language regexes.
"""
47 changes: 40 additions & 7 deletions revscoring/languages/features/regex_matches/datasources.py
@@ -1,14 +1,23 @@
from ....datasources import Datasource
from ....datasources.meta import extractors, frequencies, mappers
from ....dependencies import DependentSet


class Revision(DependentSet):

def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping,
text_preprocess):
super().__init__(name)
if text_preprocess is not None:
self.text = Datasource(
name + ".preprocessed_text",
depends_on=[wikitext_revision.text],
process=text_preprocess)
else:
self.text = wikitext_revision.text

self.matches = extractors.regex(
regexes, wikitext_revision.text,
regexes, self.text,
name=name + ".matches",
exclusions=exclusions,
wrapping=wrapping
@@ -22,29 +31,44 @@ def __init__(self, name, regexes, wikitext_revision, exclusions, wrapping):
self.parent = Revision(name + ".parent", regexes,
wikitext_revision.parent,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)

if hasattr(wikitext_revision, 'diff'):
self.diff = Diff(name + ".diff", regexes,
wikitext_revision.diff, self,
exclusions=exclusions,
wrapping=wrapping)
wrapping=wrapping,
text_preprocess=text_preprocess)


class Diff(DependentSet):

def __init__(self, name, regexes, wikitext_diff,
revision, exclusions, wrapping):
revision, exclusions, wrapping, text_preprocess):
super().__init__(name)

if text_preprocess is not None:
segments_added = PreprocessedSegments(
name + ".preprocessed_segments_added",
wikitext_diff.segments_added,
text_preprocess)
            segments_removed = PreprocessedSegments(
name + ".preprocessed_segments_removed",
wikitext_diff.segments_removed,
text_preprocess)
else:
segments_added = wikitext_diff.segments_added
segments_removed = wikitext_diff.segments_removed

self.matches_added = extractors.regex(
regexes, wikitext_diff.segments_added,
regexes, segments_added,
name=name + ".matches_added",
exclusions=exclusions,
wrapping=wrapping
)
self.matches_removed = extractors.regex(
regexes, wikitext_diff.segments_removed,
regexes, segments_removed,
name=name + ".matches_removed",
exclusions=exclusions,
wrapping=wrapping
@@ -60,3 +84,12 @@ def __init__(self, name, regexes, wikitext_diff,
self.match_delta,
name=name + ".match_prop_delta"
)


class PreprocessedSegments(Datasource):

    def __init__(self, name, segments, text_preprocess):
        self.text_preprocess = text_preprocess
        # Register the raw segments datasource as a dependency so that
        # process() receives its value when this datasource is solved.
        super().__init__(name, self.process, depends_on=[segments])

    def process(self, segments):
        return [self.text_preprocess(segment) for segment in segments]
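
The diff-level datasources get the same treatment via PreprocessedSegments, so traditional-script additions should still trip the simplified regexes. A rough sketch, under the assumption that diff features can be solved from a parent/revision text cache as in the existing regex_matches tests; the sample strings are illustrative:

    from revscoring.datasources import revision_oriented
    from revscoring.dependencies import solve
    from revscoring.languages import chinese

    cache = {
        revision_oriented.revision.parent.text: "這是一個句子。",
        revision_oriented.revision.text: "這是一個句子。你這個白癡。",
    }
    # The added segment contains traditional "白癡"; it is folded to "白痴"
    # before the (also simplified) badword regexes run.
    solve(chinese.badwords.revision.diff.matches_added, cache=cache)
    # expected: 1 (one badword hit in the added text)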
7 changes: 4 additions & 3 deletions revscoring/languages/features/regex_matches/regex_matches.py
@@ -1,6 +1,6 @@
from . import datasources, features
from ....dependencies import DependentSet
from ....features import wikitext
from . import datasources, features


class RegexMatches(DependentSet):
@@ -17,7 +17,7 @@ class RegexMatches(DependentSet):
"""

def __init__(self, name, regexes, exclusions=None,
wrapping=(r'\b', r'\b')):
wrapping=(r'\b', r'\b'), text_preprocess=None):
super().__init__(name)
self._regexes = regexes
self._exclusions = exclusions
@@ -28,7 +28,8 @@ def __init__(self, name, regexes, exclusions=None,
name + ".revision", regexes,
wikitext.revision.datasources,
exclusions=exclusions,
wrapping=wrapping
wrapping=wrapping,
text_preprocess=text_preprocess
)
)
"""
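The new text_preprocess argument accepts any str-to-str callable, so it is not tied to hanziconv. A hypothetical sketch (the "english.shouting" name and the str.lower normaliser are invented for illustration):

    from revscoring.datasources import revision_oriented
    from revscoring.dependencies import solve
    from revscoring.languages.features import RegexMatches

    # Case-fold the text before matching so the regexes only need lower case.
    shouting = RegexMatches("english.shouting", [r"omg", r"wtf"],
                            text_preprocess=str.lower)

    cache = {revision_oriented.revision.text: "OMG, why?"}
    solve(shouting.revision.datasources.matches, cache=cache)
    # expected: ['omg']
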
19 changes: 19 additions & 0 deletions tests/languages/features/regex_matches/tests/test_regexes.py
@@ -67,3 +67,22 @@ def test_pickling():
badwords.revision.diff.matches_added)
assert (pickle.loads(pickle.dumps(badwords.revision.diff.matches_removed)) ==
badwords.revision.diff.matches_removed)


def replace_foo_bar(text):
return text.replace("foo", "bar")

my_barbaz = RegexMatches(
"my_barbaz",
[r"barbaz"],
text_preprocess=replace_foo_bar
)


def test_text_preprocessing():

cache = {r_text: "Hi foobaz. Derp"}
assert (solve(my_barbaz.revision.datasources.matches, cache=cache) ==
['barbaz'])
    assert (pickle.loads(pickle.dumps(my_barbaz.revision.matches)) ==
            my_barbaz.revision.matches)