-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Also adds pre-processing to regex matches so that Traditional Chinese text can be converted to Simplified Chinese before matching and the matches still work as expected.
- Loading branch information
Showing
8 changed files
with
303 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
from hanziconv import HanziConv | ||
|
||
from .features import RegexMatches | ||
|
||
name = "chinese" | ||
|
||
# Regex patterns for Chinese profanity, insults and slurs. | ||
# Longer phrases are listed before their shorter substrings (e.g. 王八蛋 | ||
# before 王八, 婊子 before 婊); presumably this matters if the patterns are | ||
# joined into a single alternation -- TODO confirm against RegexMatches. | ||
# NOTE(review): several patterns are written with Traditional characters | ||
# (他媽的, 去你媽, 渾蛋, 混帳, 白癡, 腦殘) while the feature built from this | ||
# list passes text_preprocess=HanziConv.toSimplified. If only the text (and | ||
# not the pattern) is converted, these entries may never match -- confirm. | ||
badword_regexes = [ | ||
r"王八蛋", # son of a bitch | ||
r"他媽的", # "his mother's" | ||
r"去你媽", # "to your mother" | ||
r"去你的", # "to yours" | ||
r"婊子", r"妓女", # prostitute | ||
r"狗日", # "dog days" (inappropriate) | ||
r"屁眼", r"混蛋", # asshole | ||
r"渾蛋", # zh-hant of previous | ||
r"混帳", # variant of above | ||
r"王八", # bitch | ||
r"白癡", # idiot | ||
r"腦殘", # brain dead | ||
r"智障", # mentally retarded | ||
r"婊", r"妓", # prostitute (single-character forms) | ||
r"屎", # shit | ||
r"妈逼", # (colloquial/spoken usage, but definitely bad) | ||
r"艹", r"肏", # fuck (in any context) | ||
# fart | ||
r"放屁", | ||
|
||
# Homophone variants of "fuck" written with 操 ("operation") or | ||
# 草 ("grass"); 肏 is the actual character. 艹 is not a standalone | ||
# character but it is used the same way. | ||
r"操你", r"草你", # fuck you | ||
r"操他", r"草他", # fuck his | ||
r"操她", r"草她", # fuck her | ||
|
||
# Discrimination (racial/national slurs) and other insults | ||
r"小日本", # little Japanese | ||
r"台湾狗", # Taiwanese dogs | ||
r"共产中国", # communist Chinese | ||
r"流氓国家", # rogue country | ||
r"人渣", # human slag (scum) | ||
r"我去", # this is colloquial/spoken and bad | ||
# devil, usually a suffix | ||
r"鬼子" | ||
] | ||
|
||
# Badword feature named name + ".badwords", built from badword_regexes. | ||
# text_preprocess=HanziConv.toSimplified is the pre-processing hook used so | ||
# that Traditional Chinese text is converted to Simplified before matching. | ||
# NOTE(review): wrapping=None presumably disables the default word-boundary | ||
# wrapping, since Chinese text is not space-delimited -- confirm. | ||
badwords = RegexMatches(name + ".badwords", badword_regexes, wrapping=None, | ||
text_preprocess=HanziConv.toSimplified) | ||
""" | ||
:class:`~revscoring.languages.features.RegexMatches` features via a list of | ||
badword detecting regexes. | ||
""" | ||
|
||
# Formatted from https://resources.allsetlearning.com/chinese/grammar/Formal_and_informal_function_words | ||
# Regex patterns for informal/conversational phrases (greetings, farewells). | ||
# NOTE(review): the good-bye entries are written with Traditional characters | ||
# (再見, 明天見, 回頭見, 回見, 再會) while the feature built from this list | ||
# passes text_preprocess=HanziConv.toSimplified. If only the text (and not | ||
# the pattern) is converted, these entries may never match -- confirm. | ||
informal_regexes = [ | ||
# Hello | ||
r"你好", # nǐ hǎo; The standard "hello" greeting. | ||
r"您好", # nín hǎo; Polite form of the "hello" greeting above. | ||
r"你怎么样", # nǐ zěnmeyàng?; "What's up?", "How are you doing?" | ||
|
||
# Good afternoon | ||
r"午安", # wǔ'ān; note: seldom used in the Mainland. | ||
r"下午好", # xiàwǔ hǎo! Seldom used in the Republic of China. | ||
|
||
# Good evening / Good night | ||
r"晚安", # wǎn'ān; Literally "Peace at night", Good night. | ||
r"晚上好", # wǎnshang hǎo; Good evening! | ||
|
||
# Good-bye | ||
r"再見", # zàijiàn; Literally "See you again". | ||
r"明天見", # míngtiān jiàn; Literally "See you tomorrow". | ||
r"拜拜", # bāibāi/báibái; From English "Bye-Bye". | ||
r"回頭見", # huítóujiàn; roughly equivalent to "see you soon". | ||
r"回見", # huíjiàn; usually used in Beijing or written Chinese. | ||
r"再會" # zàihuì; Literally "[we'll] meet again". | ||
] | ||
|
||
# Informal-language feature named name + ".informals", built from | ||
# informal_regexes with the same text_preprocess=HanziConv.toSimplified and | ||
# wrapping=None arguments as the badwords feature. | ||
informals = RegexMatches(name + ".informals", informal_regexes, wrapping=None, | ||
text_preprocess=HanziConv.toSimplified) | ||
""" | ||
:class:`~revscoring.languages.features.RegexMatches` features via a list of | ||
informal word detecting regexes. | ||
""" | ||
|
||
# Regex patterns for promotional / advertising phrasing. All entries are | ||
# written in Simplified Chinese. | ||
words_to_watch_regexes = [ | ||
# Advertising language | ||
r"本台", # this channel | ||
r"本公司", # this company | ||
r"代刷", r"代练", r"代抢", # someone who plays games for you | ||
r"强势回归", # "mighty" return (strong comeback) | ||
r"超值", # very cost-effective | ||
r"一条龙", # lit. "one dragon"; presumably "one-stop service" in ads | ||
r"一夜情", # "one-night stand"; selling one's body (advertising) | ||
r"世界一流", r"国际一流", # world-class / internationally first-class | ||
r"用户第一", r"用户满意", r"用户至上", # customer-first | ||
r"核心价值", r"核心团队", r"核心宗旨", # core values / team / mission | ||
r"服务小姐", # service lady | ||
r"服务范围", # service area | ||
r"服务项目", # service items | ||
r"服务理念", # service philosophy | ||
] | ||
|
||
# Words-to-watch feature named name + ".words_to_watch", built from | ||
# words_to_watch_regexes with the same text_preprocess=HanziConv.toSimplified | ||
# and wrapping=None arguments as the other features in this module. | ||
words_to_watch = RegexMatches(name + ".words_to_watch", words_to_watch_regexes, | ||
wrapping=None, | ||
text_preprocess=HanziConv.toSimplified) | ||
""" | ||
:class:`~revscoring.languages.features.RegexMatches` features via a list of | ||
advertising language regexes. | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import pickle | ||
|
||
from hanziconv import HanziConv | ||
from revscoring.datasources import revision_oriented | ||
from revscoring.languages import chinese | ||
|
||
from .util import compare_extraction | ||
|
||
# Sample strings passed to compare_extraction in test_badwords as the | ||
# inputs expected to match chinese.badwords. | ||
# NOTE(review): 王八蛋 appears to be written identically in Traditional and | ||
# Simplified script, so the first two entries are probably the same string | ||
# and may not actually exercise the script conversion -- confirm. | ||
BAD = [ | ||
HanziConv.toTraditional("王八蛋"), # son of a bitch | ||
HanziConv.toSimplified("王八蛋"), # son of a bitch | ||
"他媽的", # "his mother's" | ||
"去你媽", # "to your mother" | ||
"去你的", # "to yours" | ||
"婊子", "妓女", # prostitute | ||
"狗日", # "dog days" (inappropriate) | ||
"屁眼", "混蛋", "渾蛋", # asshole | ||
"混帳", # variant of above | ||
"王八", # bitch | ||
"白癡", # idiot | ||
"腦殘", # brain dead | ||
"智障", # mentally retarded | ||
"婊", "妓", # prostitute (single-character forms) | ||
"屎", # shit | ||
"妈逼", # (colloquial/spoken usage, but definitely bad) | ||
"艹", "肏", # fuck (in any context) | ||
"放屁", # fart | ||
|
||
# Homophone variants of "fuck" written with 操 ("operation") or | ||
# 草 ("grass"); 肏 is the actual character. 艹 is not a standalone | ||
# character but it is used the same way. | ||
"操你", "草你", # fuck you | ||
"操他", "草他", # fuck his | ||
"操她", "草她", # fuck her | ||
|
||
# Discrimination (racial/national slurs) and other insults | ||
"小日本", # little Japanese | ||
"台湾狗", # Taiwanese dogs | ||
"共产中国", # communist Chinese | ||
"流氓国家", # rogue country | ||
"人渣", # human slag (scum) | ||
"我去", # this is colloquial/spoken and bad | ||
"鬼子" # devil, usually a suffix | ||
] | ||
|
||
# Sample strings passed to compare_extraction in test_informals as the | ||
# inputs expected to match chinese.informals. | ||
INFORMAL = [ | ||
# Hello | ||
"你好", # nǐ hǎo; The standard "hello" greeting. | ||
"您好", # nín hǎo; Polite form of the "hello" greeting above. | ||
"你怎么样", # nǐ zěnmeyàng?; "What's up?", "How are you doing?" | ||
|
||
# Good afternoon | ||
"午安", # wǔ'ān; note: seldom used in the Mainland. | ||
"下午好", # xiàwǔ hǎo! Seldom used in the Republic of China. | ||
|
||
# Good evening / Good night | ||
"晚安", # wǎn'ān; Literally "Peace at night", Good night. | ||
"晚上好", # wǎnshang hǎo; Good evening! | ||
|
||
# Good-bye | ||
"再見", # zàijiàn; Literally "See you again". | ||
"明天見", # míngtiān jiàn; Literally "See you tomorrow". | ||
"拜拜", # bāibāi/báibái; From English "Bye-Bye". | ||
"回頭見", # huítóujiàn; roughly equivalent to "see you soon". | ||
"回見", # huíjiàn; usually used in Beijing or written Chinese. | ||
"再會" # zàihuì; Literally "[we'll] meet again". | ||
] | ||
|
||
# Sample strings passed to compare_extraction in test_words_to_watch as the | ||
# inputs expected to match chinese.words_to_watch. | ||
WORDS_TO_WATCH = [ | ||
# Advertising language | ||
"本台", # this channel | ||
"本公司", # this company | ||
"代刷", "代练", "代抢", # someone who plays games for you | ||
"强势回归", # "mighty" return (strong comeback) | ||
"超值", # very cost-effective | ||
"一条龙", # lit. "one dragon"; presumably "one-stop service" in ads | ||
"一夜情", # "one-night stand"; selling one's body (advertising) | ||
"世界一流", "国际一流", # world-class / internationally first-class | ||
"用户第一", "用户满意", "用户至上", # customer-first | ||
"核心价值", "核心团队", "核心宗旨", # core values / team / mission | ||
"服务小姐", # service lady | ||
"服务范围", # service area | ||
"服务项目", # service items | ||
"服务理念", # service philosophy | ||
] | ||
|
||
# Neutral encyclopedic paragraph (about the 2005 Atlantic hurricane season, | ||
# in mixed Traditional/Simplified script) passed as the last argument to | ||
# compare_extraction in every test below; presumably it must produce no | ||
# matches -- confirm against .util.compare_extraction. | ||
OTHER = [ | ||
"""2005年大西洋颶風季是有纪录以来最活跃的大西洋颶風季,至今仍保持着多项纪录。 | ||
全季对大范围地区造成毁灭性打击,共导致3,913人死亡,损失数额更创下新纪录,高达1592亿美元。 | ||
本季单大型飓风就有7场之多,其中5场在登陆时仍有大型飓风强度,分别是颶風丹尼斯、艾米莉、 | ||
卡特里娜、丽塔和威尔玛,大部分人员伤亡和财产损失都是这5场飓风引起。 | ||
墨西哥的金塔納羅奧州和尤卡坦州, | ||
以及美国的佛罗里达州和路易斯安那州都曾两度受大型飓风袭击;古巴、巴哈马、海地, | ||
美国的密西西比州和得克萨斯州,还有墨西哥的塔毛利帕斯州都曾直接受1场大型飓风冲击, | ||
还有至少1场在附近掠过。美國墨西哥灣沿岸地區是本季受灾最严重的所在, | ||
飓风卡特里娜产生高达10米的风暴潮,引发毁灭性洪灾,密西西比州沿海地区的大部分建筑物被毁, | ||
风暴之后又令新奥尔良防洪堤决口,整个城市因此受到重创。此外,飓风斯坦同溫帶氣旋共同影响, | ||
在中美洲多地引发致命的泥石流,其中又以危地马拉灾情最为严重。""" | ||
] | ||
|
||
# Shorthand for the revision text datasource. | ||
# NOTE(review): not referenced by any test visible in this file -- confirm | ||
# whether it is still needed. | ||
r_text = revision_oriented.revision.text | ||
|
||
|
||
# Checks the badwords match datasource against the BAD and OTHER samples | ||
# (compare_extraction comes from .util; presumably BAD items must be | ||
# extracted and OTHER must yield nothing -- confirm), then asserts that | ||
# chinese.badwords compares equal to its pickle round-trip copy. | ||
def test_badwords(): | ||
compare_extraction(chinese.badwords.revision.datasources.matches, | ||
BAD, OTHER) | ||
assert chinese.badwords == pickle.loads(pickle.dumps(chinese.badwords)) | ||
|
||
|
||
# Checks the informals match datasource against the INFORMAL and OTHER | ||
# samples (compare_extraction comes from .util; presumably INFORMAL items | ||
# must be extracted and OTHER must yield nothing -- confirm), then asserts | ||
# that chinese.informals compares equal to its pickle round-trip copy. | ||
def test_informals(): | ||
compare_extraction(chinese.informals.revision.datasources.matches, | ||
INFORMAL, OTHER) | ||
|
||
assert chinese.informals == pickle.loads(pickle.dumps(chinese.informals)) | ||
|
||
|
||
# Checks the words_to_watch match datasource against the WORDS_TO_WATCH and | ||
# OTHER samples (compare_extraction comes from .util; presumably | ||
# WORDS_TO_WATCH items must be extracted and OTHER must yield nothing -- | ||
# confirm), then asserts that chinese.words_to_watch compares equal to its | ||
# pickle round-trip copy. | ||
def test_words_to_watch(): | ||
compare_extraction(chinese.words_to_watch.revision.datasources.matches, | ||
WORDS_TO_WATCH, OTHER) | ||
|
||
assert chinese.words_to_watch == \ | ||
pickle.loads(pickle.dumps(chinese.words_to_watch)) |
Oops, something went wrong.