/
norwegian.py
121 lines (111 loc) · 2.46 KB
/
norwegian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from .features import Dictionary, RegexMatches, Stopwords
# Imported before the ``try`` on purpose: the ``except`` clause below names
# ``enchant.errors.DictNotFoundError``.  If the import lived inside the
# ``try`` and failed, evaluating that clause would raise NameError (``enchant``
# unbound) and hide the real ImportError from callers.
import enchant

# Prefix for the feature names built in this module, e.g.
# "norwegian.dictionary".
name = "norwegian"

try:
    dictionary = enchant.Dict("nb")
except enchant.errors.DictNotFoundError:
    # Re-raised as ImportError so a missing "nb" dictionary is reported the
    # same way as a missing package.
    raise ImportError("No enchant-compatible dictionary found for 'nb'. " +
                      "Consider installing 'myspell-nb'.")

# Rebind ``dictionary`` from the raw enchant object to the feature wrapper,
# handing over only the enchant dictionary's ``check`` callable.
dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
`enchant.Dict <https://github.com/rfk/pyenchant>`_ "nb". Provided by `myspell-nb`
"""
# Load the raw stopword set from NLTK.  A LookupError from NLTK (presumably a
# missing 'stopwords' corpus, as the message suggests) is re-raised as
# ImportError.
try:
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words('norwegian'))
except LookupError:
    raise ImportError(
        ("Could not load stopwords for {0}. "
         "You may need to install the nltk 'stopwords' "
         "corpora. See http://www.nltk.org/data.html").format(__name__)
    )

# Rebind ``stopwords`` from the plain set to the feature wrapper built on it.
stopwords = Stopwords(
    name + ".stopwords",
    stopwords,
)
"""
:class:`~revscoring.languages.features.Stopwords` features provided by
`nltk.corpus.stopwords <https://www.nltk.org/api/nltk.corpus.html>`_ "norwegian"
"""
# Patterns for offensive / vandalism-prone words: Norwegian terms plus a few
# English ones and digit-substituted spellings.  Entries are grouped by
# initial letter for readability only; the list order is unchanged.
badword_regexes = [
    # b
    r"b1tch", r"bitch", r"blabla", r"boobs", r"bullshit",
    r"bæsj", r"bæsje", r"bæsjen", r"bæsjer",
    # c, d
    r"cool", r"cunt",
    r"drit", r"dritt",
    # f
    r"fack", r"faen", r"fitta", r"fitte", r"fuck", r"fucka",
    # h
    r"homo", r"homoseksuell", r"homse", r"hore",
    # j, k
    r"jævla", r"jævlig",
    r"knull", r"knulle", r"kuk", r"kukk", r"kåt", r"kødd",
    # l, m, n
    r"ludder",
    r"mordi", r"motherfucker",
    r"niggah", r"nigger",
    # p
    r"p0rn", r"p3nis", r"p3n1s", r"pen1s", r"pikk", r"porn", r"pr0n",
    r"pule", r"pulte", r"pupper", r"pussy",
    # r, s, t
    r"rompa", r"rompe", r"ræva",
    r"stupid",
    r"teit", r"tissemann", r"tits", r"twat",
    # w
    r"wanker", r"weed", r"whore",
]
# Feature wrapper over the badword pattern list, named "<name>.badwords".
badwords = RegexMatches(
    name + ".badwords",
    badword_regexes,
)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""
# Patterns for informal, chat-style words: laughter, greetings and internet
# slang.  Grouped for readability only; the list order is unchanged.
informal_regexes = [
    # laughter and greetings
    r"haha", r"hallo", r"hehe", r"hei", r"heisann", r"hey", r"heya", r"hihi",
    # internet slang
    r"lmao", r"lol", r"omg", r"rofl",
    # casual affirmatives
    r"yea", r"yeah",
]
# Feature wrapper over the informal pattern list, named "<name>.informals".
informals = RegexMatches(
    name + ".informals",
    informal_regexes,
)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""