# Import Needed Modules
import os, sys
import pattern_recognition as pat_r
# Define folder paths
dir_path = os.path.join(sys.path[0])
files_path = os.path.join(dir_path, 'files')
lexicon_path = os.path.join(files_path, 'lexicon')
prefix_path = os.path.join(lexicon_path, 'prefix')
suffix_path = os.path.join(lexicon_path, 'suffix')
characters = ['ا', 'أ', 'آ', 'إ', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز',
'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن',
'ه', 'و', 'ي', 'ء', 'ؤ', 'ئ', 'ة']
def normalize(word):
"""
Removing diacritics in g given word (except the shaddah ' ّ' the letter that contains it will be duplicated)
trasfoming the character 'إ'and 'آ' to 'ا'
e.g. "كُتَّاب" --> "كتتاب"
"""
n_word = ""
for i in range(len(word)):
k = word[i]
if word[0] in ['إ', 'آ'] and n_word == "":
n_word += 'ا'
elif k == 'ّ': n_word += word[i-1] # shaddah ' ّ'
elif k in characters: n_word += k
elif k == 'ى': n_word += 'ي'
else: continue
return n_word
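# Behaviour of normalize, traced from the rules above:
#   normalize("كُتَّاب")  ->  "كتتاب"   (diacritics dropped, the shaddah duplicates the 'ت')
#   normalize("إسلام")   ->  "اسلام"   (an initial 'إ' becomes 'ا')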
def ch_func(path, ch):
"""
creating a list of each in a text file of a specified name
- path - the path of the folder that contians the file
- ch - the character name that we want to create a list of (each file are named as the character's name)
"""
with open(os.path.join(path, ch + '.txt'), "r", encoding = "utf-8") as f:
d = f.readlines()
ch_list = []
for t in d:
        ch_list.append(t.rstrip('\n'))  # strip the trailing newline rather than blindly dropping the last character
return ch_list
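# Example usage (assuming the lexicon files exist on disk, as in this repo's files/ folder):
#   ch_func(prefix_path, 'ال')  reads files/lexicon/prefix/ال.txt and returns its lines as a list.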
def known_root(x, lexicon_list):
    """
    Return the longest entry of lexicon_list that occurs as a substring of x, or False if none does.
    """
    if len(x) != 0: # and x[-1] != 'ة'
root_list = []
for l in lexicon_list:
if l in x:
root_list.append(l)
if len(root_list) != 0: return max(root_list, key=len)
return False
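# Sketch of known_root with hypothetical lexicon entries (for illustration only):
#   known_root("والكتاب", ["تاب", "كتاب"])  ->  "كتاب"   (the longest entry contained in the word)
#   known_root("درس", ["كتب"])              ->  False    (no entry is contained in the word)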
def suff_finder(word):
    """
    Find the longest ending of the word that appears in suff_comb; return it only if the
    remaining stem would be longer than 2 characters, otherwise return ''.
    """
    y, suf = '', ''
for i in word[::-1]:
y = i + y
if y in suff_comb:
suf = y
if len(word) - len(suf) > 2:
return suf
else: return ''
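# Examples of suff_finder, traced from the suff_comb list defined below:
#   suff_finder("مدرسات")  ->  "ات"   (the stem "مدرس" is longer than 2 characters)
#   suff_finder("بيت")     ->  ""     ("ت" is a known suffix, but the stem would be too short)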
def check_lexicon(word):
    """
    Look the word up against the special-words list and the prefix/suffix lexicon files.
    Return the longest root found, False if nothing matches, or None for an empty word.
    """
    if len(word) != 0:
root_list = []
r = known_root(word, specialwords_list)
if r and len(r) > (len(word) - len(r)):
root_list.append(r)
elif word[0:2] in prefix_ch:
ch_list = ch_func(prefix_path, word[0:2])
r = known_root(word, ch_list)
if r and len(r) > (len(word) - len(r)):
root_list.append(r)
elif word[0] in prefix_ch:
ch_list = ch_func(prefix_path, word[0])
r = known_root(word, ch_list)
if r and len(r) > (len(word) - len(r)):
root_list.append(r)
suff = suff_finder(word)
if len(suff) != 0:
if len(suff) >= 2:
if suff[0:2] in suffix_ch:
ch_list = ch_func(suffix_path, suff[0:2])
r = known_root(word, ch_list)
if r and len(r) > (len(word) - len(r)):
root_list.append(r)
elif suff[0] in suffix_ch:
ch_list = ch_func(suffix_path, suff[0])
r = known_root(word, ch_list)
if r and len(r) > (len(word) - len(r)):
root_list.append(r)
else:
ch_list = ch_func(suffix_path, suff[0])
r = known_root(word, ch_list)
if r and len(r) > (len(word) - len(r)):
root_list.append(r)
if len(root_list) != 0: return max(root_list, key=len)
return False
else : return None
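# check_lexicon reads the lexicon files under files/lexicon/ at call time, so concrete results depend
# on their contents: check_lexicon("") returns None, while a word whose root is listed in the file for
# its first prefix letters (or its suffix letters) would return that root.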
specialwords_list = ch_func(files_path, 'special_words')
prefix_ch = ['س', 'ال', 'أ', 'ل', 'ب', 'ك', 'ف', 'و']
suffix_ch = ['ين', 'ان', 'و', 'ه', 'ك', 'ا', 'ي', 'ن', 'ت', 'ات', 'ون', 'وا', 'تم', 'هم', 'كم', 'ة', 'ء', 'اء']
suff_comb = ['يا', 'ت', 'ة', 'ك', 'تم', 'هم', 'ي', 'اء', 'ان', 'هما'
, 'كم', 'وها', 'ا', 'ه', 'ين', 'يه', 'ون', 'ها', 'وا', 'ء'
, 'ات', 'ية', 'نا', 'تموها', 'تن', 'هنن', 'ني', 'اتي', 'تي', 'هن']
# Create a list for each prefix from its lexicon file
sen_list = ch_func(prefix_path, 'س')
al_list = ch_func(prefix_path, 'ال')
hamza_list = ch_func(prefix_path, 'أ')
lam_list = ch_func(prefix_path, 'ل')
baa_list = ch_func(prefix_path, 'ب')
kaph_list = ch_func(prefix_path, 'ك')
faa_list = ch_func(prefix_path, 'ف')
waw_list = ch_func(prefix_path, 'و')
# Define a function for each prefix to check whether it really is a prefix, based on the rules we defined, and remove it if so
def sen_pref(word, p):
"""
    Check whether the sen 'س' is a prefix or not, and remove it if it is.
"""
if len(word) > 3:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif len(pat) != 0:
p = pat
if word[1] in ['أ', 'ي', 'ت', 'ن']:
return word[2:], p
else: return word, p
else: return word, p
def al_pref(word, p):
"""
    Check whether the al 'ال' is a prefix or not, and remove it if it is.
"""
if len(word) > 4:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif check_lexicon(word[1:]):
p.append(check_lexicon(word[1:]))
return '', p
elif len(pat) != 0:
p = pat
            # both branches of the original condition stripped the leading 'ال', so return the word without it
            return word[2:], p
else: return word, p
def hamza_pref(word, p):
"""
    Check whether the hamza 'أ' is a prefix or not, and remove it if it is.
"""
if len(word) > 3:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif len(pat) != 0:
p = pat
if word[1] == 'أ':
return word[1:], p
else: return 'ا' + word[1:], p
else: return word, p
def lam_pref(word, p):
"""
    Check whether the lam 'ل' is a prefix or not, and remove it if it is.
"""
if len(word) > 3:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif len(pat) != 0:
p = pat
if word[1] in ['أ', 'ي', 'ت', 'ن']:
return word[2:], p
else: return word[1:], p
else: return word[2:], p
def baa_pref(word, p):
"""
    Check whether the baa 'ب' is a prefix or not, and remove it if it is.
"""
if len(word) > 3:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif len(pat) != 0:
p = pat
if word[1:3] == 'ال':
return al_pref(word[1:], p)
else: return word[1:], p
else: return word, p
def kaph_pref(word, p):
"""
    Check whether the kaph 'ك' is a prefix or not, and remove it if it is.
"""
if len(word) > 3:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif len(pat) != 0:
p = pat
if word[1:3] == 'ال':
return al_pref(word[1:], p)
else: return word, p
else: return word, p
def faa_pref(word, p):
"""
    Check whether the faa 'ف' is a prefix or not, and remove it if it is.
"""
if len(word) > 3:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif len(pat) != 0:
p = pat
if word[1] == 'ب':
return baa_pref(word[1:], p)
elif word[1] == 'ك':
return kaph_pref(word[1:], p)
elif word[1:3] == 'ال':
return al_pref(word[1:], p)
elif word[1] == 'س':
return sen_pref(word[1:], p)
elif word[1] == 'ل':
return lam_pref(word[1:], p)
elif word[1] == 'أ':
return hamza_pref(word[1:], p)
else: return word, p
else: return word, p
def waw_pref(word, p):
"""
    Check whether the waw 'و' is a prefix or not, and remove it if it is.
"""
if len(word) > 3:
pat = pat_r.pattern_finder(word, p)
if check_lexicon(word):
p.append(check_lexicon(word))
return '', p
elif len(pat) != 0:
p = pat
if word[1] == 'ب':
return baa_pref(word[1:], p)
elif word[1] == 'ك':
return kaph_pref(word[1:], p)
elif word[1:3] == 'ال':
return al_pref(word[1:], p)
elif word[1] == 'س':
return sen_pref(word[1:], p)
elif word[1] == 'ل':
return lam_pref(word[1:], p)
elif word[1] == 'أ':
return hamza_pref(word[1:], p)
else: return word, p
else: return word, p
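# All of the *_pref helpers above share one convention: they take (word, possible_roots), return
# (remaining_word, possible_roots), and an empty remaining word signals that a root was already
# found by check_lexicon and appended to possible_roots.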
duplicate_p_letters = ['تت', 'بب', 'كك', 'فف', 'لل', 'وو']
multiletter = ['نست', 'لن', 'مست', 'سي', 'سن', 'ست', 'سأ', 'تنن', 'است', 'ات']
def multiletter_pre(word):
    """
    Strip the longest multi-letter prefix listed in `multiletter` from the word;
    return the remainder, or False if no such prefix is found.
    """
    x, p = '', ''
for i in word:
x += i
if x in multiletter:
p = x
if p != '': return word[len(p):]
else: return False
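# Examples of multiletter_pre, traced from the multiletter list above:
#   multiletter_pre("سيكتب")  ->  "كتب"   (the prefix "سي" is stripped)
#   multiletter_pre("كتب")    ->  False   (no multi-letter prefix found)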
def pref_handler(word, possible_roots):
"""
    Handle the prefix of a word: strip duplicated leading letters and multi-letter prefixes,
    look the word up in the lexicon, then delegate to the single-prefix helpers.
"""
if len(word) > 3:
pos_r = possible_roots
p = pat_r.pattern_finder(word, pos_r)
possible_roots = []
if word[0:2] in duplicate_p_letters:
possible_roots.append(word[1:])
return '', possible_roots
elif multiletter_pre(word):
possible_roots.append(multiletter_pre(word))
return multiletter_pre(word), []
elif check_lexicon(word):
possible_roots.append(check_lexicon(word))
return '', possible_roots
elif len(p) != 0:
possible_roots = p
if word[0] == 'س':
return sen_pref(word, possible_roots)
elif word[0:2] == 'ال' and len(word) > 4:
return al_pref(word, possible_roots)
elif word[0] == 'أ':
return hamza_pref(word, possible_roots)
elif word[0] == 'ل':
return lam_pref(word, possible_roots)
elif word[0] == 'ب':
return baa_pref(word, possible_roots)
elif word[0] == 'ك':
return kaph_pref(word, possible_roots)
elif word[0] == 'ف':
return faa_pref(word, possible_roots)
elif word[0] == 'و':
return waw_pref(word, possible_roots)
else: return word, possible_roots
return word, possible_roots
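# pref_handler depends on pattern_recognition and the lexicon files, so its output is data dependent.
# Roughly: duplicated leading letters and multi-letter prefixes are stripped first, a word found in the
# lexicon is returned as a root, single-letter prefixes are routed to their helpers, and words of 3
# letters or fewer are returned unchanged.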
def suff_handler(word, possible_roots):
    """
    Handle the suffix of a word: recursively strip the suffixes found by suff_finder
    until none remain or a lexicon root is found.
    """
    c = suff_finder(word)
p = pat_r.pattern_finder(word, possible_roots)
if len(p) != 0:
possible_roots = p
if len(word) != 0 and c != '':
if check_lexicon(word):
possible_roots.append(check_lexicon(word))
return '', possible_roots
return suff_handler(word[0:len(word)-len(c)], possible_roots)
else: return word, possible_roots
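# A minimal, hypothetical demo of the pipeline (a sketch only: pref_handler and suff_handler need the
# pattern_recognition module and the files/lexicon/ data, so run this from inside the repository).
if __name__ == "__main__":
    sample = normalize("والكُتَّاب")       # -> "والكتتاب"
    print(sample)
    print(suff_finder(sample))             # longest known suffix of the normalized word, if any
    stem, roots = pref_handler(sample, [])
    if stem:                               # an empty stem means a root was already found
        stem, roots = suff_handler(stem, roots)
    print(stem, roots)                     # remaining stem and the possible roots collected so far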