@@ -142,6 +142,122 @@ def test_icu_filtered_stuff_is_not_trimmed(self):
142
142
self .assertEqual (5 , tokens [0 ]["end_offset" ])
143
143
return
144
144
145
def test_correct_split_offset_with_icu_filter(self):
    """Check sudachi_split (search mode) output behind an icu_normalizer.

    Sends an ``_analyze`` request combining the ``sudachi_tokenizer``, an
    ``icu_normalizer`` char filter (nfkc_cf, compose) and the
    ``sudachi_split`` filter in ``search`` mode, then verifies the token
    text, position, position length and offsets of the compound tokens
    and of the sub-tokens they are split into.
    """
    body = {
        "tokenizer": "sudachi_tokenizer",
        "char_filter": {
            "type": "icu_normalizer",
            "name": "nfkc_cf",
            "mode": "compose",
        },
        "filter": {
            "type": "sudachi_split",
            "mode": "search",
        },
        # The text must be contiguous (no whitespace after the combining
        # marks): the offsets asserted below, e.g. [5, 13) for the
        # 8-character tail, only add up for a 13-character input.
        "text": "六三四㍿のアッフ\u309Aルハ\u309Aイ",
    }
    resp = es_instance.analyze(body)
    self.assertEqual(200, resp.status, f"data: {resp.data} ")

    tokens = json.loads(resp.data.decode())["tokens"]
    self.assertEqual(8, len(tokens))

    # (token index, expected attributes). Only the compound tokens carry a
    # positionLength expectation; their sub-tokens are checked for text,
    # position and offsets.
    expectations = [
        (1, {"token": "株式会社", "position": 1, "positionLength": 2,
             "start_offset": 3, "end_offset": 4}),
        (2, {"token": "株式", "position": 1,
             "start_offset": 3, "end_offset": 3}),
        (3, {"token": "会社", "position": 2,
             "start_offset": 3, "end_offset": 4}),
        # positionLength is checked on tokens[5] itself; the original
        # re-checked tokens[1] here, leaving this compound unverified.
        (5, {"token": "アップルパイ", "position": 4, "positionLength": 2,
             "start_offset": 5, "end_offset": 13}),
        (6, {"token": "アップル", "position": 4,
             "start_offset": 5, "end_offset": 10}),
        (7, {"token": "パイ", "position": 5,
             "start_offset": 10, "end_offset": 13}),
    ]
    for index, attributes in expectations:
        for key, value in attributes.items():
            # The message pinpoints which token/attribute mismatched.
            self.assertEqual(
                value, tokens[index][key], f"tokens[{index}][{key!r}]"
            )
194
+
195
def test_correct_OOV_offset_with_icu_filter(self):
    """Check sudachi_split (extended mode) output behind an icu_normalizer.

    Sends an ``_analyze`` request combining the ``sudachi_tokenizer``, an
    ``icu_normalizer`` char filter (nfkc_cf, compose) and the
    ``sudachi_split`` filter in ``extended`` mode, then verifies the token
    text, position, position length and offsets of two multi-character
    tokens and of the single-character tokens emitted after each of them.
    """
    body = {
        "tokenizer": "sudachi_tokenizer",
        "char_filter": {
            "type": "icu_normalizer",
            "name": "nfkc_cf",
            "mode": "compose",
        },
        "filter": {
            "type": "sudachi_split",
            "mode": "extended",
        },
        # The text must be contiguous (no whitespace after the combining
        # marks): the three single-character tokens asserted below sit at
        # [7, 9), [9, 11) and [11, 13), which leaves no room for spaces.
        "text": "10㍉㌢進んでホ\u3099ホ\u3099ホ\u3099",
    }
    resp = es_instance.analyze(body)
    self.assertEqual(200, resp.status, f"data: {resp.data} ")

    tokens = json.loads(resp.data.decode())["tokens"]
    self.assertEqual(13, len(tokens))

    # (token index, expected attributes). Only the multi-character tokens
    # carry a positionLength expectation; the per-character tokens are
    # checked for text, position and offsets.
    expectations = [
        (1, {"token": "ミリセンチ", "position": 1, "positionLength": 5,
             "start_offset": 2, "end_offset": 4}),
        (2, {"token": "ミ", "position": 1,
             "start_offset": 2, "end_offset": 2}),
        (3, {"token": "リ", "position": 2,
             "start_offset": 2, "end_offset": 3}),
        (4, {"token": "セ", "position": 3,
             "start_offset": 3, "end_offset": 3}),
        (5, {"token": "ン", "position": 4,
             "start_offset": 3, "end_offset": 3}),
        (6, {"token": "チ", "position": 5,
             "start_offset": 3, "end_offset": 4}),
        (9, {"token": "ボボボ", "position": 8, "positionLength": 3,
             "start_offset": 7, "end_offset": 13}),
        (10, {"token": "ボ", "position": 8,
              "start_offset": 7, "end_offset": 9}),
        (11, {"token": "ボ", "position": 9,
              "start_offset": 9, "end_offset": 11}),
        (12, {"token": "ボ", "position": 10,
              "start_offset": 11, "end_offset": 13}),
    ]
    for index, attributes in expectations:
        for key, value in attributes.items():
            # The message pinpoints which token/attribute mismatched.
            self.assertEqual(
                value, tokens[index][key], f"tokens[{index}][{key!r}]"
            )
260
+
145
261
146
262
class TestSubplugin (unittest .TestCase ):
147
263
# requires :subplugin is installed with :testlib
0 commit comments