Skip to content

Commit 0a0950a

Browse files
add integration test with icu normalizer
1 parent c5483c3 commit 0a0950a

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed

test-scripts/01-integration-test.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,122 @@ def test_icu_filtered_stuff_is_not_trimmed(self):
142142
self.assertEqual(5, tokens[0]["end_offset"])
143143
return
144144

145+
def test_correct_split_offset_with_icu_filter(self):
    """Offsets of sudachi_split sub-tokens must be mapped back through the
    icu_normalizer char_filter's offset correction.

    The text contains ㍿ (normalized to 株式会社) and アッフ + U+309A sequences
    (composed to アップルパイ), so offsets in the normalized stream differ
    from offsets in the original input.
    """
    body = {
        "tokenizer": "sudachi_tokenizer",
        "char_filter": {
            "type": "icu_normalizer",
            "name": "nfkc_cf",
            "mode": "compose"
        },
        "filter": {
            "type": "sudachi_split",
            "mode": "search"
        },
        "text": "六三四㍿のアッフ\u309Aルハ\u309Aイ",
    }
    resp = es_instance.analyze(body)
    self.assertEqual(200, resp.status, f"data: {resp.data}")

    tokens = json.loads(resp.data.decode())["tokens"]
    self.assertEqual(8, len(tokens))

    # ㍿ normalizes to 株式会社; the whole token maps back to the single ㍿ char.
    self.assertEqual("株式会社", tokens[1]["token"])
    self.assertEqual(1, tokens[1]["position"])
    self.assertEqual(2, tokens[1]["positionLength"])
    self.assertEqual(3, tokens[1]["start_offset"])
    self.assertEqual(4, tokens[1]["end_offset"])

    # Sub-units of 株式会社 stay within the original one-character span.
    self.assertEqual("株式", tokens[2]["token"])
    self.assertEqual(1, tokens[2]["position"])
    self.assertEqual(3, tokens[2]["start_offset"])
    self.assertEqual(3, tokens[2]["end_offset"])
    self.assertEqual("会社", tokens[3]["token"])
    self.assertEqual(2, tokens[3]["position"])
    self.assertEqual(3, tokens[3]["start_offset"])
    self.assertEqual(4, tokens[3]["end_offset"])

    self.assertEqual("アップルパイ", tokens[5]["token"])
    self.assertEqual(4, tokens[5]["position"])
    # FIX: the original asserted tokens[1]["positionLength"] again here
    # (copy-paste error); the intended check is that アップルパイ spans its
    # two sub-units (アップル + パイ), i.e. tokens[5].
    self.assertEqual(2, tokens[5]["positionLength"])
    self.assertEqual(5, tokens[5]["start_offset"])
    self.assertEqual(13, tokens[5]["end_offset"])

    self.assertEqual("アップル", tokens[6]["token"])
    self.assertEqual(4, tokens[6]["position"])
    self.assertEqual(5, tokens[6]["start_offset"])
    self.assertEqual(10, tokens[6]["end_offset"])
    self.assertEqual("パイ", tokens[7]["token"])
    self.assertEqual(5, tokens[7]["position"])
    self.assertEqual(10, tokens[7]["start_offset"])
    self.assertEqual(13, tokens[7]["end_offset"])
    return
195+
def test_correct_OOV_offset_with_icu_filter(self):
    """OOV sub-token offsets produced by sudachi_split (extended mode) must be
    corrected back through the icu_normalizer char_filter.

    ㍉㌢ normalizes to ミリセンチ and the ホ + U+3099 sequences compose, so
    per-character OOV sub-units land inside spans that are narrower (or wider)
    in the original text than in the normalized stream.
    """
    body = {
        "tokenizer": "sudachi_tokenizer",
        "char_filter": {
            "type": "icu_normalizer",
            "name": "nfkc_cf",
            "mode": "compose"
        },
        "filter": {
            "type": "sudachi_split",
            "mode": "extended"
        },
        "text": "10㍉㌢進んでホ\u3099\u3099\u3099",
    }
    resp = es_instance.analyze(body)
    self.assertEqual(200, resp.status, f"data: {resp.data}")

    tokens = json.loads(resp.data.decode())["tokens"]
    self.assertEqual(13, len(tokens))

    # Expected attributes per token index; positionLength is only present
    # on the parent (unsplit) tokens.
    expectations = {
        1: {"token": "ミリセンチ", "position": 1, "positionLength": 5,
            "start_offset": 2, "end_offset": 4},
        2: {"token": "ミ", "position": 1, "start_offset": 2, "end_offset": 2},
        3: {"token": "リ", "position": 2, "start_offset": 2, "end_offset": 3},
        4: {"token": "セ", "position": 3, "start_offset": 3, "end_offset": 3},
        5: {"token": "ン", "position": 4, "start_offset": 3, "end_offset": 3},
        6: {"token": "チ", "position": 5, "start_offset": 3, "end_offset": 4},
        9: {"token": "ボボボ", "position": 8, "positionLength": 3,
            "start_offset": 7, "end_offset": 13},
        10: {"token": "ボ", "position": 8, "start_offset": 7, "end_offset": 9},
        11: {"token": "ボ", "position": 9, "start_offset": 9, "end_offset": 11},
        12: {"token": "ボ", "position": 10, "start_offset": 11, "end_offset": 13},
    }
    for idx, fields in expectations.items():
        for attr, want in fields.items():
            self.assertEqual(want, tokens[idx][attr])
    return
145261

146262
class TestSubplugin(unittest.TestCase):
147263
# requires :subplugin is installed with :testlib

0 commit comments

Comments
 (0)