Skip to content

Commit 4dae125

Browse files
add split filter test with input normalization
1 parent b3c8c64 commit 4dae125

File tree

4 files changed

+96
-7
lines changed

4 files changed

+96
-7
lines changed

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2023 Works Applications Co., Ltd.
2+
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -196,6 +196,69 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
196196
)
197197
}
198198

199+
/**
 * Feeds "六三四㍿に行くカ゛カ゛カ゛" through the token stream built by
 * setUpTokenStream in "search" mode and checks the emitted tokens.
 *
 * NOTE(review): the arguments to assertTokenStreamContents are presumably, in
 * order: terms, start offsets, end offsets, position increments, position
 * lengths and final offset (Lucene BaseTokenStreamTestCase) — confirm.
 * NOTE(review): several expected terms are empty strings; the reason is not
 * visible in this hunk.
 */
@Test
200+
fun testWithCharNormalizationBySearchMode() {
201+
val tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
202+
assertTokenStreamContents(
203+
tokenStream,
204+
arrayOf("六三四", "", "", "", "", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
205+
// Entries 1-3 all lie within 3..4, the single input character "㍿".
intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
206+
intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
207+
intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
208+
intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
209+
// 13 equals the length of the input string in UTF-16 code units.
13,
210+
)
211+
}
212+
213+
/**
 * Same input and "search" mode as testWithCharNormalizationBySearchMode, but
 * the stream is additionally wrapped by SudachiNormalizedFormFilterFactory
 * before the assertion.
 *
 * The expected offsets, position increments and position lengths are identical
 * to the un-wrapped test; only the expected terms differ (e.g. "株式会社",
 * "株式", "会社", "ガガガ").
 */
@Test
214+
fun testWithCharNormalizationInNormalizedFormBySearchMode() {
215+
var tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
216+
// The factory is created with an empty argument map.
val normFactory = SudachiNormalizedFormFilterFactory(mutableMapOf())
217+
tokenStream = normFactory.create(tokenStream)
218+
219+
assertTokenStreamContents(
220+
tokenStream,
221+
arrayOf("六三四", "株式会社", "株式", "会社", "", "行く", "ガガガ", "ガガ", ""),
222+
// "株式会社", "株式" and "会社" all map into 3..4, the single input character "㍿".
intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
223+
intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
224+
intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
225+
intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
226+
// 13 equals the length of the input string in UTF-16 code units.
13,
227+
)
228+
}
229+
230+
/**
 * Feeds "10㌢㍍いったソ゛" through the token stream built by setUpTokenStream
 * in "extended" mode and checks the emitted tokens.
 *
 * NOTE(review): the arguments to assertTokenStreamContents are presumably, in
 * order: terms, start offsets, end offsets, position increments, position
 * lengths and final offset (Lucene BaseTokenStreamTestCase) — confirm.
 */
@Test
231+
fun testWithCharNormalizationByExtendedMode() {
232+
// Extending the normalized form would seem more natural, but offsets for it cannot be calculated.
233+
val tokenStream = setUpTokenStream("extended", "10㌢㍍いったソ゛")
234+
assertTokenStreamContents(
235+
tokenStream,
236+
arrayOf("1", "0", "㌢㍍", "", "", "いっ", "", "ソ゛", "", ""),
237+
// "㌢㍍" spans 2..4 and is followed by entries at 2..3 and 3..4; "ソ゛" spans 7..9, followed by 7..8 and 8..9.
intArrayOf(0, 1, 2, 2, 3, 4, 6, 7, 7, 8),
238+
intArrayOf(1, 2, 4, 3, 4, 6, 7, 9, 8, 9),
239+
intArrayOf(1, 1, 1, 0, 1, 1, 1, 1, 0, 1),
240+
intArrayOf(1, 1, 2, 1, 1, 1, 1, 2, 1, 1),
241+
// 9 equals the length of the input string in UTF-16 code units.
9,
242+
)
243+
}
244+
245+
/**
 * Same input and "extended" mode as testWithCharNormalizationByExtendedMode,
 * but the stream is additionally wrapped by SudachiNormalizedFormFilterFactory
 * before the assertion.
 *
 * The expected offsets, position increments and position lengths are identical
 * to the un-wrapped test; only the expected terms differ (e.g. "センチメートル",
 * "行く").
 */
@Test
246+
fun testWithCharNormalizationInNormalizedFormByExtendedMode() {
247+
// Extending the normalized form would seem more natural, but offsets for it cannot be calculated.
248+
var tokenStream = setUpTokenStream("extended", "10㌢㍍いったソ゛")
249+
// The factory is created with an empty argument map.
val normFactory = SudachiNormalizedFormFilterFactory(mutableMapOf())
250+
tokenStream = normFactory.create(tokenStream)
251+
252+
assertTokenStreamContents(
253+
tokenStream,
254+
arrayOf("1", "0", "センチメートル", "", "", "行く", "", "", "", ""),
255+
intArrayOf(0, 1, 2, 2, 3, 4, 6, 7, 7, 8),
256+
intArrayOf(1, 2, 4, 3, 4, 6, 7, 9, 8, 9),
257+
intArrayOf(1, 1, 1, 0, 1, 1, 1, 1, 0, 1),
258+
intArrayOf(1, 1, 2, 1, 1, 1, 1, 2, 1, 1),
259+
// 9 equals the length of the input string in UTF-16 code units.
// NOTE(review): the sibling tests close this call with "9," and ")" on separate lines.
9)
260+
}
261+
199262
fun setUpTokenStream(mode: String, input: String): TokenStream {
200263
val factory =
201264
SudachiSplitFilterFactory(

src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
package com.worksap.nlp.lucene.sudachi.ja.attributes
1818

19-
import com.worksap.nlp.lucene.aliases.ToXContent
2019
import com.worksap.nlp.lucene.aliases.XContentBuilder
2120
import com.worksap.nlp.search.aliases.XContentType
2221
import com.worksap.nlp.sudachi.Config
@@ -65,6 +64,19 @@ class MorphemeAttributeImplTest {
6564
assertNull(morphemeAtt.getMorpheme())
6665
}
6766

67+
/**
 * Checks the offsets accessor pair on MorphemeAttributeImpl: a fresh instance
 * reports empty offsets, a list passed to setOffsets is returned by
 * getOffsets, and setting an empty list makes the offsets empty again.
 */
@Test
68+
fun setOffsets() {
69+
// NOTE(review): morphemeAtt is never reassigned in this test, so `val` would suffice.
var morphemeAtt = MorphemeAttributeImpl()
70+
assertTrue(morphemeAtt.getOffsets().isEmpty())
71+
72+
val intlist = listOf(1, 2, 3)
73+
morphemeAtt.setOffsets(intlist)
74+
assertEquals(intlist, morphemeAtt.getOffsets())
75+
76+
// Resetting with an empty list must clear the previously stored offsets.
morphemeAtt.setOffsets(listOf())
77+
assertTrue(morphemeAtt.getOffsets().isEmpty())
78+
}
79+
6880
@Test
6981
fun copyTo() {
7082
var morphemeAtt1 = MorphemeAttributeImpl()
@@ -85,15 +97,14 @@ class MorphemeAttributeImplTest {
8597
var morphemeAtt = MorphemeAttributeImpl()
8698
val morpheme = getFirstMorpheme("東京都")!!
8799
morphemeAtt.setMorpheme(morpheme)
100+
val offsets = listOf(0, 3)
101+
morphemeAtt.setOffsets(offsets)
88102

89103
val builder = XContentBuilder.builder(XContentType.JSON.xContent())
90104
builder.startObject()
91105
morphemeAtt.reflectWith(
92106
fun(attClass, key, value) {
93107
assertEquals(MorphemeAttribute::class.java, attClass)
94-
assertEquals("morpheme", key)
95-
assertTrue(value is ToXContent)
96-
97108
builder.field(key, value)
98109
})
99110
builder.endObject()
@@ -103,15 +114,21 @@ class MorphemeAttributeImplTest {
103114
val deserialized = Json.decodeFromString<MorphemeHolder>(serialized)
104115

105116
assertNotNull(deserialized.morpheme)
117+
assertNotNull(deserialized.offsetMap)
106118
assertEquals(morpheme.surface(), deserialized.morpheme.surface)
107119
assertEquals(morpheme.dictionaryForm(), deserialized.morpheme.dictionaryForm)
108120
assertEquals(morpheme.normalizedForm(), deserialized.morpheme.normalizedForm)
109121
assertEquals(morpheme.readingForm(), deserialized.morpheme.readingForm)
110122
assertEquals(morpheme.partOfSpeech(), deserialized.morpheme.partOfSpeech)
123+
assertEquals(offsets, deserialized.offsetMap)
111124
}
112125
}
113126

114-
@Serializable data class MorphemeHolder(val morpheme: MorphemeAttributeHolder)
127+
/**
 * Deserialization target for the JSON object this test file builds via
 * MorphemeAttributeImpl.reflectWith and decodes with Json.decodeFromString.
 *
 * `morpheme` holds the serialized morpheme fields; `offsetMap` holds the
 * offsets list that the test compares against the value passed to setOffsets.
 */
@Serializable
128+
data class MorphemeHolder(
129+
val morpheme: MorphemeAttributeHolder,
130+
val offsetMap: List<Int>,
131+
)
115132

116133
@Serializable
117134
data class MorphemeAttributeHolder(

src/test/resources/com/worksap/nlp/lucene/sudachi/ja/sudachi.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
{
22
"systemDict" : "system_core.dic",
3+
"inputTextPlugin" : [
4+
{ "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }
5+
],
36
"oovProviderPlugin" : [
47
{ "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
58
"oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],

src/test/resources/dict/lex.csv

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,10 @@
3737
012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-30000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,*
3838
特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,*
3939
な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,*
40-
ふく,4,4,5105,ふく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,フク,吹く,*,A,*,*,*,*
40+
ふく,4,4,5105,ふく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,フク,吹く,*,A,*,*,*,*
41+
株式,8,8,5611,株式,名詞,普通名詞,一般,*,*,*,カブシキ,株式,*,A,*,*,*,*
42+
会社,8,8,2914,会社,名詞,普通名詞,一般,*,*,*,カイシャ,会社,*,A,*,*,*,*
43+
株式会社,8,8,6000,株式会社,名詞,普通名詞,一般,*,*,*,カブシキガイシャ,株式会社,*,C,40/41,40/41,40/41,*
44+
ガ,5,5,3500,ガ,副詞,*,*,*,*,*,ガ,ガ,*,A,*,*,*,*
45+
ガガ,5,5,5500,ガガ,副詞,*,*,*,*,*,ガガ,ガガ,*,A,*,*,*,*
46+
ガガガ,5,5,8494,ガガガ,副詞,*,*,*,*,*,ガガガ,ガガガ,*,B,44/43,*,44/43,*

0 commit comments

Comments
 (0)