Commit 0585329

add and update test for new disallow-empty-morpheme default
1 parent: d2d08a5 · commit: 0585329

File tree: 7 files changed, +136 / -33 lines


src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt

Lines changed: 64 additions & 11 deletions
@@ -45,15 +45,68 @@ class CustomAnalyzerTest : SearchEngineTestBase {
         """.jsonSettings()
     val analyzers = engine.indexAnalyzers(settings)
     val basic = analyzers.get("sudachi_basic")
-    basic.assertTerms("東京に行く", "東京", "に", "行く")
+    basic.assertTerms("東京に行く。", "東京", "に", "行く")
+  }
+
+  @Test
+  fun discardPunctuationFalse() {
+    val settings =
+        """
+        {
+          "index.analysis": {
+            "analyzer": {
+              "sudachi_basic": {
+                "type": "custom",
+                "tokenizer": "sudachi_tokenizer"
+              }
+            },
+            "tokenizer": {
+              "sudachi_tokenizer": {
+                "type": "sudachi_tokenizer",
+                "discard_punctuation": false
+              }
+            }
+          }
+        }
+        """.jsonSettings()
+    val analyzers = engine.indexAnalyzers(settings)
+    val basic = analyzers.get("sudachi_basic")
+    basic.assertTerms("東京に行く。", "東京", "に", "行く", "。")
+  }
+
+  @Test
+  fun allowEmptyMorphemeTrue() {
+    val settings =
+        """
+        {
+          "index.analysis": {
+            "analyzer": {
+              "sudachi_basic": {
+                "type": "custom",
+                "tokenizer": "sudachi_tokenizer"
+              }
+            },
+            "tokenizer": {
+              "sudachi_tokenizer": {
+                "type": "sudachi_tokenizer",
+                "split_mode": "A",
+                "allow_empty_morpheme": true
+              }
+            }
+          }
+        }
+        """.jsonSettings()
+    val analyzers = engine.indexAnalyzers(settings)
+    val basic = analyzers.get("sudachi_basic")
+    basic.assertTerms("㍿に行く", "株式", "会社", "に", "行く")
   }
 
   @Test
   fun stoptagsEmpty() {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_basic": {
                 "type": "custom",
@@ -71,7 +124,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
             },
             "filter": {
               "pos": {
-              "type": "sudachi_part_of_speech"
+                "type": "sudachi_part_of_speech"
               }
             }
           }
@@ -87,7 +140,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_basic": {
                 "type": "custom",
@@ -122,7 +175,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_test": {
                 "type": "custom",
@@ -156,7 +209,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_test": {
                 "type": "custom",
@@ -190,7 +243,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_test": {
                 "type": "custom",
@@ -224,7 +277,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_test": {
                 "type": "custom",
@@ -259,7 +312,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_test": {
                 "type": "custom",
@@ -294,7 +347,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
        {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_test": {
                 "type": "custom",
@@ -329,7 +382,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi_test": {
                 "type": "custom",

src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/TestSudachiAnalysis.kt

Lines changed: 2 additions & 2 deletions
@@ -51,7 +51,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi": {
                 "type": "sudachi",
@@ -71,7 +71,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
     val settings =
         """
         {
-        "index.analysis": {
+          "index.analysis": {
             "analyzer": {
               "sudachi": {
                 "type": "sudachi",

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt

Lines changed: 3 additions & 3 deletions
@@ -201,8 +201,8 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
     val tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
     assertTokenStreamContents(
         tokenStream,
-        arrayOf("六三四", "株式会社", "株式", "会社", "に", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
-        intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
+        arrayOf("六三四", "株式会社", "株式", "会社", "に", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
+        intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
         intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
         intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
         intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
@@ -219,7 +219,7 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
     assertTokenStreamContents(
         tokenStream,
         arrayOf("六三四", "株式会社", "株式", "会社", "に", "行く", "ガガガ", "ガガ", "ガ"),
-        intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
+        intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
         intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
         intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
         intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiTokenizer.kt

Lines changed: 63 additions & 14 deletions
@@ -45,9 +45,10 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
   fun makeTokenizer(
       mode: SplitMode,
       noPunctuation: Boolean = true,
+      allowEmptyMorpheme: Boolean = false,
       capacity: Int = 0
   ): SudachiTokenizer {
-    val dict = ReloadableDictionary(config)
+    val dict = ReloadableDictionary(config.allowEmptyMorpheme(allowEmptyMorpheme))
     val extractor =
         if (capacity == 0) {
           NoopInputExtractor.INSTANCE
@@ -113,7 +114,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenByPunctuationMode() {
-    val tokenizer = makeTokenizer(SplitMode.C, false)
+    val tokenizer = makeTokenizer(SplitMode.C, noPunctuation = false)
     tokenizer.setReader(StringReader("東京都に行った。"))
     assertTokenStreamContents(
         tokenizer,
@@ -128,7 +129,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenWithPunctuationsByDefaultMode() {
-    val tokenizer = makeTokenizer(SplitMode.C, true)
+    val tokenizer = makeTokenizer(SplitMode.C, noPunctuation = true)
     tokenizer.setReader(StringReader("東京都に行った。東京都に行った。"))
     assertTokenStreamContents(
         tokenizer,
@@ -143,7 +144,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenWithPunctuationsByPunctuationMode() {
-    val tokenizer = makeTokenizer(SplitMode.C, false)
+    val tokenizer = makeTokenizer(SplitMode.C, noPunctuation = false)
     tokenizer.setReader(StringReader("東京都に行った。東京都に行った。"))
     assertTokenStreamContents(
         tokenizer,
@@ -158,7 +159,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenWithPunctuationsByPunctuationModeCached() {
-    val tokenizer = makeTokenizer(SplitMode.C, false, capacity = 10)
+    val tokenizer = makeTokenizer(SplitMode.C, noPunctuation = false, capacity = 10)
     tokenizer.setReader(StringReader("東京都に行った。東京都に行った。"))
     assertTokenStreamContents(
         tokenizer,
@@ -173,7 +174,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenWithOOVByDefaultMode() {
-    val tokenizer = makeTokenizer(SplitMode.C, true)
+    val tokenizer = makeTokenizer(SplitMode.C)
     tokenizer.setReader(StringReader("アマゾンに行った。"))
     assertTokenStreamContents(
         tokenizer,
@@ -188,7 +189,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenWithOOVByPunctuationMode() {
-    val tokenizerPunctuation = makeTokenizer(SplitMode.C, false)
+    val tokenizerPunctuation = makeTokenizer(SplitMode.C, noPunctuation = false)
     tokenizerPunctuation.setReader(StringReader("アマゾンに行った。"))
     assertTokenStreamContents(
         tokenizerPunctuation,
@@ -203,7 +204,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenByAMode() {
-    val tokenizerA = makeTokenizer(SplitMode.A, true)
+    val tokenizerA = makeTokenizer(SplitMode.A)
     tokenizerA.setReader(StringReader("東京都に行った。"))
     assertTokenStreamContents(
         tokenizerA,
@@ -218,7 +219,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun incrementTokenByBMode() {
-    val tokenizerB = makeTokenizer(SplitMode.B, true)
+    val tokenizerB = makeTokenizer(SplitMode.B)
     tokenizerB.setReader(StringReader("東京都に行った。"))
     assertTokenStreamContents(
         tokenizerB,
@@ -236,7 +237,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
     val builder = NormalizeCharMap.Builder()
     builder.add("東京都", "京都")
     val filter = MappingCharFilter(builder.build(), StringReader("東京都に行った。"))
-    val tokenizer = makeTokenizer(SplitMode.C, true)
+    val tokenizer = makeTokenizer(SplitMode.C)
     tokenizer.setReader(filter)
     assertTokenStreamContents(
         tokenizer,
@@ -249,9 +250,57 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
     )
   }
 
+  @Test
+  fun incrementTokenWithCorrectSplitOffset() {
+    val builder = NormalizeCharMap.Builder()
+    builder.add("(株)", "株式会社")
+    val filter = MappingCharFilter(builder.build(), StringReader("(株)に行った。"))
+    val tokenizer = makeTokenizer(SplitMode.A)
+    tokenizer.setReader(filter)
+    assertTokenStreamContents(
+        tokenizer,
+        arrayOf("株式", "会社", "に", "行っ", "た"),
+        intArrayOf(0, 2, 3, 4, 6),
+        intArrayOf(2, 3, 4, 6, 7),
+        intArrayOf(1, 1, 1, 1, 1),
+        intArrayOf(1, 1, 1, 1, 1),
+        8,
+    )
+  }
+
+  @Test
+  fun incrementTokenWithDisallowEmptyMorpheme() {
+    val tokenizer = makeTokenizer(SplitMode.A, allowEmptyMorpheme = false)
+    tokenizer.setReader(StringReader("㍿に行った。"))
+    assertTokenStreamContents(
+        tokenizer,
+        arrayOf("株式", "会社", "に", "行っ", "た"),
+        intArrayOf(0, 0, 1, 2, 4),
+        intArrayOf(1, 1, 2, 4, 5),
+        intArrayOf(1, 1, 1, 1, 1),
+        intArrayOf(1, 1, 1, 1, 1),
+        6,
+    )
+  }
+
+  @Test
+  fun incrementTokenWithAllowEmptyMorpheme() {
+    val tokenizer = makeTokenizer(SplitMode.A, allowEmptyMorpheme = true)
+    tokenizer.setReader(StringReader("㍿に行った。"))
+    assertTokenStreamContents(
+        tokenizer,
+        arrayOf("株式", "会社", "に", "行っ", "た"),
+        intArrayOf(0, 1, 1, 2, 4),
+        intArrayOf(1, 1, 2, 4, 5),
+        intArrayOf(1, 1, 1, 1, 1),
+        intArrayOf(1, 1, 1, 1, 1),
+        6,
+    )
+  }
+
   @Test
   fun additionalSettings() {
-    val tokenizer = makeTokenizer(SplitMode.C, true)
+    val tokenizer = makeTokenizer(SplitMode.C)
     tokenizer.setReader(StringReader("自然言語"))
     assertTokenStreamContents(
         tokenizer,
@@ -268,7 +317,7 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
     config =
         Config.fromClasspath(ResourceUtil::class.java.getResource("additional.json"), anchor)
             .withFallback(config)
-    val tokenizer2 = makeTokenizer(SplitMode.C, true)
+    val tokenizer2 = makeTokenizer(SplitMode.C)
     tokenizer2.setReader(StringReader("自然言語"))
     assertTokenStreamContents(
         tokenizer2,
@@ -283,8 +332,8 @@ open class TestSudachiTokenizer : BaseTokenStreamTestCase() {
 
   @Test
   fun equalsHashCodeCoverage() {
-    val tokenizerA = makeTokenizer(SplitMode.A, true)
-    val tokenizerB = makeTokenizer(SplitMode.B, true)
+    val tokenizerA = makeTokenizer(SplitMode.A)
+    val tokenizerB = makeTokenizer(SplitMode.B)
     assertNotEquals(tokenizerA, tokenizerB)
     assertNotEquals(tokenizerA.hashCode().toLong(), tokenizerB.hashCode().toLong())
   }
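
The three new tests above encode the offset contract for the new default: a morpheme whose normalized form maps to a zero-width span of the original text (会社 inside ㍿) is widened to cover the originating character instead of reporting an empty offset range. A standalone sketch of the same comparison against the Sudachi core API, under two assumptions: that DictionaryFactory accepts a Config (as the ReloadableDictionary call above suggests) and that the default config can resolve a system dictionary. The commented offsets mirror the expected values in the two tests above:

import com.worksap.nlp.sudachi.Config
import com.worksap.nlp.sudachi.DictionaryFactory
import com.worksap.nlp.sudachi.Tokenizer

fun main() {
  for (allowEmpty in listOf(true, false)) {
    val config = Config.defaultConfig().allowEmptyMorpheme(allowEmpty)
    DictionaryFactory().create(config).use { dict ->
      val morphemes = dict.create().tokenize(Tokenizer.SplitMode.A, "㍿に行った。")
      // allowEmpty = true : 株式 (0, 1), 会社 (1, 1)  <- 会社 gets a zero-width span
      // allowEmpty = false: 株式 (0, 1), 会社 (0, 1)  <- widened over ㍿ (new default)
      morphemes.forEach { m -> println("${m.surface()} (${m.begin()}, ${m.end()})") }
    }
  }
}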

src/test/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttributeImplTest.kt

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ class MorphemeAttributeImplTest {
   @Before
   fun setup() {
     val configDir = testDic.root.toPath().resolve("config/sudachi")
-    config = Config.fromFile(configDir.resolve("sudachi.json"))
+    config = Config.fromFile(configDir.resolve("sudachi.json")).allowEmptyMorpheme(false)
   }
 
   @Test

src/test/java/com/worksap/nlp/test/TestDictionary.kt

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ class InMemoryDictionary {
     val base = Config.fromClasspath(ResourceUtil.resource("sudachi.json"), anchor)
     val dic = TestDictionary.inMemorySystemData.duplicate()
     dic.order(ByteOrder.LITTLE_ENDIAN)
-    base.systemDictionary(BinaryDictionary(dic))
+    base.systemDictionary(BinaryDictionary(dic)).allowEmptyMorpheme(false)
   }
 
   val dic = newDictionary()

src/test/resources/dict/lex.csv

Lines changed: 2 additions & 1 deletion
@@ -43,4 +43,5 @@
 株式会社,8,8,6000,株式会社,名詞,普通名詞,一般,*,*,*,カブシキガイシャ,株式会社,*,C,40/41,40/41,40/41,*
 ガ,5,5,3500,ガ,副詞,*,*,*,*,*,ガ,ガ,*,A,*,*,*,*
 ガガ,5,5,5500,ガガ,副詞,*,*,*,*,*,ガガ,ガガ,*,A,*,*,*,*
-ガガガ,5,5,8494,ガガガ,副詞,*,*,*,*,*,ガガガ,ガガガ,*,B,44/43,*,44/43,*
\ No newline at end of file
+ガガガ,5,5,8494,ガガガ,副詞,*,*,*,*,*,ガガガ,ガガガ,*,B,44/43,*,44/43,*
+。,6,6,1861,。,補助記号,句点,*,*,*,*,。,。,*,A,*,*,*,*
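
The added row registers 。 as a proper sentence-final punctuation morpheme (POS 補助記号,句点) in the test dictionary, which the punctuation assertions above rely on. A short sketch of its effect, reusing the makeTokenizer helper from TestSudachiTokenizer above (the expected terms are inferred from the discardPunctuationFalse test, not part of the commit):

// keep punctuation so the new 。 entry surfaces as its own token
val tokenizer = makeTokenizer(SplitMode.C, noPunctuation = false)
tokenizer.setReader(StringReader("東京に行く。"))
// expected terms: "東京", "に", "行く", "。"
// with the default noPunctuation = true, the trailing "。" would be discarded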
