Skip to content

Commit 03224f6

Browse files
Merge pull request #151 from WorksApplications/fix/disallow-empty-morpheme
Disallow empty morpheme by default
2 parents ee664ba + 6e45504 commit 03224f6

File tree

10 files changed

+162
-49
lines changed

10 files changed

+162
-49
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ The `sudachi_tokenizer` tokenizer tokenizes input texts using Sudachi.
102102
- A: The shortest units equivalent to the UniDic short unit
103103
- Ex) 選挙,管理,委員,会
104104
- discard\_punctuation: Select to discard punctuation or not. (bool, default: true)
105+
- allow\_empty\_morpheme: Select whether to allow output morphemes to have an empty span. (bool, default: false)
106+
- This matters when the input text contains a composite character (e.g. ㍿) that is split into multiple morphemes. If false (default), every split morpheme is given the span of the whole character. If true, only the first morpheme is given the span, and the spans of the other morphemes may be empty.
105107
- settings\_path: Sudachi setting file path. The path may be absolute or relative; relative paths are resolved with respect to es\_config. (string, default: null)
106108
- resources\_path: Sudachi dictionary path. The path may be absolute or relative; relative paths are resolved with respect to es\_config. (string, default: null)
107109
- additional_settings: Describes a configuration JSON string for Sudachi. This JSON string will be merged into the default configuration. If this property is set, `settings_path` will be overridden.

src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,52 +29,61 @@ class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) {
2929
private val basePath = resourcesPath(env, settings)
3030
private val fullAnchor = PathAnchor.filesystem(basePath).andThen(anchor)
3131

32+
val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true)
33+
// Defaults to false so that every morpheme has a non-empty span in the input text
34+
val allowEmptyMorpheme: Boolean = settings.getAsBoolean(PARAM_ALLOW_EMPTY_MORPHEME, false)
35+
val mode = splitMode(settings)
36+
3237
val compiled: Config = run {
3338
val base = settingsFile(settings)
3439
val additional = settingsInlineString(settings, fullAnchor)
3540
additional.withFallback(base).anchoredWith(fullAnchor)
3641
}
3742

38-
val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true)
39-
40-
val mode = splitMode(settings)
41-
4243
private fun settingsFile(settings: Settings): Config {
4344
val settingsPath = settings.get(PARAM_SETTINGS_PATH)
44-
return if (settingsPath == null) {
45-
readDefaultConfig(basePath, fullAnchor)
46-
} else {
47-
val configObject = fullAnchor.resource<Any>(settingsPath)
48-
Config.fromResource(configObject, fullAnchor)
49-
}
45+
val base =
46+
if (settingsPath == null) {
47+
readDefaultConfig(basePath, fullAnchor)
48+
} else {
49+
val configObject = fullAnchor.resource<Any>(settingsPath)
50+
Config.fromResource(configObject, fullAnchor)
51+
}
52+
return base.allowEmptyMorpheme(allowEmptyMorpheme)
5053
}
5154

5255
companion object {
5356
const val PARAM_SPLIT_MODE_DEPRECATED = "mode"
57+
const val PARAM_SPLIT_MODE = "split_mode"
5458
const val PARAM_SETTINGS_PATH = "settings_path"
59+
const val PARAM_RESOURCES_PATH = "resources_path"
5560
const val PARAM_ADDITIONAL_SETTINGS = "additional_settings"
5661
const val PARAM_DISCARD_PUNCTUATION = "discard_punctuation"
62+
const val PARAM_ALLOW_EMPTY_MORPHEME = "allow_empty_morpheme"
63+
64+
const val DEFAULT_SETTINGS_FILENAME = "sudachi.json"
65+
const val DEFAULT_RESOURCE_PATH = "sudachi"
5766

58-
private object SplitModeFlag : EnumFlag<SplitMode>("split_mode", SplitMode.C)
67+
private object SplitModeFlag : EnumFlag<SplitMode>(PARAM_SPLIT_MODE, SplitMode.C)
5968

6069
@JvmStatic
6170
fun splitMode(settings: Settings): SplitMode {
6271
if (settings.get(PARAM_SPLIT_MODE_DEPRECATED, null) != null) {
6372
throw IllegalArgumentException(
64-
"Setting $PARAM_SPLIT_MODE_DEPRECATED is deprecated, use SudachiSplitFilter instead",
73+
"Setting $PARAM_SPLIT_MODE_DEPRECATED is deprecated, use $PARAM_SPLIT_MODE instead",
6574
)
6675
}
6776
return SplitModeFlag.get(settings)
6877
}
6978

7079
@JvmStatic
7180
fun resourcesPath(env: Environment, settings: Settings): Path {
72-
return env.configFile().resolve(settings.get("resources_path", "sudachi"))
81+
return env.configFile().resolve(settings.get(PARAM_RESOURCES_PATH, DEFAULT_RESOURCE_PATH))
7382
}
7483

7584
private fun readDefaultConfig(root: Path, baseAnchor: PathAnchor): Config {
7685
val anchor = PathAnchor.filesystem(root).andThen(baseAnchor)
77-
val resolved = root.resolve("sudachi.json")
86+
val resolved = root.resolve(DEFAULT_SETTINGS_FILENAME)
7887
val exists =
7988
try {
8089
resolved.exists()

src/main/java/com/worksap/nlp/elasticsearch/sudachi/index/SudachiTokenizerFactory.kt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,8 @@ class SudachiTokenizerFactory(
5050
}
5151
}
5252

53-
private val mode = ConfigAdapter.splitMode(settings)
54-
5553
private val config = ConfigAdapter(service.anchor, settings, env)
54+
private val mode = config.mode
5655

5756
private val dictionary by lazy { service.forConfig(config.compiled) }
5857

src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,68 @@ class CustomAnalyzerTest : SearchEngineTestBase {
4545
""".jsonSettings()
4646
val analyzers = engine.indexAnalyzers(settings)
4747
val basic = analyzers.get("sudachi_basic")
48-
basic.assertTerms("東京に行く", "東京", "", "行く")
48+
basic.assertTerms("東京に行く。", "東京", "", "行く")
49+
}
50+
51+
@Test
52+
fun discardPunctuationFalse() {
53+
val settings =
54+
"""
55+
{
56+
"index.analysis": {
57+
"analyzer": {
58+
"sudachi_basic": {
59+
"type": "custom",
60+
"tokenizer": "sudachi_tokenizer"
61+
}
62+
},
63+
"tokenizer": {
64+
"sudachi_tokenizer": {
65+
"type": "sudachi_tokenizer",
66+
"discard_punctuation": false
67+
}
68+
}
69+
}
70+
}
71+
""".jsonSettings()
72+
val analyzers = engine.indexAnalyzers(settings)
73+
val basic = analyzers.get("sudachi_basic")
74+
basic.assertTerms("東京に行く。", "東京", "", "行く", "")
75+
}
76+
77+
@Test
78+
fun allowEmptyMorphemeTrue() {
79+
val settings =
80+
"""
81+
{
82+
"index.analysis": {
83+
"analyzer": {
84+
"sudachi_basic": {
85+
"type": "custom",
86+
"tokenizer": "sudachi_tokenizer"
87+
}
88+
},
89+
"tokenizer": {
90+
"sudachi_tokenizer": {
91+
"type": "sudachi_tokenizer",
92+
"split_mode": "A",
93+
"allow_empty_morpheme": true
94+
}
95+
}
96+
}
97+
}
98+
""".jsonSettings()
99+
val analyzers = engine.indexAnalyzers(settings)
100+
val basic = analyzers.get("sudachi_basic")
101+
basic.assertTerms("㍿に行く", "", "", "", "行く")
49102
}
50103

51104
@Test
52105
fun stoptagsEmpty() {
53106
val settings =
54107
"""
55108
{
56-
"index.analysis": {
109+
"index.analysis": {
57110
"analyzer": {
58111
"sudachi_basic": {
59112
"type": "custom",
@@ -71,7 +124,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
71124
},
72125
"filter": {
73126
"pos": {
74-
"type": "sudachi_part_of_speech"
127+
"type": "sudachi_part_of_speech"
75128
}
76129
}
77130
}
@@ -87,7 +140,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
87140
val settings =
88141
"""
89142
{
90-
"index.analysis": {
143+
"index.analysis": {
91144
"analyzer": {
92145
"sudachi_basic": {
93146
"type": "custom",
@@ -122,7 +175,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
122175
val settings =
123176
"""
124177
{
125-
"index.analysis": {
178+
"index.analysis": {
126179
"analyzer": {
127180
"sudachi_test": {
128181
"type": "custom",
@@ -156,7 +209,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
156209
val settings =
157210
"""
158211
{
159-
"index.analysis": {
212+
"index.analysis": {
160213
"analyzer": {
161214
"sudachi_test": {
162215
"type": "custom",
@@ -190,7 +243,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
190243
val settings =
191244
"""
192245
{
193-
"index.analysis": {
246+
"index.analysis": {
194247
"analyzer": {
195248
"sudachi_test": {
196249
"type": "custom",
@@ -224,7 +277,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
224277
val settings =
225278
"""
226279
{
227-
"index.analysis": {
280+
"index.analysis": {
228281
"analyzer": {
229282
"sudachi_test": {
230283
"type": "custom",
@@ -259,7 +312,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
259312
val settings =
260313
"""
261314
{
262-
"index.analysis": {
315+
"index.analysis": {
263316
"analyzer": {
264317
"sudachi_test": {
265318
"type": "custom",
@@ -294,7 +347,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
294347
val settings =
295348
"""
296349
{
297-
"index.analysis": {
350+
"index.analysis": {
298351
"analyzer": {
299352
"sudachi_test": {
300353
"type": "custom",
@@ -329,7 +382,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
329382
val settings =
330383
"""
331384
{
332-
"index.analysis": {
385+
"index.analysis": {
333386
"analyzer": {
334387
"sudachi_test": {
335388
"type": "custom",

src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/TestSudachiAnalysis.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
5151
val settings =
5252
"""
5353
{
54-
"index.analysis": {
54+
"index.analysis": {
5555
"analyzer": {
5656
"sudachi": {
5757
"type": "sudachi",
@@ -71,7 +71,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
7171
val settings =
7272
"""
7373
{
74-
"index.analysis": {
74+
"index.analysis": {
7575
"analyzer": {
7676
"sudachi": {
7777
"type": "sudachi",

src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,8 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
201201
val tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
202202
assertTokenStreamContents(
203203
tokenStream,
204-
arrayOf("六三四", "", "", "", "", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
205-
intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
204+
arrayOf("六三四", "", "", "", "", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
205+
intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
206206
intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
207207
intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
208208
intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
@@ -219,7 +219,7 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
219219
assertTokenStreamContents(
220220
tokenStream,
221221
arrayOf("六三四", "株式会社", "株式", "会社", "", "行く", "ガガガ", "ガガ", ""),
222-
intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
222+
intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
223223
intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
224224
intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
225225
intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),

0 commit comments

Comments
 (0)