Skip to content

Commit ee664ba

Browse files
Merge pull request #149 from WorksApplications/fix/148-correct-offset
Add offset correction for split filter
2 parents 59f9b99 + a598575 commit ee664ba

File tree

12 files changed

+422
-102
lines changed

12 files changed

+422
-102
lines changed

.github/workflows/build.yml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ jobs:
4646
- 'os:2.6.0'
4747
env:
4848
mainJob: ${{ matrix.es-version == 'es:8.15.2' }}
49+
sudachiVersion: 20241021
50+
sudachiKind: core
4951
continue-on-error: true
5052

5153
steps:
@@ -93,15 +95,16 @@ jobs:
9395
- name: Cache dictionary download
9496
uses: actions/cache@v4
9597
with:
96-
path: build/integration/sudachi-dictionary-20230110-small.zip
97-
key: sudachi-dictionary-20230110
98+
path: build/integration/sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}.zip
99+
key: sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}
98100
- name: Integration test
99101
env:
100102
ES_KIND: ${{ env.ENGINE_KIND }}
101103
ES_VERSION: ${{ env.ENGINE_VERSION }}
102104
PLUGIN_VERSION: ${{ env.PROJ_VERSION }}
103105
RUN_ES_DAEMON: 1
104-
DIC_VERSION: 20230110
106+
DIC_VERSION: ${{ env.sudachiVersion }}
107+
DIC_KIND: ${{ env.sudachiKind }}
105108
run: |
106109
bash test-scripts/00-install-elasticsearch.sh
107110
sleep 30

spi/src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023 Works Applications Co., Ltd.
2+
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
1616

1717
package com.worksap.nlp.lucene.sudachi.ja.attributes;
1818

19+
import java.util.List;
20+
1921
import com.worksap.nlp.sudachi.Morpheme;
2022
import org.apache.lucene.analysis.TokenStream;
2123
import org.apache.lucene.util.Attribute;
@@ -36,4 +38,17 @@ public interface MorphemeAttribute extends Attribute {
3638
* new object
3739
*/
3840
void setMorpheme(Morpheme morpheme);
41+
42+
/**
43+
* @return The offset mapping for the current morpheme
44+
*/
45+
List<Integer> getOffsets();
46+
47+
/**
48+
* Set the offset mapping for the morpheme
49+
*
50+
* @param offsets
51+
* actual offset for each offset in the morpheme
52+
*/
53+
void setOffsets(List<Integer> offsets);
3954
}

src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java

Lines changed: 138 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import com.worksap.nlp.lucene.sudachi.ja.attributes.*;
2424
import com.worksap.nlp.lucene.sudachi.ja.util.Strings;
2525
import com.worksap.nlp.sudachi.Morpheme;
26-
2726
import com.worksap.nlp.sudachi.Tokenizer;
2827
import org.apache.lucene.analysis.TokenFilter;
2928
import org.apache.lucene.analysis.TokenStream;
@@ -40,56 +39,18 @@ public enum Mode {
4039

4140
public static final Mode DEFAULT_MODE = Mode.SEARCH;
4241

43-
static class OovChars {
44-
private int length;
45-
private char[] buffer = new char[0];
46-
private int reserved;
47-
private int index;
48-
private int baseOffset;
49-
50-
public void setOov(int offset, char[] src, int length) {
51-
baseOffset = offset;
52-
this.length = length;
53-
if (reserved < length) {
54-
buffer = new char[length];
55-
reserved = length;
56-
}
57-
System.arraycopy(src, 0, buffer, 0, length);
58-
index = 0;
59-
}
60-
61-
public boolean hasNext() {
62-
return index < length;
63-
}
64-
65-
public char next() {
66-
if (index < length) {
67-
return buffer[index++];
68-
} else {
69-
throw new IllegalStateException();
70-
}
71-
}
72-
73-
public int index() {
74-
return index;
75-
}
76-
77-
public int offset() {
78-
return baseOffset + index;
79-
}
80-
}
81-
8242
private final Mode mode;
8343
private final Tokenizer.SplitMode splitMode;
44+
8445
private final CharTermAttribute termAtt;
8546
private final OffsetAttribute offsetAtt;
8647
private final PositionIncrementAttribute posIncAtt;
8748
private final PositionLengthAttribute posLengthAtt;
8849
private final MorphemeAttribute morphemeAtt;
89-
private ListIterator<Morpheme> aUnitIterator;
90-
private final OovChars oovChars = new OovChars();
9150

92-
private int aUnitOffset = 0;
51+
private final MorphemeSubunits subunits = new MorphemeSubunits();
52+
private final OovChars oovChars = new OovChars();
53+
private List<Integer> offsetMap;
9354

9455
public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) {
9556
super(input);
@@ -105,72 +66,174 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli
10566

10667
@Override
10768
public final boolean incrementToken() throws IOException {
69+
// continue to write current split
10870
if (oovChars.hasNext()) {
10971
clearAttributes();
11072
setOOVAttribute();
11173
return true;
11274
}
113-
if (aUnitIterator != null && aUnitIterator.hasNext()) {
75+
if (subunits.hasNext()) {
11476
clearAttributes();
115-
setAUnitAttribute(aUnitIterator.next());
77+
setAUnitAttribute();
78+
return true;
79+
}
80+
81+
// move to next morpheme
82+
if (!input.incrementToken()) {
83+
return false;
84+
}
85+
86+
Morpheme m = morphemeAtt.getMorpheme();
87+
this.offsetMap = morphemeAtt.getOffsets();
88+
if (m == null) {
11689
return true;
11790
}
11891

119-
if (input.incrementToken()) {
92+
// oov does not have splits
93+
// split into characters in extended mode
94+
if (m.isOOV()) {
12095
int length = 0;
121-
Morpheme m = morphemeAtt.getMorpheme();
122-
if (m == null) {
123-
return true;
124-
}
125-
termAtt.setEmpty().append(m.surface());
126-
if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
127-
oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
96+
if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) {
97+
// OovChars requires character length
98+
oovChars.setOov(termAtt.buffer(), termAtt.length());
99+
// Position length should be codepoint length
128100
posLengthAtt.setPositionLength(length);
129-
} else if (splitMode != Tokenizer.SplitMode.C) {
130-
List<Morpheme> subUnits = m.split(splitMode);
131-
if (subUnits.size() > 1) {
132-
aUnitIterator = subUnits.listIterator();
133-
aUnitOffset = offsetAtt.startOffset();
134-
posLengthAtt.setPositionLength(subUnits.size());
135-
} else {
136-
posLengthAtt.setPositionLength(1);
137-
}
138101
}
139102
return true;
140-
} else {
141-
return false;
142103
}
104+
105+
// C split is the longest split
106+
if (splitMode == Tokenizer.SplitMode.C) {
107+
return true;
108+
}
109+
110+
// split into A/B units
111+
List<Morpheme> subsplits = m.split(splitMode);
112+
if (subsplits.size() > 1) {
113+
subunits.setUnits(subsplits);
114+
posLengthAtt.setPositionLength(subunits.size());
115+
}
116+
117+
return true;
118+
}
119+
120+
private int correctOffset(int currectOff) {
121+
// assert (0 <= currectOff && currectOff <= this.offsetMap.size());
122+
return this.offsetMap.get(currectOff);
143123
}
144124

145-
private void setAUnitAttribute(Morpheme morpheme) {
125+
private void setAUnitAttribute() {
146126
posLengthAtt.setPositionLength(1);
147-
if (aUnitIterator.previousIndex() == 0) {
127+
if (subunits.index() == 0) {
148128
posIncAtt.setPositionIncrement(0);
149129
} else {
150130
posIncAtt.setPositionIncrement(1);
151131
}
152-
int length = morpheme.end() - morpheme.begin();
153-
offsetAtt.setOffset(aUnitOffset, aUnitOffset + length);
154-
aUnitOffset += length;
155-
morphemeAtt.setMorpheme(morpheme);
156-
termAtt.setEmpty().append(morpheme.surface());
132+
133+
MorphemeSubunits.Subunit su = subunits.next();
134+
termAtt.setEmpty().append(su.morpheme.surface());
135+
morphemeAtt.setMorpheme(su.morpheme);
136+
morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1));
137+
offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end));
157138
}
158139

159140
private void setOOVAttribute() {
160-
int offset = oovChars.offset();
161141
posLengthAtt.setPositionLength(1);
162142
if (oovChars.index() == 0) {
163143
posIncAtt.setPositionIncrement(0);
164144
} else {
165145
posIncAtt.setPositionIncrement(1);
166146
}
147+
148+
int startOffset = oovChars.offset();
167149
char c = oovChars.next();
168150
termAtt.setEmpty().append(c);
169151
if (Character.isSurrogate(c) && oovChars.hasNext()) {
170152
termAtt.append(oovChars.next());
171-
offsetAtt.setOffset(offset, offset + 2);
172-
} else {
173-
offsetAtt.setOffset(offset, offset + 1);
153+
}
154+
int endOffset = oovChars.offset();
155+
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
156+
}
157+
158+
static class OovChars {
159+
private int reserved;
160+
private char[] buffer = new char[0];
161+
private int length;
162+
private int index;
163+
164+
public void setOov(char[] src, int length) {
165+
this.length = length;
166+
if (reserved < length) {
167+
buffer = new char[length];
168+
reserved = length;
169+
}
170+
System.arraycopy(src, 0, buffer, 0, length);
171+
index = 0;
172+
}
173+
174+
public boolean hasNext() {
175+
return index < length;
176+
}
177+
178+
public char next() {
179+
if (index < length) {
180+
return buffer[index++];
181+
}
182+
throw new IllegalStateException();
183+
}
184+
185+
public int index() {
186+
return index;
187+
}
188+
189+
public int offset() {
190+
return index;
191+
}
192+
}
193+
194+
static class MorphemeSubunits {
195+
static class Subunit {
196+
final Morpheme morpheme;
197+
final int begin;
198+
final int end;
199+
200+
public Subunit(Morpheme morpheme, int begin, int end) {
201+
this.morpheme = morpheme;
202+
this.begin = begin;
203+
this.end = end;
204+
}
205+
}
206+
207+
private List<Morpheme> morphemes;
208+
private int size;
209+
private int index;
210+
private int baseOffset;
211+
212+
public void setUnits(List<Morpheme> morphemes) {
213+
this.morphemes = morphemes;
214+
size = morphemes.size();
215+
index = 0;
216+
baseOffset = morphemes.get(0).begin();
217+
}
218+
219+
public boolean hasNext() {
220+
return index < size;
221+
}
222+
223+
public Subunit next() {
224+
if (!hasNext()) {
225+
throw new IllegalStateException();
226+
}
227+
Morpheme m = morphemes.get(index++);
228+
return new Subunit(m, m.begin() - baseOffset, m.end() - baseOffset);
229+
}
230+
231+
public int size() {
232+
return size;
233+
}
234+
235+
public int index() {
236+
return index;
174237
}
175238
}
176239
}

src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.kt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,12 @@ class SudachiTokenizer(
5555
override fun incrementToken(): Boolean {
5656
clearAttributes()
5757
var m = iterator.next() ?: return false
58+
val baseOffset = iterator.baseOffset
5859

5960
morphemeAtt.setMorpheme(m)
60-
posLenAtt.positionLength = 1
61-
posIncAtt.positionIncrement = 1
62-
val baseOffset = iterator.baseOffset
61+
morphemeAtt.setOffsets((m.begin()..m.end()).map { i -> correctOffset(baseOffset + i) })
62+
posLenAtt.setPositionLength(1)
63+
posIncAtt.setPositionIncrement(1)
6364
offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
6465
termAtt.setEmpty().append(m.surface())
6566
return true

0 commit comments

Comments (0)