Skip to content

Commit a598575

Browse files
Calculate each subunit's offsets from its morpheme's own begin/end positions (relative to the first morpheme) instead of accumulating lengths
1 parent 85a66af commit a598575

File tree

1 file changed

+21
-16
lines changed

1 file changed

+21
-16
lines changed

src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java

Lines changed: 21 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -130,13 +130,11 @@ private void setAUnitAttribute() {
130130
posIncAtt.setPositionIncrement(1);
131131
}
132132

133-
int startOffset = subunits.offset();
134-
Morpheme morpheme = subunits.next();
135-
int endOffset = subunits.offset();
136-
termAtt.setEmpty().append(morpheme.surface());
137-
morphemeAtt.setMorpheme(morpheme);
138-
morphemeAtt.setOffsets(offsetMap.subList(startOffset, endOffset + 1));
139-
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
133+
MorphemeSubunits.Subunit su = subunits.next();
134+
termAtt.setEmpty().append(su.morpheme.surface());
135+
morphemeAtt.setMorpheme(su.morpheme);
136+
morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1));
137+
offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end));
140138
}
141139

142140
private void setOOVAttribute() {
@@ -194,29 +192,40 @@ public int offset() {
194192
}
195193

196194
static class MorphemeSubunits {
195+
static class Subunit {
196+
final Morpheme morpheme;
197+
final int begin;
198+
final int end;
199+
200+
public Subunit(Morpheme morpheme, int begin, int end) {
201+
this.morpheme = morpheme;
202+
this.begin = begin;
203+
this.end = end;
204+
}
205+
}
206+
197207
private List<Morpheme> morphemes;
198208
private int size;
199209
private int index;
200-
private int offset;
210+
private int baseOffset;
201211

202212
public void setUnits(List<Morpheme> morphemes) {
203213
this.morphemes = morphemes;
204214
size = morphemes.size();
205215
index = 0;
206-
offset = 0;
216+
baseOffset = morphemes.get(0).begin();
207217
}
208218

209219
public boolean hasNext() {
210220
return index < size;
211221
}
212222

213-
public Morpheme next() {
223+
public Subunit next() {
214224
if (!hasNext()) {
215225
throw new IllegalStateException();
216226
}
217227
Morpheme m = morphemes.get(index++);
218-
offset += m.end() - m.begin();
219-
return m;
228+
return new Subunit(m, m.begin() - baseOffset, m.end() - baseOffset);
220229
}
221230

222231
public int size() {
@@ -226,9 +235,5 @@ public int size() {
226235
public int index() {
227236
return index;
228237
}
229-
230-
public int offset() {
231-
return offset;
232-
}
233238
}
234239
}

0 commit comments

Comments
 (0)