23
23
import com .worksap .nlp .lucene .sudachi .ja .attributes .*;
24
24
import com .worksap .nlp .lucene .sudachi .ja .util .Strings ;
25
25
import com .worksap .nlp .sudachi .Morpheme ;
26
-
27
26
import com .worksap .nlp .sudachi .Tokenizer ;
28
27
import org .apache .lucene .analysis .TokenFilter ;
29
28
import org .apache .lucene .analysis .TokenStream ;
@@ -40,56 +39,18 @@ public enum Mode {
40
39
41
40
public static final Mode DEFAULT_MODE = Mode .SEARCH ;
42
41
43
- static class OovChars {
44
- private int length ;
45
- private char [] buffer = new char [0 ];
46
- private int reserved ;
47
- private int index ;
48
- private int baseOffset ;
49
-
50
- public void setOov (int offset , char [] src , int length ) {
51
- baseOffset = offset ;
52
- this .length = length ;
53
- if (reserved < length ) {
54
- buffer = new char [length ];
55
- reserved = length ;
56
- }
57
- System .arraycopy (src , 0 , buffer , 0 , length );
58
- index = 0 ;
59
- }
60
-
61
- public boolean hasNext () {
62
- return index < length ;
63
- }
64
-
65
- public char next () {
66
- if (index < length ) {
67
- return buffer [index ++];
68
- } else {
69
- throw new IllegalStateException ();
70
- }
71
- }
72
-
73
- public int index () {
74
- return index ;
75
- }
76
-
77
- public int offset () {
78
- return baseOffset + index ;
79
- }
80
- }
81
-
82
42
private final Mode mode ;
83
43
private final Tokenizer .SplitMode splitMode ;
44
+
84
45
private final CharTermAttribute termAtt ;
85
46
private final OffsetAttribute offsetAtt ;
86
47
private final PositionIncrementAttribute posIncAtt ;
87
48
private final PositionLengthAttribute posLengthAtt ;
88
49
private final MorphemeAttribute morphemeAtt ;
89
- private ListIterator <Morpheme > aUnitIterator ;
90
- private final OovChars oovChars = new OovChars ();
91
50
92
- private int aUnitOffset = 0 ;
51
+ private final MorphemeSubunits subunits = new MorphemeSubunits ();
52
+ private final OovChars oovChars = new OovChars ();
53
+ private List <Integer > offsetMap ;
93
54
94
55
public SudachiSplitFilter (TokenStream input , Mode mode , Tokenizer .SplitMode splitMode ) {
95
56
super (input );
@@ -105,72 +66,174 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode spli
105
66
106
67
@ Override
107
68
public final boolean incrementToken () throws IOException {
69
+ // continue to write current split
108
70
if (oovChars .hasNext ()) {
109
71
clearAttributes ();
110
72
setOOVAttribute ();
111
73
return true ;
112
74
}
113
- if (aUnitIterator != null && aUnitIterator .hasNext ()) {
75
+ if (subunits .hasNext ()) {
114
76
clearAttributes ();
115
- setAUnitAttribute (aUnitIterator .next ());
77
+ setAUnitAttribute ();
78
+ return true ;
79
+ }
80
+
81
+ // move to next morpheme
82
+ if (!input .incrementToken ()) {
83
+ return false ;
84
+ }
85
+
86
+ Morpheme m = morphemeAtt .getMorpheme ();
87
+ this .offsetMap = morphemeAtt .getOffsets ();
88
+ if (m == null ) {
116
89
return true ;
117
90
}
118
91
119
- if (input .incrementToken ()) {
92
+ // oov does not have splits
93
+ // split into characters in extended mode
94
+ if (m .isOOV ()) {
120
95
int length = 0 ;
121
- Morpheme m = morphemeAtt .getMorpheme ();
122
- if (m == null ) {
123
- return true ;
124
- }
125
- termAtt .setEmpty ().append (m .surface ());
126
- if (mode == Mode .EXTENDED && m .isOOV () && (length = Strings .codepointCount (termAtt )) > 1 ) {
127
- oovChars .setOov (offsetAtt .startOffset (), termAtt .buffer (), termAtt .length ());
96
+ if (mode == Mode .EXTENDED && (length = Strings .codepointCount (termAtt )) > 1 ) {
97
+ // OovChars requires character length
98
+ oovChars .setOov (termAtt .buffer (), termAtt .length ());
99
+ // Position length should be codepoint length
128
100
posLengthAtt .setPositionLength (length );
129
- } else if (splitMode != Tokenizer .SplitMode .C ) {
130
- List <Morpheme > subUnits = m .split (splitMode );
131
- if (subUnits .size () > 1 ) {
132
- aUnitIterator = subUnits .listIterator ();
133
- aUnitOffset = offsetAtt .startOffset ();
134
- posLengthAtt .setPositionLength (subUnits .size ());
135
- } else {
136
- posLengthAtt .setPositionLength (1 );
137
- }
138
101
}
139
102
return true ;
140
- } else {
141
- return false ;
142
103
}
104
+
105
+ // C split is the longest split
106
+ if (splitMode == Tokenizer .SplitMode .C ) {
107
+ return true ;
108
+ }
109
+
110
+ // split into A/B units
111
+ List <Morpheme > subsplits = m .split (splitMode );
112
+ if (subsplits .size () > 1 ) {
113
+ subunits .setUnits (subsplits );
114
+ posLengthAtt .setPositionLength (subunits .size ());
115
+ }
116
+
117
+ return true ;
118
+ }
119
+
120
+ private int correctOffset (int currectOff ) {
121
+ // assert (0 <= currectOff && currectOff <= this.offsetMap.size());
122
+ return this .offsetMap .get (currectOff );
143
123
}
144
124
145
- private void setAUnitAttribute (Morpheme morpheme ) {
125
+ private void setAUnitAttribute () {
146
126
posLengthAtt .setPositionLength (1 );
147
- if (aUnitIterator . previousIndex () == 0 ) {
127
+ if (subunits . index () == 0 ) {
148
128
posIncAtt .setPositionIncrement (0 );
149
129
} else {
150
130
posIncAtt .setPositionIncrement (1 );
151
131
}
152
- int length = morpheme .end () - morpheme .begin ();
153
- offsetAtt .setOffset (aUnitOffset , aUnitOffset + length );
154
- aUnitOffset += length ;
155
- morphemeAtt .setMorpheme (morpheme );
156
- termAtt .setEmpty ().append (morpheme .surface ());
132
+
133
+ MorphemeSubunits .Subunit su = subunits .next ();
134
+ termAtt .setEmpty ().append (su .morpheme .surface ());
135
+ morphemeAtt .setMorpheme (su .morpheme );
136
+ morphemeAtt .setOffsets (offsetMap .subList (su .begin , su .end + 1 ));
137
+ offsetAtt .setOffset (correctOffset (su .begin ), correctOffset (su .end ));
157
138
}
158
139
159
140
private void setOOVAttribute () {
160
- int offset = oovChars .offset ();
161
141
posLengthAtt .setPositionLength (1 );
162
142
if (oovChars .index () == 0 ) {
163
143
posIncAtt .setPositionIncrement (0 );
164
144
} else {
165
145
posIncAtt .setPositionIncrement (1 );
166
146
}
147
+
148
+ int startOffset = oovChars .offset ();
167
149
char c = oovChars .next ();
168
150
termAtt .setEmpty ().append (c );
169
151
if (Character .isSurrogate (c ) && oovChars .hasNext ()) {
170
152
termAtt .append (oovChars .next ());
171
- offsetAtt .setOffset (offset , offset + 2 );
172
- } else {
173
- offsetAtt .setOffset (offset , offset + 1 );
153
+ }
154
+ int endOffset = oovChars .offset ();
155
+ offsetAtt .setOffset (correctOffset (startOffset ), correctOffset (endOffset ));
156
+ }
157
+
158
+ static class OovChars {
159
+ private int reserved ;
160
+ private char [] buffer = new char [0 ];
161
+ private int length ;
162
+ private int index ;
163
+
164
+ public void setOov (char [] src , int length ) {
165
+ this .length = length ;
166
+ if (reserved < length ) {
167
+ buffer = new char [length ];
168
+ reserved = length ;
169
+ }
170
+ System .arraycopy (src , 0 , buffer , 0 , length );
171
+ index = 0 ;
172
+ }
173
+
174
+ public boolean hasNext () {
175
+ return index < length ;
176
+ }
177
+
178
+ public char next () {
179
+ if (index < length ) {
180
+ return buffer [index ++];
181
+ }
182
+ throw new IllegalStateException ();
183
+ }
184
+
185
+ public int index () {
186
+ return index ;
187
+ }
188
+
189
+ public int offset () {
190
+ return index ;
191
+ }
192
+ }
193
+
194
+ static class MorphemeSubunits {
195
+ static class Subunit {
196
+ final Morpheme morpheme ;
197
+ final int begin ;
198
+ final int end ;
199
+
200
+ public Subunit (Morpheme morpheme , int begin , int end ) {
201
+ this .morpheme = morpheme ;
202
+ this .begin = begin ;
203
+ this .end = end ;
204
+ }
205
+ }
206
+
207
+ private List <Morpheme > morphemes ;
208
+ private int size ;
209
+ private int index ;
210
+ private int baseOffset ;
211
+
212
+ public void setUnits (List <Morpheme > morphemes ) {
213
+ this .morphemes = morphemes ;
214
+ size = morphemes .size ();
215
+ index = 0 ;
216
+ baseOffset = morphemes .get (0 ).begin ();
217
+ }
218
+
219
+ public boolean hasNext () {
220
+ return index < size ;
221
+ }
222
+
223
+ public Subunit next () {
224
+ if (!hasNext ()) {
225
+ throw new IllegalStateException ();
226
+ }
227
+ Morpheme m = morphemes .get (index ++);
228
+ return new Subunit (m , m .begin () - baseOffset , m .end () - baseOffset );
229
+ }
230
+
231
+ public int size () {
232
+ return size ;
233
+ }
234
+
235
+ public int index () {
236
+ return index ;
174
237
}
175
238
}
176
239
}
0 commit comments