@@ -135,6 +135,38 @@ namespace {
135135 return length;
136136 }
137137
138+ void BuildNgrams (const TString& token, size_t lengthMin, size_t lengthMax, bool edge, TVector<TString>& ngrams) {
139+ TVector<wchar32> characters;
140+
141+ const unsigned char * ptr = (const unsigned char *)token.data ();
142+ const unsigned char * end = ptr + token.size ();
143+ wchar32 symbol;
144+ size_t symbolBytes;
145+ while (ptr < end) {
146+ if (SafeReadUTF8Char (symbol, symbolBytes, ptr, end) != RECODE_OK) {
147+ Y_ASSERT (false ); // should already be validated during tokenization
148+ return ;
149+ }
150+ characters.push_back (symbol);
151+ ptr += symbolBytes;
152+ }
153+
154+ TVector<unsigned char > ngram (token.size ());
155+ for (size_t len : xrange (lengthMin, Min (lengthMax, characters.size ()) + 1 )) {
156+ for (size_t start : xrange<size_t >(0 , characters.size () - len + 1 )) {
157+ unsigned char * ptr = (unsigned char *)ngram.data ();
158+ for (size_t i : xrange (len)) {
159+ WriteUTF8Char (characters[start + i], symbolBytes, ptr);
160+ ptr += symbolBytes;
161+ }
162+ ngrams.emplace_back ((const char *)ngram.data (), ptr - ngram.data ());
163+ if (edge) {
164+ break ; // only prefixes
165+ }
166+ }
167+ }
168+ }
169+
138170 bool ValidateSettings (const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
139171 if (!settings.has_tokenizer () || settings.tokenizer () == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) {
140172 error = " tokenizer should be set" ;
@@ -151,21 +183,38 @@ namespace {
151183 return false ;
152184 }
153185
154- if (settings.use_filter_ngram ()) {
155- error = " Unsupported use_filter_ngram setting" ;
156- return false ;
157- }
158- if (settings.use_filter_edge_ngram ()) {
159- error = " Unsupported use_filter_edge_ngram setting" ;
160- return false ;
161- }
162- if (settings.has_filter_ngram_min_length ()) {
163- error = " Unsupported filter_ngram_min_length setting" ;
164- return false ;
165- }
166- if (settings.has_filter_ngram_max_length ()) {
167- error = " Unsupported filter_ngram_max_length setting" ;
168- return false ;
186+ if (settings.use_filter_ngram () || settings.use_filter_edge_ngram ()) {
187+ if (settings.use_filter_ngram () && settings.use_filter_edge_ngram ()) {
188+ error = " only one of use_filter_ngram or use_filter_edge_ngram should be set, not both" ;
189+ return false ;
190+ }
191+ if (!settings.has_filter_ngram_min_length ()) {
192+ error = " filter_ngram_min_length should be set with use_filter_ngram/use_filter_edge_ngram" ;
193+ return false ;
194+ }
195+ if (!settings.has_filter_ngram_max_length ()) {
196+ error = " filter_ngram_max_length should be set with use_filter_ngram/use_filter_edge_ngram" ;
197+ return false ;
198+ }
199+ if (!ValidateSettingInRange (" filter_ngram_min_length" , settings.filter_ngram_min_length (), 1 , 20 , error)) {
200+ return false ;
201+ }
202+ if (!ValidateSettingInRange (" filter_ngram_max_length" , settings.filter_ngram_max_length (), 1 , 20 , error)) {
203+ return false ;
204+ }
205+ if (settings.filter_ngram_min_length () > settings.filter_ngram_max_length ()) {
206+ error = " Invalid filter_ngram_min_length: should be less than or equal to filter_ngram_max_length" ;
207+ return false ;
208+ }
209+ } else {
210+ if (settings.has_filter_ngram_min_length ()) {
211+ error = " use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_min_length" ;
212+ return false ;
213+ }
214+ if (settings.has_filter_ngram_max_length ()) {
215+ error = " use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_max_length" ;
216+ return false ;
217+ }
169218 }
170219
171220 if (settings.use_filter_length ()) {
@@ -180,7 +229,7 @@ namespace {
180229 return false ;
181230 }
182231 if (settings.has_filter_length_min () && settings.has_filter_length_max () && settings.filter_length_min () > settings.filter_length_max ()) {
183- error = " Invalid filter_length_min: should be less or equal than filter_length_max" ;
232+ error = " Invalid filter_length_min: should be less than or equal to filter_length_max" ;
184233 return false ;
185234 }
186235 } else {
@@ -220,6 +269,14 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
220269 }), tokens.end ());
221270 }
222271
272+ if (settings.use_filter_ngram () || settings.use_filter_edge_ngram ()) {
273+ TVector<TString> ngrams;
274+ for (const auto & token : tokens) {
275+ BuildNgrams (token, settings.filter_ngram_min_length (), settings.filter_ngram_max_length (), settings.use_filter_edge_ngram (), ngrams);
276+ }
277+ tokens.swap (ngrams);
278+ }
279+
223280 return tokens;
224281}
225282
0 commit comments