Skip to content

Commit 2386aba

Browse files
authored
Merge 4f6ecbd into a04e6ba
2 parents a04e6ba + 4f6ecbd commit 2386aba

File tree

2 files changed

+101
-17
lines changed

2 files changed

+101
-17
lines changed

ydb/core/base/fulltext.cpp

Lines changed: 73 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,38 @@ namespace {
135135
return length;
136136
}
137137

138+
void BuildNgrams(const TString& token, size_t lengthMin, size_t lengthMax, bool edge, TVector<TString>& ngrams) {
139+
TVector<wchar32> characters;
140+
141+
const unsigned char* ptr = (const unsigned char*)token.data();
142+
const unsigned char* end = ptr + token.size();
143+
wchar32 symbol;
144+
size_t symbolBytes;
145+
while (ptr < end) {
146+
if (SafeReadUTF8Char(symbol, symbolBytes, ptr, end) != RECODE_OK) {
147+
Y_ASSERT(false); // should already be validated during tokenization
148+
return;
149+
}
150+
characters.push_back(symbol);
151+
ptr += symbolBytes;
152+
}
153+
154+
TVector<unsigned char> ngram(token.size());
155+
for (size_t len : xrange(lengthMin, Min(lengthMax, characters.size()) + 1)) {
156+
for (size_t start : xrange<size_t>(0, characters.size() - len + 1)) {
157+
unsigned char* ptr = (unsigned char*)ngram.data();
158+
for (size_t i : xrange(len)) {
159+
WriteUTF8Char(characters[start + i], symbolBytes, ptr);
160+
ptr += symbolBytes;
161+
}
162+
ngrams.emplace_back((const char*)ngram.data(), ptr - ngram.data());
163+
if (edge) {
164+
break; // only prefixes
165+
}
166+
}
167+
}
168+
}
169+
138170
bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) {
139171
if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) {
140172
error = "tokenizer should be set";
@@ -151,21 +183,38 @@ namespace {
151183
return false;
152184
}
153185

154-
if (settings.use_filter_ngram()) {
155-
error = "Unsupported use_filter_ngram setting";
156-
return false;
157-
}
158-
if (settings.use_filter_edge_ngram()) {
159-
error = "Unsupported use_filter_edge_ngram setting";
160-
return false;
161-
}
162-
if (settings.has_filter_ngram_min_length()) {
163-
error = "Unsupported filter_ngram_min_length setting";
164-
return false;
165-
}
166-
if (settings.has_filter_ngram_max_length()) {
167-
error = "Unsupported filter_ngram_max_length setting";
168-
return false;
186+
if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
187+
if (settings.use_filter_ngram() && settings.use_filter_edge_ngram()) {
188+
error = "only one of use_filter_ngram or use_filter_edge_ngram should be set, not both";
189+
return false;
190+
}
191+
if (!settings.has_filter_ngram_min_length()) {
192+
error = "filter_ngram_min_length should be set with use_filter_ngram/use_filter_edge_ngram";
193+
return false;
194+
}
195+
if (!settings.has_filter_ngram_max_length()) {
196+
error = "filter_ngram_max_length should be set with use_filter_ngram/use_filter_edge_ngram";
197+
return false;
198+
}
199+
if (!ValidateSettingInRange("filter_ngram_min_length", settings.filter_ngram_min_length(), 1, 20, error)) {
200+
return false;
201+
}
202+
if (!ValidateSettingInRange("filter_ngram_max_length", settings.filter_ngram_max_length(), 1, 20, error)) {
203+
return false;
204+
}
205+
if (settings.filter_ngram_min_length() > settings.filter_ngram_max_length()) {
206+
error = "Invalid filter_ngram_min_length: should be less than or equal to filter_ngram_max_length";
207+
return false;
208+
}
209+
} else {
210+
if (settings.has_filter_ngram_min_length()) {
211+
error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_min_length";
212+
return false;
213+
}
214+
if (settings.has_filter_ngram_max_length()) {
215+
error = "use_filter_ngram or use_filter_edge_ngram should be set with filter_ngram_max_length";
216+
return false;
217+
}
169218
}
170219

171220
if (settings.use_filter_length()) {
@@ -180,7 +229,7 @@ namespace {
180229
return false;
181230
}
182231
if (settings.has_filter_length_min() && settings.has_filter_length_max() && settings.filter_length_min() > settings.filter_length_max()) {
183-
error = "Invalid filter_length_min: should be less or equal than filter_length_max";
232+
error = "Invalid filter_length_min: should be less than or equal to filter_length_max";
184233
return false;
185234
}
186235
} else {
@@ -220,6 +269,14 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
220269
}), tokens.end());
221270
}
222271

272+
if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
273+
TVector<TString> ngrams;
274+
for (const auto& token : tokens) {
275+
BuildNgrams(token, settings.filter_ngram_min_length(), settings.filter_ngram_max_length(), settings.use_filter_edge_ngram(), ngrams);
276+
}
277+
tokens.swap(ngrams);
278+
}
279+
223280
return tokens;
224281
}
225282

ydb/core/base/ut/fulltext_ut.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ Y_UNIT_TEST_SUITE(NFulltext) {
6969

7070
columnAnalyzers->set_filter_length_max(3);
7171
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
72-
UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less or equal than filter_length_max");
72+
UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_min: should be less than or equal to filter_length_max");
7373

7474
columnAnalyzers->set_filter_length_min(-5);
7575
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
@@ -243,6 +243,33 @@ Y_UNIT_TEST_SUITE(NFulltext) {
243243
analyzers.clear_filter_length_min();
244244
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"кот", "ест", "день"}));
245245
}
246+
247+
Y_UNIT_TEST(AnalyzeFilterNgram) {
248+
Ydb::Table::FulltextIndexSettings::Analyzers analyzers;
249+
analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);
250+
TString text = "это текст";
251+
252+
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"это", "текст"}));
253+
254+
analyzers.set_use_filter_ngram(true);
255+
analyzers.set_filter_ngram_min_length(2);
256+
analyzers.set_filter_ngram_max_length(3);
257+
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "то", "это", "те", "ек", "кс", "ст", "тек", "екс", "кст"}));
258+
259+
analyzers.set_filter_ngram_min_length(4);
260+
analyzers.set_filter_ngram_max_length(10);
261+
UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector<TString>{"слов", "лово", "слово"}));
262+
263+
analyzers.set_filter_ngram_min_length(10);
264+
analyzers.set_filter_ngram_max_length(10);
265+
UNIT_ASSERT_VALUES_EQUAL(Analyze("слово", analyzers), (TVector<TString>{}));
266+
267+
analyzers.set_use_filter_ngram(false);
268+
analyzers.set_use_filter_edge_ngram(true);
269+
analyzers.set_filter_ngram_min_length(2);
270+
analyzers.set_filter_ngram_max_length(3);
271+
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "те", "тек"}));
272+
}
246273
}
247274

248275
}

0 commit comments

Comments
 (0)