From a314519707a16e5929a840915b99fb7921817043 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 12 Nov 2025 15:36:51 +0000 Subject: [PATCH 01/20] Add use_filter_snowball and validation --- ydb/core/base/fulltext.cpp | 25 ++++++++++++++++++++++++- ydb/core/base/ya.make | 1 + ydb/public/api/protos/ydb_table.proto | 4 ++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index dabd55a25850..562264f53fbb 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -172,7 +172,28 @@ namespace { return false; } - if (settings.has_language()) { + if (settings.use_filter_snowball()) { + if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { + error = "cannot set use_filter_snowball with use_filter_ngam or use_filter_edge_ngram at the same time"; + return false; + } + + if (!settings.has_language()) { + error = "language required when use_filter_snowball is set"; + return false; + } + + bool supportedLanguage = false; + for (auto ptr = sb_stemmer_list(); *ptr != nullptr; ++ptr) { + if (settings.language() == *ptr) { + supportedLanguage = true; + } + } + if (!supportedLanguage) { + error = "language is not supported by snowball"; + return false; + } + } else if (settings.has_language()) { error = "Unsupported language setting"; return false; } @@ -367,6 +388,8 @@ bool FillSetting(Ydb::Table::FulltextIndexSettings& settings, const TString& nam analyzers->set_filter_length_min(ParseInt32(name, value, error)); } else if (nameLower == "filter_length_max") { analyzers->set_filter_length_max(ParseInt32(name, value, error)); + } else if (nameLower == "use_filter_snowball") { + analyzers->set_use_filter_snowball(ParseBool(name, value, error)); } else { error = TStringBuilder() << "Unknown index setting: " << name; return false; diff --git a/ydb/core/base/ya.make b/ydb/core/base/ya.make index 785df55aadcb..2f763f91b31f 100644 --- a/ydb/core/base/ya.make +++ b/ydb/core/base/ya.make @@ -88,6 +88,7 @@ SRCS( ) PEERDIR( + contrib/libs/snowball ydb/library/actors/core ydb/library/actors/helpers ydb/library/actors/interconnect diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index e22ce6145054..5bc3abb840f6 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -229,6 +229,10 @@ message FulltextIndexSettings { // Maximum token length to keep (inclusive) // Must be used with use_filter_length optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; + + // Wether to apply stemming for each token + // TODO + optional bool use_filter_snowball = 140; } // Represents text analyzers settings for a specific column From 98f6406c22e0ee7a175762038f445287187585b4 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 12 Nov 2025 17:01:44 +0000 Subject: [PATCH 02/20] Add actual filter in index building --- ydb/core/base/fulltext.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 562264f53fbb..18777be45782 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -1,4 +1,7 @@ #include "fulltext.h" + +#include + #include #include @@ -289,6 +292,20 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet }), tokens.end()); } + if (settings.use_filter_snowball()) { + struct sb_stemmer* stemmer = sb_stemmer_new(settings.language().c_str(), nullptr); + for (auto& token : tokens) { + const sb_symbol* stemmed = sb_stemmer_stem( + stemmer, + reinterpret_cast(token.data()), + token.size() + ); + + const size_t resultLength = sb_stemmer_length(stemmer); + token = std::string(reinterpret_cast(stemmed), resultLength); + } + } + if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { TVector ngrams; for (const auto& token : tokens) { From eb16f6d809a3d2b4b29e437fd9bdbfd2d0291523 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 12 Nov 2025 17:02:11 +0000 Subject: [PATCH 03/20] Add kqp test for fulltext index with snowball filter --- .../ut/indexes/kqp_indexes_fulltext_ut.cpp | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp index 80d08eb29b3e..03f149107948 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp @@ -91,6 +91,23 @@ void AddIndexCovered(NQuery::TQueryClient& db) { UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); } +void AddIndexSnowball(NQuery::TQueryClient& db) { + TString query = R"sql( + ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx + GLOBAL USING fulltext + ON (Text) + WITH ( + layout=flat, + tokenizer=standard, + use_filter_lowercase=true, + use_filter_snowball=true, + language=english + ) + )sql"; + auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); +} + TResultSet ReadIndex(NQuery::TQueryClient& db) { TString query = R"sql( SELECT * FROM `/Root/Texts/fulltext_idx/indexImplTable`; @@ -224,6 +241,31 @@ Y_UNIT_TEST(AddIndexEdgeNGram) { ])", NYdb::FormatResultSetYson(index)); } +Y_UNIT_TEST(AddIndexSnowball) { + auto kikimr = Kikimr(); + auto db = kikimr.GetQueryClient(); + + CreateTexts(db); + UpsertTexts(db); + AddIndexSnowball(db); + const auto index = ReadIndex(db); + CompareYson(R"([ + [[[100u];"anim"]; + [[100u];"cat"]; + [[200u];"cat"]; + [[300u];"cat"]; + [[100u];"chase"]; + [[200u];"chase"]; + [[200u];"dog"]; + [[400u];"dog"]; + [[400u];"fox"]; + [[300u];"love"]; + [[400u];"love"]; + [[100u];"small"]; + [[200u];"small"] + ])", NYdb::FormatResultSetYson(index)); +} + Y_UNIT_TEST(InsertRow) { auto kikimr = Kikimr(); auto db = kikimr.GetQueryClient(); From 49ed125ae3708192137ee730ea0286dc84d5a3d6 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Thu, 13 Nov 2025 11:25:47 +0000 Subject: [PATCH 04/20] Add analyzers unit test for snowball filter --- ydb/core/base/ut/fulltext_ut.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index a9e9ab058854..fbec114e1ec9 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -270,6 +270,22 @@ Y_UNIT_TEST_SUITE(NFulltext) { analyzers.set_filter_ngram_max_length(3); UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "это", "те", "тек"})); } + + Y_UNIT_TEST(AnalyzeFilterSnowball) { + Ydb::Table::FulltextIndexSettings::Analyzers analyzers; + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + const TString russianText = "машины ездят по дорогам исправно"; + + UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машины", "ездят", "по", "дорогам", "исправно"})); + + analyzers.set_use_filter_snowball(true); + analyzers.set_language("russian"); + UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машин", "езд", "по", "дорог", "исправн"})); + + const TString englishText = "cars are driving properly on the roads"; + analyzers.set_language("english"); + UNIT_ASSERT_VALUES_EQUAL(Analyze(englishText, analyzers), (TVector{"car", "are", "drive", "proper", "on", "the", "road"})); + } } } From 0aa71e89dd2e2708772ba693d8699ba967a401ac Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Thu, 13 Nov 2025 12:05:55 +0000 Subject: [PATCH 05/20] Free stemmer resources after using --- ydb/core/base/fulltext.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 18777be45782..418da5bf7c16 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -304,6 +304,7 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet const size_t resultLength = sb_stemmer_length(stemmer); token = std::string(reinterpret_cast(stemmed), resultLength); } + sb_stemmer_delete(stemmer); } if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { From 4010dfd77ad2d184bb99afb7b08579fd681c3948 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Thu, 13 Nov 2025 13:22:19 +0000 Subject: [PATCH 06/20] Create one stemmer for Scan not for Row --- ydb/core/base/fulltext.cpp | 7 ++---- ydb/core/base/fulltext.h | 11 ++++++++- ydb/core/base/ut/fulltext_ut.cpp | 23 +++++++++++-------- .../tx/datashard/build_index/fulltext.cpp | 6 ++++- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 418da5bf7c16..e9e2e365ab87 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -1,7 +1,5 @@ #include "fulltext.h" -#include - #include #include @@ -270,7 +268,7 @@ namespace { } } -TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings) { +TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings, struct sb_stemmer* stemmer) { TVector tokens = Tokenize(text, settings.tokenizer()); if (settings.use_filter_lowercase()) { @@ -293,7 +291,7 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet } if (settings.use_filter_snowball()) { - struct sb_stemmer* stemmer = sb_stemmer_new(settings.language().c_str(), nullptr); + Y_ASSERT(stemmer); for (auto& token : tokens) { const sb_symbol* stemmed = sb_stemmer_stem( stemmer, @@ -304,7 +302,6 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet const size_t resultLength = sb_stemmer_length(stemmer); token = std::string(reinterpret_cast(stemmed), resultLength); } - sb_stemmer_delete(stemmer); } if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { diff --git a/ydb/core/base/fulltext.h b/ydb/core/base/fulltext.h index b61b97fab7d0..024331c03088 100644 --- a/ydb/core/base/fulltext.h +++ b/ydb/core/base/fulltext.h @@ -4,9 +4,18 @@ #include +#include + namespace NKikimr::NFulltext { -TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings); +struct TStemmerDeleter { + void operator()(struct sb_stemmer* stemmer) { + sb_stemmer_delete(stemmer); + } +}; +using TStemmerPtr = std::unique_ptr; + +TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings, struct sb_stemmer* stemmer = nullptr); bool ValidateColumnsMatches(const NProtoBuf::RepeatedPtrField& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error); bool ValidateColumnsMatches(const TVector& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error); diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index fbec114e1ec9..3a12fe12d725 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -274,17 +274,22 @@ Y_UNIT_TEST_SUITE(NFulltext) { Y_UNIT_TEST(AnalyzeFilterSnowball) { Ydb::Table::FulltextIndexSettings::Analyzers analyzers; analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); - const TString russianText = "машины ездят по дорогам исправно"; - UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машины", "ездят", "по", "дорогам", "исправно"})); - - analyzers.set_use_filter_snowball(true); - analyzers.set_language("russian"); - UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машин", "езд", "по", "дорог", "исправн"})); + { + const TString text = "машины ездят по дорогам исправно"; + analyzers.set_use_filter_snowball(true); + analyzers.set_language("russian"); + TStemmerPtr stemmer(sb_stemmer_new(analyzers.language().c_str(), nullptr)); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers, stemmer.get()), (TVector{"машин", "езд", "по", "дорог", "исправн"})); + } - const TString englishText = "cars are driving properly on the roads"; - analyzers.set_language("english"); - UNIT_ASSERT_VALUES_EQUAL(Analyze(englishText, analyzers), (TVector{"car", "are", "drive", "proper", "on", "the", "road"})); + { + const TString text = "cars are driving properly on the roads"; + analyzers.set_use_filter_snowball(true); + analyzers.set_language("english"); + TStemmerPtr stemmer(sb_stemmer_new(analyzers.language().c_str(), nullptr)); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers, stemmer.get()), (TVector{"car", "are", "drive", "proper", "on", "the", "road"})); + } } } diff --git a/ydb/core/tx/datashard/build_index/fulltext.cpp b/ydb/core/tx/datashard/build_index/fulltext.cpp index fdd9f0f11162..02483c814c2a 100644 --- a/ydb/core/tx/datashard/build_index/fulltext.cpp +++ b/ydb/core/tx/datashard/build_index/fulltext.cpp @@ -36,6 +36,7 @@ class TBuildFulltextIndexScan: public TActor, public IA TTags ScanTags; TString TextColumn; Ydb::Table::FulltextIndexSettings::Analyzers TextAnalyzers; + TStemmerPtr Stemmer; TBatchRowsUploader Uploader; TBufferData* UploadBuf = nullptr; @@ -65,6 +66,9 @@ class TBuildFulltextIndexScan: public TActor, public IA Y_ENSURE(Request.settings().columns().size() == 1); TextColumn = Request.settings().columns().at(0).column(); TextAnalyzers = Request.settings().columns().at(0).analyzers(); + if (TextAnalyzers.use_filter_snowball()) { + Stemmer = TStemmerPtr(sb_stemmer_new(TextAnalyzers.language().c_str(), nullptr)); + } auto tags = GetAllTags(table); auto types = GetAllTypes(table); @@ -142,7 +146,7 @@ class TBuildFulltextIndexScan: public TActor, public IA TVector uploadValue(::Reserve(Request.GetDataColumns().size())); TString text((*row).at(0).AsBuf()); - auto tokens = Analyze(text, TextAnalyzers); + auto tokens = Analyze(text, TextAnalyzers, Stemmer.get()); for (const auto& token : tokens) { uploadKey.clear(); uploadKey.push_back(TCell(token)); From 5b3b6e7e48607fe7f53d929df9da690a147672cd Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Thu, 13 Nov 2025 13:25:53 +0000 Subject: [PATCH 07/20] Unnecessary change --- ydb/core/base/fulltext.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index e9e2e365ab87..17812828f798 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -1,5 +1,4 @@ #include "fulltext.h" - #include #include From 3555e1fb2b604f17cec205472078118dc4d3e6e9 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Thu, 13 Nov 2025 13:32:39 +0000 Subject: [PATCH 08/20] Add description for `use_filter_snowball` option --- ydb/public/api/protos/ydb_table.proto | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index 5bc3abb840f6..40a90ac15998 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -174,7 +174,7 @@ message FulltextIndexSettings { // See Tokenizer enum optional Tokenizer tokenizer = 1; - // Language used for language-sensitive operations like stopword filtering + // Language used for language-sensitive operations like stopword filtering and stemming // Example: language = "english" // By default is not specified and no language-specific logic is applied optional string language = 2; @@ -230,8 +230,11 @@ message FulltextIndexSettings { // Must be used with use_filter_length optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; - // Wether to apply stemming for each token - // TODO + // Wether to apply snowball stemming to each token + // Must be used with language option + // Example: language = "english" + // Tokens: ["cars", "beautifully", "conspirated"] + // Output: ["car", "beauti", "conspir"] optional bool use_filter_snowball = 140; } From a61808cc29a92d9bdbd5b54e93b0f318e87dca90 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Thu, 13 Nov 2025 15:00:01 +0000 Subject: [PATCH 09/20] Revert "Create one stemmer for Scan not for Row" This reverts commit 4010dfd77ad2d184bb99afb7b08579fd681c3948. --- ydb/core/base/fulltext.cpp | 8 +++++-- ydb/core/base/fulltext.h | 11 +-------- ydb/core/base/ut/fulltext_ut.cpp | 23 ++++++++----------- .../tx/datashard/build_index/fulltext.cpp | 6 +---- 4 files changed, 17 insertions(+), 31 deletions(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 17812828f798..418da5bf7c16 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -1,4 +1,7 @@ #include "fulltext.h" + +#include + #include #include @@ -267,7 +270,7 @@ namespace { } } -TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings, struct sb_stemmer* stemmer) { +TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings) { TVector tokens = Tokenize(text, settings.tokenizer()); if (settings.use_filter_lowercase()) { @@ -290,7 +293,7 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet } if (settings.use_filter_snowball()) { - Y_ASSERT(stemmer); + struct sb_stemmer* stemmer = sb_stemmer_new(settings.language().c_str(), nullptr); for (auto& token : tokens) { const sb_symbol* stemmed = sb_stemmer_stem( stemmer, @@ -301,6 +304,7 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet const size_t resultLength = sb_stemmer_length(stemmer); token = std::string(reinterpret_cast(stemmed), resultLength); } + sb_stemmer_delete(stemmer); } if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { diff --git a/ydb/core/base/fulltext.h b/ydb/core/base/fulltext.h index 024331c03088..b61b97fab7d0 100644 --- a/ydb/core/base/fulltext.h +++ b/ydb/core/base/fulltext.h @@ -4,18 +4,9 @@ #include -#include - namespace NKikimr::NFulltext { -struct TStemmerDeleter { - void operator()(struct sb_stemmer* stemmer) { - sb_stemmer_delete(stemmer); - } -}; -using TStemmerPtr = std::unique_ptr; - -TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings, struct sb_stemmer* stemmer = nullptr); +TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings); bool ValidateColumnsMatches(const NProtoBuf::RepeatedPtrField& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error); bool ValidateColumnsMatches(const TVector& columns, const Ydb::Table::FulltextIndexSettings& settings, TString& error); diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index 3a12fe12d725..fbec114e1ec9 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -274,22 +274,17 @@ Y_UNIT_TEST_SUITE(NFulltext) { Y_UNIT_TEST(AnalyzeFilterSnowball) { Ydb::Table::FulltextIndexSettings::Analyzers analyzers; analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + const TString russianText = "машины ездят по дорогам исправно"; - { - const TString text = "машины ездят по дорогам исправно"; - analyzers.set_use_filter_snowball(true); - analyzers.set_language("russian"); - TStemmerPtr stemmer(sb_stemmer_new(analyzers.language().c_str(), nullptr)); - UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers, stemmer.get()), (TVector{"машин", "езд", "по", "дорог", "исправн"})); - } + UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машины", "ездят", "по", "дорогам", "исправно"})); - { - const TString text = "cars are driving properly on the roads"; - analyzers.set_use_filter_snowball(true); - analyzers.set_language("english"); - TStemmerPtr stemmer(sb_stemmer_new(analyzers.language().c_str(), nullptr)); - UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers, stemmer.get()), (TVector{"car", "are", "drive", "proper", "on", "the", "road"})); - } + analyzers.set_use_filter_snowball(true); + analyzers.set_language("russian"); + UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машин", "езд", "по", "дорог", "исправн"})); + + const TString englishText = "cars are driving properly on the roads"; + analyzers.set_language("english"); + UNIT_ASSERT_VALUES_EQUAL(Analyze(englishText, analyzers), (TVector{"car", "are", "drive", "proper", "on", "the", "road"})); } } diff --git a/ydb/core/tx/datashard/build_index/fulltext.cpp b/ydb/core/tx/datashard/build_index/fulltext.cpp index 02483c814c2a..fdd9f0f11162 100644 --- a/ydb/core/tx/datashard/build_index/fulltext.cpp +++ b/ydb/core/tx/datashard/build_index/fulltext.cpp @@ -36,7 +36,6 @@ class TBuildFulltextIndexScan: public TActor, public IA TTags ScanTags; TString TextColumn; Ydb::Table::FulltextIndexSettings::Analyzers TextAnalyzers; - TStemmerPtr Stemmer; TBatchRowsUploader Uploader; TBufferData* UploadBuf = nullptr; @@ -66,9 +65,6 @@ class TBuildFulltextIndexScan: public TActor, public IA Y_ENSURE(Request.settings().columns().size() == 1); TextColumn = Request.settings().columns().at(0).column(); TextAnalyzers = Request.settings().columns().at(0).analyzers(); - if (TextAnalyzers.use_filter_snowball()) { - Stemmer = TStemmerPtr(sb_stemmer_new(TextAnalyzers.language().c_str(), nullptr)); - } auto tags = GetAllTags(table); auto types = GetAllTypes(table); @@ -146,7 +142,7 @@ class TBuildFulltextIndexScan: public TActor, public IA TVector uploadValue(::Reserve(Request.GetDataColumns().size())); TString text((*row).at(0).AsBuf()); - auto tokens = Analyze(text, TextAnalyzers, Stemmer.get()); + auto tokens = Analyze(text, TextAnalyzers); for (const auto& token : tokens) { uploadKey.clear(); uploadKey.push_back(TCell(token)); From 8b7e25c54806e63d7f7afeece81cd722fa7f501d Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Thu, 13 Nov 2025 15:16:52 +0000 Subject: [PATCH 10/20] Fix kqp test --- ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp index 03f149107948..6e51fd6a9250 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp @@ -249,8 +249,9 @@ Y_UNIT_TEST(AddIndexSnowball) { UpsertTexts(db); AddIndexSnowball(db); const auto index = ReadIndex(db); + Cerr << index.RowsCount() << Endl; CompareYson(R"([ - [[[100u];"anim"]; + [[100u];"anim"]; [[100u];"cat"]; [[200u];"cat"]; [[300u];"cat"]; From 7118e8570c139b483528d4260fff1442d21d0eac Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 10:14:41 +0000 Subject: [PATCH 11/20] Check for nullptr from sb_stemmer_new --- ydb/core/base/fulltext.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 418da5bf7c16..ad042cbd192d 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -294,6 +294,9 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet if (settings.use_filter_snowball()) { struct sb_stemmer* stemmer = sb_stemmer_new(settings.language().c_str(), nullptr); + if (stemmer == nullptr) { + ythrow yexception() << "sb_stemmer_new returned nullptr"; + } for (auto& token : tokens) { const sb_symbol* stemmed = sb_stemmer_stem( stemmer, From d3c713129a544658d9816174c18143c1e58f4d0c Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 10:23:24 +0000 Subject: [PATCH 12/20] Fix typos --- ydb/core/base/fulltext.cpp | 2 +- ydb/public/api/protos/ydb_table.proto | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index ad042cbd192d..5aec361c549d 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -177,7 +177,7 @@ namespace { if (settings.use_filter_snowball()) { if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { - error = "cannot set use_filter_snowball with use_filter_ngam or use_filter_edge_ngram at the same time"; + error = "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time"; return false; } diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index 40a90ac15998..8da825026d5e 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -230,10 +230,10 @@ message FulltextIndexSettings { // Must be used with use_filter_length optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; - // Wether to apply snowball stemming to each token + // Whether to apply snowball stemming to each token // Must be used with language option // Example: language = "english" - // Tokens: ["cars", "beautifully", "conspirated"] + // Tokens: ["cars", "beautifully", "conspired"] // Output: ["car", "beauti", "conspir"] optional bool use_filter_snowball = 140; } From 9fca29f99d3cbc1630df840c191b19e098f128b9 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 10:37:19 +0000 Subject: [PATCH 13/20] Add unit test for invalid languages --- ydb/core/base/ut/fulltext_ut.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index fbec114e1ec9..fc1e0061f369 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -285,6 +285,12 @@ Y_UNIT_TEST_SUITE(NFulltext) { const TString englishText = "cars are driving properly on the roads"; analyzers.set_language("english"); UNIT_ASSERT_VALUES_EQUAL(Analyze(englishText, analyzers), (TVector{"car", "are", "drive", "proper", "on", "the", "road"})); + + analyzers.set_language("klingon"); + UNIT_ASSERT_EXCEPTION(Analyze(englishText, analyzers), yexception); + + analyzers.clear_language(); + UNIT_ASSERT_EXCEPTION(Analyze(englishText, analyzers), yexception); } } From 11288153e98bdbdd32812d08678bfb4206fcff0b Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 10:50:11 +0000 Subject: [PATCH 14/20] Add failing test with wrong language --- .../ut/indexes/kqp_indexes_fulltext_ut.cpp | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp index 6e51fd6a9250..5474689c56ff 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp @@ -91,8 +91,8 @@ void AddIndexCovered(NQuery::TQueryClient& db) { UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); } -void AddIndexSnowball(NQuery::TQueryClient& db) { - TString query = R"sql( +void AddIndexSnowball(NQuery::TQueryClient& db, const TString& language) { + TString query = Sprintf(R"sql( ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx GLOBAL USING fulltext ON (Text) @@ -101,9 +101,9 @@ void AddIndexSnowball(NQuery::TQueryClient& db) { tokenizer=standard, use_filter_lowercase=true, use_filter_snowball=true, - language=english + language=%s ) - )sql"; + )sql", language.c_str()); auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync(); UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); } @@ -247,7 +247,7 @@ Y_UNIT_TEST(AddIndexSnowball) { CreateTexts(db); UpsertTexts(db); - AddIndexSnowball(db); + AddIndexSnowball(db, "english"); const auto index = ReadIndex(db); Cerr << index.RowsCount() << Endl; CompareYson(R"([ @@ -267,6 +267,16 @@ Y_UNIT_TEST(AddIndexSnowball) { ])", NYdb::FormatResultSetYson(index)); } +Y_UNIT_TEST(AddIndexSnowballWithWrongLanguage) { + auto kikimr = Kikimr(); + auto db = kikimr.GetQueryClient(); + + CreateTexts(db); + UpsertTexts(db); + + UNIT_ASSERT_TEST_FAILS(AddIndexSnowball(db, "klingon")); +} + Y_UNIT_TEST(InsertRow) { auto kikimr = Kikimr(); auto db = kikimr.GetQueryClient(); From f3d8f28e4d6d31388d50888a083721cce24741cd Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 11:05:39 +0000 Subject: [PATCH 15/20] Add tests for validation of snowball settings --- ydb/core/base/ut/fulltext_ut.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index fc1e0061f369..4e879d8e0803 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -80,6 +80,20 @@ Y_UNIT_TEST_SUITE(NFulltext) { UNIT_ASSERT_C(!ValidateSettings(settings, error), error); UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_max: 3000 should be between 1 and 1000"); + columnAnalyzers->set_use_filter_snowball(true); + columnAnalyzers->clear_language(); + UNIT_ASSERT_C(!ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, "language required when use_filter_snowball is set"); + + columnAnalyzers->set_language("klingon"); + UNIT_ASSERT_C(!ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, "language is not supported by snowball"); + + columnAnalyzers->set_language("english"); + columnAnalyzers->set_use_filter_ngram(true); + UNIT_ASSERT_C(!ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time"); + columnSettings = settings.add_columns(); columnSettings->set_column("text2"); UNIT_ASSERT_C(!ValidateSettings(settings, error), error); From 728e542f9b5ca5a8ecb455e7ec46d3eb3944b7b5 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 11:06:03 +0000 Subject: [PATCH 16/20] Minor optimizations --- ydb/core/base/fulltext.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 5aec361c549d..4084a0b58ae9 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -190,6 +190,7 @@ namespace { for (auto ptr = sb_stemmer_list(); *ptr != nullptr; ++ptr) { if (settings.language() == *ptr) { supportedLanguage = true; + break; } } if (!supportedLanguage) { @@ -294,7 +295,7 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet if (settings.use_filter_snowball()) { struct sb_stemmer* stemmer = sb_stemmer_new(settings.language().c_str(), nullptr); - if (stemmer == nullptr) { + if (Y_UNLIKELY(stemmer == nullptr)) { ythrow yexception() << "sb_stemmer_new returned nullptr"; } for (auto& token : tokens) { From 3773711002544d00c417d6afa025a92208eb2ff0 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 13:30:54 +0000 Subject: [PATCH 17/20] Handle case when sb_stemmer_stem returned nullptr --- ydb/core/base/fulltext.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 4084a0b58ae9..4a11f1c56e5c 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -304,6 +304,9 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet reinterpret_cast(token.data()), token.size() ); + if (Y_UNLIKELY(stemmed == nullptr)) { + ythrow yexception() << "unable to allocate memory for sb_stemmer_stem result"; + } const size_t resultLength = sb_stemmer_length(stemmer); token = std::string(reinterpret_cast(stemmed), resultLength); From a25d177be795856bc8f049d40fcd1fd79034c7e8 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 14:03:46 +0000 Subject: [PATCH 18/20] Use RAII wrapper for deleting stemmer --- ydb/core/base/fulltext.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 4a11f1c56e5c..2fddf4b98416 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -298,6 +298,8 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet if (Y_UNLIKELY(stemmer == nullptr)) { ythrow yexception() << "sb_stemmer_new returned nullptr"; } + Y_DEFER { sb_stemmer_delete(stemmer); }; + for (auto& token : tokens) { const sb_symbol* stemmed = sb_stemmer_stem( stemmer, @@ -311,7 +313,6 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet const size_t resultLength = sb_stemmer_length(stemmer); token = std::string(reinterpret_cast(stemmed), resultLength); } - sb_stemmer_delete(stemmer); } if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { From 374aa65b610c8f349a61bed878dc063f00bb75c5 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 14:04:04 +0000 Subject: [PATCH 19/20] Remove debug output --- ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp index 5474689c56ff..510072c017f2 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp @@ -249,7 +249,6 @@ Y_UNIT_TEST(AddIndexSnowball) { UpsertTexts(db); AddIndexSnowball(db, "english"); const auto index = ReadIndex(db); - Cerr << index.RowsCount() << Endl; CompareYson(R"([ [[100u];"anim"]; [[100u];"cat"]; From 8311de7c33c20dca1e6aad3824776601aad2de59 Mon Sep 17 00:00:00 2001 From: Stanislav Tebloev Date: Wed, 19 Nov 2025 17:23:56 +0300 Subject: [PATCH 20/20] More descriptive error for language option Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- ydb/core/base/fulltext.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 2fddf4b98416..f69d411dd6ce 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -198,7 +198,9 @@ namespace { return false; } } else if (settings.has_language()) { - error = "Unsupported language setting"; + // Currently, language is only used for stemming (use_filter_snowball). + // In the future, it may be used for other language-sensitive operations (e.g., stopword filtering). + error = "language setting is only supported with use_filter_snowball at present; other uses may be supported in the future"; return false; }