diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index dabd55a25850..f69d411dd6ce 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -1,4 +1,7 @@ #include "fulltext.h" + +#include + #include #include @@ -172,8 +175,32 @@ namespace { return false; } - if (settings.has_language()) { - error = "Unsupported language setting"; + if (settings.use_filter_snowball()) { + if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { + error = "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time"; + return false; + } + + if (!settings.has_language()) { + error = "language required when use_filter_snowball is set"; + return false; + } + + bool supportedLanguage = false; + for (auto ptr = sb_stemmer_list(); *ptr != nullptr; ++ptr) { + if (settings.language() == *ptr) { + supportedLanguage = true; + break; + } + } + if (!supportedLanguage) { + error = "language is not supported by snowball"; + return false; + } + } else if (settings.has_language()) { + // Currently, language is only used for stemming (use_filter_snowball). + // In the future, it may be used for other language-sensitive operations (e.g., stopword filtering). + error = "language setting is only supported with use_filter_snowball at present; other uses may be supported in the future"; return false; } @@ -268,6 +295,28 @@ TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSet }), tokens.end()); } + if (settings.use_filter_snowball()) { + struct sb_stemmer* stemmer = sb_stemmer_new(settings.language().c_str(), nullptr); + if (Y_UNLIKELY(stemmer == nullptr)) { + ythrow yexception() << "sb_stemmer_new returned nullptr"; + } + Y_DEFER { sb_stemmer_delete(stemmer); }; + + for (auto& token : tokens) { + const sb_symbol* stemmed = sb_stemmer_stem( + stemmer, + reinterpret_cast(token.data()), + token.size() + ); + if (Y_UNLIKELY(stemmed == nullptr)) { + ythrow yexception() << "unable to allocate memory for sb_stemmer_stem result"; + } + + const size_t resultLength = sb_stemmer_length(stemmer); + token = std::string(reinterpret_cast(stemmed), resultLength); + } + } + if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) { TVector ngrams; for (const auto& token : tokens) { @@ -367,6 +416,8 @@ bool FillSetting(Ydb::Table::FulltextIndexSettings& settings, const TString& nam analyzers->set_filter_length_min(ParseInt32(name, value, error)); } else if (nameLower == "filter_length_max") { analyzers->set_filter_length_max(ParseInt32(name, value, error)); + } else if (nameLower == "use_filter_snowball") { + analyzers->set_use_filter_snowball(ParseBool(name, value, error)); } else { error = TStringBuilder() << "Unknown index setting: " << name; return false; diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp index a9e9ab058854..4e879d8e0803 100644 --- a/ydb/core/base/ut/fulltext_ut.cpp +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -80,6 +80,20 @@ Y_UNIT_TEST_SUITE(NFulltext) { UNIT_ASSERT_C(!ValidateSettings(settings, error), error); UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_max: 3000 should be between 1 and 1000"); + columnAnalyzers->set_use_filter_snowball(true); + columnAnalyzers->clear_language(); + UNIT_ASSERT_C(!ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, "language required when use_filter_snowball is set"); + + columnAnalyzers->set_language("klingon"); + UNIT_ASSERT_C(!ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, "language is not supported by snowball"); + + columnAnalyzers->set_language("english"); + columnAnalyzers->set_use_filter_ngram(true); + UNIT_ASSERT_C(!ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time"); + columnSettings = settings.add_columns(); columnSettings->set_column("text2"); UNIT_ASSERT_C(!ValidateSettings(settings, error), error); @@ -270,6 +284,28 @@ Y_UNIT_TEST_SUITE(NFulltext) { analyzers.set_filter_ngram_max_length(3); UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"эт", "это", "те", "тек"})); } + + Y_UNIT_TEST(AnalyzeFilterSnowball) { + Ydb::Table::FulltextIndexSettings::Analyzers analyzers; + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + const TString russianText = "машины ездят по дорогам исправно"; + + UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машины", "ездят", "по", "дорогам", "исправно"})); + + analyzers.set_use_filter_snowball(true); + analyzers.set_language("russian"); + UNIT_ASSERT_VALUES_EQUAL(Analyze(russianText, analyzers), (TVector{"машин", "езд", "по", "дорог", "исправн"})); + + const TString englishText = "cars are driving properly on the roads"; + analyzers.set_language("english"); + UNIT_ASSERT_VALUES_EQUAL(Analyze(englishText, analyzers), (TVector{"car", "are", "drive", "proper", "on", "the", "road"})); + + analyzers.set_language("klingon"); + UNIT_ASSERT_EXCEPTION(Analyze(englishText, analyzers), yexception); + + analyzers.clear_language(); + UNIT_ASSERT_EXCEPTION(Analyze(englishText, analyzers), yexception); + } } } diff --git a/ydb/core/base/ya.make b/ydb/core/base/ya.make index 785df55aadcb..2f763f91b31f 100644 --- a/ydb/core/base/ya.make +++ b/ydb/core/base/ya.make @@ -88,6 +88,7 @@ SRCS( ) PEERDIR( + contrib/libs/snowball ydb/library/actors/core ydb/library/actors/helpers ydb/library/actors/interconnect diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp index 80d08eb29b3e..510072c017f2 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp @@ -91,6 +91,23 @@ void AddIndexCovered(NQuery::TQueryClient& db) { UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); } +void AddIndexSnowball(NQuery::TQueryClient& db, const TString& language) { + TString query = Sprintf(R"sql( + ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx + GLOBAL USING fulltext + ON (Text) + WITH ( + layout=flat, + tokenizer=standard, + use_filter_lowercase=true, + use_filter_snowball=true, + language=%s + ) + )sql", language.c_str()); + auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); +} + TResultSet ReadIndex(NQuery::TQueryClient& db) { TString query = R"sql( SELECT * FROM `/Root/Texts/fulltext_idx/indexImplTable`; @@ -224,6 +241,41 @@ Y_UNIT_TEST(AddIndexEdgeNGram) { ])", NYdb::FormatResultSetYson(index)); } +Y_UNIT_TEST(AddIndexSnowball) { + auto kikimr = Kikimr(); + auto db = kikimr.GetQueryClient(); + + CreateTexts(db); + UpsertTexts(db); + AddIndexSnowball(db, "english"); + const auto index = ReadIndex(db); + CompareYson(R"([ + [[100u];"anim"]; + [[100u];"cat"]; + [[200u];"cat"]; + [[300u];"cat"]; + [[100u];"chase"]; + [[200u];"chase"]; + [[200u];"dog"]; + [[400u];"dog"]; + [[400u];"fox"]; + [[300u];"love"]; + [[400u];"love"]; + [[100u];"small"]; + [[200u];"small"] + ])", NYdb::FormatResultSetYson(index)); +} + +Y_UNIT_TEST(AddIndexSnowballWithWrongLanguage) { + auto kikimr = Kikimr(); + auto db = kikimr.GetQueryClient(); + + CreateTexts(db); + UpsertTexts(db); + + UNIT_ASSERT_TEST_FAILS(AddIndexSnowball(db, "klingon")); +} + Y_UNIT_TEST(InsertRow) { auto kikimr = Kikimr(); auto db = kikimr.GetQueryClient(); diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index e22ce6145054..8da825026d5e 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -174,7 +174,7 @@ message FulltextIndexSettings { // See Tokenizer enum optional Tokenizer tokenizer = 1; - // Language used for language-sensitive operations like stopword filtering + // Language used for language-sensitive operations like stopword filtering and stemming // Example: language = "english" // By default is not specified and no language-specific logic is applied optional string language = 2; @@ -229,6 +229,13 @@ message FulltextIndexSettings { // Maximum token length to keep (inclusive) // Must be used with use_filter_length optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; + + // Whether to apply snowball stemming to each token + // Must be used with language option + // Example: language = "english" + // Tokens: ["cars", "beautifully", "conspired"] + // Output: ["car", "beauti", "conspir"] + optional bool use_filter_snowball = 140; } // Represents text analyzers settings for a specific column