Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 53 additions & 2 deletions ydb/core/base/fulltext.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include "fulltext.h"

#include <contrib/libs/snowball/include/libstemmer.h>

#include <util/charset/utf8.h>
#include <util/generic/xrange.h>

Expand Down Expand Up @@ -172,8 +175,32 @@ namespace {
return false;
}

if (settings.has_language()) {
error = "Unsupported language setting";
// Snowball stemming constraints: the filter excludes both n-gram filters and
// needs a language that libstemmer knows; a language without the filter is rejected.
if (settings.use_filter_snowball()) {
    const bool hasNgramFilter = settings.use_filter_ngram() || settings.use_filter_edge_ngram();
    if (hasNgramFilter) {
        error = "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time";
        return false;
    }

    if (!settings.has_language()) {
        error = "language required when use_filter_snowball is set";
        return false;
    }

    // sb_stemmer_list() is walked as a nullptr-terminated array of algorithm names;
    // stop at the first name equal to the requested language.
    auto algorithm = sb_stemmer_list();
    while (*algorithm != nullptr && settings.language() != *algorithm) {
        ++algorithm;
    }
    if (*algorithm == nullptr) {
        error = "language is not supported by snowball";
        return false;
    }
} else if (settings.has_language()) {
    // Language currently only drives stemming (use_filter_snowball); other
    // language-sensitive features (e.g. stopword filtering) may use it later.
    error = "language setting is only supported with use_filter_snowball at present; other uses may be supported in the future";
    return false;
}

Expand Down Expand Up @@ -268,6 +295,28 @@ TVector<TString> Analyze(const TString& text, const Ydb::Table::FulltextIndexSet
}), tokens.end());
}

// Snowball stemming filter: replaces every token with its stem for settings.language().
// Throws yexception when the stemmer cannot be created for that language or when
// sb_stemmer_stem fails to produce a result.
if (settings.use_filter_snowball()) {
    const auto& language = settings.language();

    // nullptr is passed as the encoding argument, leaving the choice to libstemmer.
    struct sb_stemmer* stemmer = sb_stemmer_new(language.c_str(), nullptr);
    if (Y_UNLIKELY(stemmer == nullptr)) {
        // Name the offending language so the failure can be diagnosed from the message alone.
        ythrow yexception() << "sb_stemmer_new returned nullptr for language '" << language << "'";
    }
    // Release the stemmer on every exit path, including the throw below.
    Y_DEFER { sb_stemmer_delete(stemmer); };

    for (auto& token : tokens) {
        const sb_symbol* stemmed = sb_stemmer_stem(
            stemmer,
            reinterpret_cast<const sb_symbol*>(token.data()),
            token.size()
        );
        if (Y_UNLIKELY(stemmed == nullptr)) {
            ythrow yexception() << "unable to allocate memory for sb_stemmer_stem result";
        }

        // Copy the stem straight into the token (no intermediate std::string);
        // its length is reported separately by sb_stemmer_length.
        token.assign(reinterpret_cast<const char*>(stemmed), sb_stemmer_length(stemmer));
    }
}

if (settings.use_filter_ngram() || settings.use_filter_edge_ngram()) {
TVector<TString> ngrams;
for (const auto& token : tokens) {
Expand Down Expand Up @@ -367,6 +416,8 @@ bool FillSetting(Ydb::Table::FulltextIndexSettings& settings, const TString& nam
analyzers->set_filter_length_min(ParseInt32(name, value, error));
} else if (nameLower == "filter_length_max") {
analyzers->set_filter_length_max(ParseInt32(name, value, error));
} else if (nameLower == "use_filter_snowball") {
analyzers->set_use_filter_snowball(ParseBool(name, value, error));
} else {
error = TStringBuilder() << "Unknown index setting: " << name;
return false;
Expand Down
36 changes: 36 additions & 0 deletions ydb/core/base/ut/fulltext_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,20 @@ Y_UNIT_TEST_SUITE(NFulltext) {
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
UNIT_ASSERT_VALUES_EQUAL(error, "Invalid filter_length_max: 3000 should be between 1 and 1000");

columnAnalyzers->set_use_filter_snowball(true);
columnAnalyzers->clear_language();
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
UNIT_ASSERT_VALUES_EQUAL(error, "language required when use_filter_snowball is set");

columnAnalyzers->set_language("klingon");
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
UNIT_ASSERT_VALUES_EQUAL(error, "language is not supported by snowball");

columnAnalyzers->set_language("english");
columnAnalyzers->set_use_filter_ngram(true);
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
UNIT_ASSERT_VALUES_EQUAL(error, "cannot set use_filter_snowball with use_filter_ngram or use_filter_edge_ngram at the same time");

columnSettings = settings.add_columns();
columnSettings->set_column("text2");
UNIT_ASSERT_C(!ValidateSettings(settings, error), error);
Expand Down Expand Up @@ -270,6 +284,28 @@ Y_UNIT_TEST_SUITE(NFulltext) {
analyzers.set_filter_ngram_max_length(3);
UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector<TString>{"эт", "это", "те", "тек"}));
}

// Verifies Analyze() around the snowball filter: tokens are returned verbatim while the
// filter is off, stemmed per language once it is on, and an unknown or missing language
// makes Analyze() throw yexception.
Y_UNIT_TEST(AnalyzeFilterSnowball) {
    Ydb::Table::FulltextIndexSettings::Analyzers analyzers;
    analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE);

    const TString ruSentence = "машины ездят по дорогам исправно";
    const TString enSentence = "cars are driving properly on the roads";

    // Filter disabled: whitespace tokens come back untouched.
    const TVector<TString> ruRaw{"машины", "ездят", "по", "дорогам", "исправно"};
    UNIT_ASSERT_VALUES_EQUAL(Analyze(ruSentence, analyzers), ruRaw);

    analyzers.set_use_filter_snowball(true);

    // Russian stemming.
    analyzers.set_language("russian");
    const TVector<TString> ruStems{"машин", "езд", "по", "дорог", "исправн"};
    UNIT_ASSERT_VALUES_EQUAL(Analyze(ruSentence, analyzers), ruStems);

    // English stemming.
    analyzers.set_language("english");
    const TVector<TString> enStems{"car", "are", "drive", "proper", "on", "the", "road"};
    UNIT_ASSERT_VALUES_EQUAL(Analyze(enSentence, analyzers), enStems);

    // A language snowball does not know is an error...
    analyzers.set_language("klingon");
    UNIT_ASSERT_EXCEPTION(Analyze(enSentence, analyzers), yexception);

    // ...and so is having no language at all while the filter is enabled.
    analyzers.clear_language();
    UNIT_ASSERT_EXCEPTION(Analyze(enSentence, analyzers), yexception);
}
}

}
1 change: 1 addition & 0 deletions ydb/core/base/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ SRCS(
)

PEERDIR(
contrib/libs/snowball
ydb/library/actors/core
ydb/library/actors/helpers
ydb/library/actors/interconnect
Expand Down
52 changes: 52 additions & 0 deletions ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,23 @@ void AddIndexCovered(NQuery::TQueryClient& db) {
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
}

// Adds a global fulltext index `fulltext_idx` on the Text column of `/Root/Texts`
// with the standard tokenizer, lowercase filter and snowball filter enabled,
// using `language` as the index language.
// Asserts that the ALTER TABLE finishes with EStatus::SUCCESS, so a rejected
// language shows up as a failed assertion in the calling test.
void AddIndexSnowball(NQuery::TQueryClient& db, const TString& language) {
// `language` is substituted verbatim for %s in the WITH clause (no quoting or escaping).
TString query = Sprintf(R"sql(
ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx
GLOBAL USING fulltext
ON (Text)
WITH (
layout=flat,
tokenizer=standard,
use_filter_lowercase=true,
use_filter_snowball=true,
language=%s
)
)sql", language.c_str());
// Schema change is executed outside of a transaction.
auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
}

TResultSet ReadIndex(NQuery::TQueryClient& db) {
TString query = R"sql(
SELECT * FROM `/Root/Texts/fulltext_idx/indexImplTable`;
Expand Down Expand Up @@ -224,6 +241,41 @@ Y_UNIT_TEST(AddIndexEdgeNGram) {
])", NYdb::FormatResultSetYson(index));
}

// Creates and fills the Texts table, adds a snowball fulltext index with
// language "english", then compares the rows read back through ReadIndex()
// with the expected (key, token) pairs.
Y_UNIT_TEST(AddIndexSnowball) {
auto kikimr = Kikimr();
auto db = kikimr.GetQueryClient();

CreateTexts(db);
UpsertTexts(db);
AddIndexSnowball(db, "english");
const auto index = ReadIndex(db);
// Expected tokens are presumably the lowercased stems of the fixture texts
// (e.g. "anim", "chase", "love") — the fixture itself is defined outside this view.
CompareYson(R"([
[[100u];"anim"];
[[100u];"cat"];
[[200u];"cat"];
[[300u];"cat"];
[[100u];"chase"];
[[200u];"chase"];
[[200u];"dog"];
[[400u];"dog"];
[[400u];"fox"];
[[300u];"love"];
[[400u];"love"];
[[100u];"small"];
[[200u];"small"]
])", NYdb::FormatResultSetYson(index));
}

// Adding a snowball fulltext index with the language "klingon" must not succeed.
Y_UNIT_TEST(AddIndexSnowballWithWrongLanguage) {
    auto cluster = Kikimr();
    auto queryClient = cluster.GetQueryClient();

    CreateTexts(queryClient);
    UpsertTexts(queryClient);

    // AddIndexSnowball asserts EStatus::SUCCESS itself, so a failed ALTER TABLE
    // surfaces here as a failed assertion inside the helper.
    UNIT_ASSERT_TEST_FAILS(AddIndexSnowball(queryClient, "klingon"));
}

Y_UNIT_TEST(InsertRow) {
auto kikimr = Kikimr();
auto db = kikimr.GetQueryClient();
Expand Down
9 changes: 8 additions & 1 deletion ydb/public/api/protos/ydb_table.proto
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ message FulltextIndexSettings {
// See Tokenizer enum
optional Tokenizer tokenizer = 1;

// Language used for language-sensitive operations like stopword filtering
// Language used for language-sensitive operations like stopword filtering and stemming
// Example: language = "english"
// By default is not specified and no language-specific logic is applied
optional string language = 2;
Expand Down Expand Up @@ -229,6 +229,13 @@ message FulltextIndexSettings {
// Maximum token length to keep (inclusive)
// Must be used with use_filter_length
optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];

// Whether to apply snowball stemming to each token
// Must be used with language option; the language must be one supported by snowball
// Cannot be combined with use_filter_ngram or use_filter_edge_ngram
// Example: language = "english"
// Tokens: ["cars", "beautifully", "conspired"]
// Output: ["car", "beauti", "conspir"]
optional bool use_filter_snowball = 140;
}

// Represents text analyzers settings for a specific column
Expand Down
Loading