From 41f1ad52e27aaabcc390bf76d349daa04b25baac Mon Sep 17 00:00:00 2001 From: kungasc Date: Wed, 10 Sep 2025 13:42:54 +0300 Subject: [PATCH 1/2] validate settings --- ydb/core/base/fulltext.cpp | 176 ++++++++++++++++++ ydb/core/base/fulltext.h | 12 ++ ydb/core/base/ya.make | 2 + ...eshard__operation_create_indexed_table.cpp | 4 + .../schemeshard_build_index__create.cpp | 6 + .../tx/schemeshard/schemeshard_info_types.h | 1 + .../ut_index/ut_fulltext_index.cpp | 65 +++++-- ydb/public/api/protos/ydb_table.proto | 26 +-- 8 files changed, 265 insertions(+), 27 deletions(-) create mode 100644 ydb/core/base/fulltext.cpp create mode 100644 ydb/core/base/fulltext.h diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp new file mode 100644 index 000000000000..032d979250cc --- /dev/null +++ b/ydb/core/base/fulltext.cpp @@ -0,0 +1,176 @@ +#include "fulltext.h" + +namespace NKikimr::NFulltext { + +namespace { + + Ydb::Table::FulltextIndexSettings::Layout ParseLayout(const TString& layout, TString& error) { + if (layout == "flat") + return Ydb::Table::FulltextIndexSettings::FLAT; + else { + error = TStringBuilder() << "Invalid layout: " << layout; + return Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED; + } + }; + + Ydb::Table::FulltextIndexSettings::Tokenizer ParseTokenizer(const TString& tokenizer, TString& error) { + if (tokenizer == "whitespace") + return Ydb::Table::FulltextIndexSettings::WHITESPACE; + else if (tokenizer == "standard") + return Ydb::Table::FulltextIndexSettings::STANDARD; + else if (tokenizer == "keyword") + return Ydb::Table::FulltextIndexSettings::KEYWORD; + else { + error = TStringBuilder() << "Invalid tokenizer: " << tokenizer; + return Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED; + } + }; + + i32 ParseInt32(const TString& name, const TString& value, TString& error) { + i32 result = 0; + if (!TryFromString(value, result) || result < 0) { // proto int32 fields with [(Ydb.value) = ">= 0"] annotation + error = TStringBuilder() << "Invalid " << name << ": " << value; + } + return result; + } + + bool ParseBool(const TString& name, const TString& value, TString& error) { + bool result = false; + if (!TryFromString(value, result)) { + error = TStringBuilder() << "Invalid " << name << ": " << value; + } + return result; + } + + bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) { + if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) { + error = "tokenizer should be set"; + return false; + } + + if (settings.has_language()) { + error = "Unsupported language setting"; + return false; + } + + if (settings.use_filter_stopwords()) { + error = "Unsupported use_filter_stopwords setting"; + return false; + } + + if (settings.use_filter_ngram()) { + error = "Unsupported use_filter_ngram setting"; + return false; + } + if (settings.use_filter_edge_ngram()) { + error = "Unsupported use_filter_edge_ngram setting"; + return false; + } + if (settings.has_filter_ngram_min_length()) { + error = "Unsupported filter_ngram_min_length setting"; + return false; + } + if (settings.has_filter_ngram_max_length()) { + error = "Unsupported filter_ngram_max_length setting"; + return false; + } + + if (settings.use_filter_length()) { + error = "Unsupported use_filter_length setting"; + return false; + } + if (settings.has_filter_length_min()) { + error = "Unsupported filter_length_min setting"; + return false; + } + if (settings.has_filter_length_max()) { + error = "Unsupported filter_length_max setting"; + return false; + } + + return true; + } +} + +bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error) { + if (!settings.has_layout() || settings.layout() == Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED) { + error = "layout should be set"; + return false; + } + + if (settings.columns().size() != 1) { + error = TStringBuilder() << "fulltext index should have single column settings" + << " but have " << settings.columns().size() << " of them"; + return false; + } + + for (auto column : settings.columns()) { + if (!column.has_column()) { + error = "column should be set"; + return false; + } + if (!column.has_analyzers()) { + error = "column analyzers should be set"; + return false; + } + if (!ValidateSettings(column.analyzers(), error)) { + return false; + } + } + + return true; +} + +Ydb::Table::FulltextIndexSettings FillSettings(const TString& column, const TVector>& settings, TString& error) { + Ydb::Table::FulltextIndexSettings result; + Ydb::Table::FulltextIndexSettings::Analyzers resultAnalyzers; + + for (const auto& [name, value] : settings) { + if (name == "layout") { + result.set_layout(ParseLayout(value, error)); + } else if (name == "tokenizer") { + resultAnalyzers.set_tokenizer(ParseTokenizer(value, error)); + } else if (name == "language") { + resultAnalyzers.set_language(value); + } else if (name == "use_filter_lowercase") { + resultAnalyzers.set_use_filter_lowercase(ParseBool(name, value, error)); + } else if (name == "use_filter_stopwords") { + resultAnalyzers.set_use_filter_stopwords(ParseBool(name, value, error)); + } else if (name == "use_filter_ngram") { + resultAnalyzers.set_use_filter_ngram(ParseBool(name, value, error)); + } else if (name == "use_filter_edge_ngram") { + resultAnalyzers.set_use_filter_edge_ngram(ParseBool(name, value, error)); + } else if (name == "filter_ngram_min_length") { + resultAnalyzers.set_filter_ngram_min_length(ParseInt32(name, value, error)); + } else if (name == "filter_ngram_max_length") { + resultAnalyzers.set_filter_ngram_max_length(ParseInt32(name, value, error)); + } else if (name == "use_filter_length") { + resultAnalyzers.set_use_filter_length(ParseBool(name, value, error)); + } else if (name == "filter_length_min") { + resultAnalyzers.set_filter_length_min(ParseInt32(name, value, error)); + } else if (name == "filter_length_max") { + resultAnalyzers.set_filter_length_max(ParseInt32(name, value, error)); + } else { + error = TStringBuilder() << "Unknown index setting: " << name; + return result; + } + + if (error) { + return result; + } + } + + { + // only single-columned index is supported for now + auto columnAnalyzers = result.add_columns(); + columnAnalyzers->set_column(column); + columnAnalyzers->CopyFrom(resultAnalyzers); + } + + ValidateSettings(result, error); + + return result; +} + + +} diff --git a/ydb/core/base/fulltext.h b/ydb/core/base/fulltext.h new file mode 100644 index 000000000000..fab441e92139 --- /dev/null +++ b/ydb/core/base/fulltext.h @@ -0,0 +1,12 @@ +#pragma once + +#include "defs.h" + +#include + +namespace NKikimr::NFulltext { + +bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error); +Ydb::Table::FulltextIndexSettings FillSettings(const TString& column, const TVector>& values, TString& error); + +} diff --git a/ydb/core/base/ya.make b/ydb/core/base/ya.make index 78234ead0f89..285913b9bf5c 100644 --- a/ydb/core/base/ya.make +++ b/ydb/core/base/ya.make @@ -28,6 +28,8 @@ SRCS( feature_flags.h feature_flags_service.cpp feature_flags_service.h + fulltext.cpp + fulltext.h group_stat.cpp group_stat.h hive.h diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp index ff1f1bf8de36..fc648c9dda86 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp @@ -152,6 +152,10 @@ TVector CreateIndexedTable(TOperationId nextId, const TTxTr if (!context.SS->EnableFulltextIndex) { return {CreateReject(nextId, NKikimrScheme::EStatus::StatusPreconditionFailed, "Fulltext index support is disabled")}; } + TString msg; + if (!NKikimr::NFulltext::ValidateSettings(indexDescription.GetFulltextIndexDescription().GetSettings(), msg)) { + return {CreateReject(nextId, NKikimrScheme::EStatus::StatusInvalidParameter, msg)}; + } break; } default: diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp index 207fd7001faa..0b83aae30125 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp @@ -280,6 +280,12 @@ class TSchemeShard::TIndexBuilder::TTxCreate: public TSchemeShard::TIndexBuilder } buildInfo.BuildKind = TIndexBuildInfo::EBuildKind::BuildFulltext; buildInfo.IndexType = NKikimrSchemeOp::EIndexType::EIndexTypeGlobalFulltext; + NKikimrSchemeOp::TFulltextIndexDescription fulltextIndexDescription; + *fulltextIndexDescription.MutableSettings() = index.global_fulltext_index().fulltext_settings(); + if (!NKikimr::NFulltext::ValidateSettings(fulltextIndexDescription.GetSettings(), explain)) { + return false; + } + buildInfo.SpecializedIndexDescription = fulltextIndexDescription; break; } }; diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index d1b372266776..c4b952bfadd1 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp index 26de835d2c67..4b5b27405453 100644 --- a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp +++ b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp @@ -24,8 +24,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { column: "text" analyzers: { tokenizer: STANDARD - use_filter_ngram: true - filter_ngram_max_length: 42 + use_filter_lowercase: true } } )"; @@ -71,8 +70,8 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext/indexImplTable"),{ NLs::PathExist, NLs::CheckColumns("indexImplTable", - { NFulltext::TokenColumn, "id", "covered" }, {}, - { NFulltext::TokenColumn, "id" }, true) }); + { NTableIndex::NFulltext::TokenColumn, "id", "covered" }, {}, + { NTableIndex::NFulltext::TokenColumn, "id" }, true) }); Cerr << "Reboot SchemeShard.." << Endl; TActorId sender = runtime.AllocateEdgeActor(); @@ -91,8 +90,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { column: "text" analyzers: { tokenizer: STANDARD - use_filter_ngram: true - filter_ngram_max_length: 42 + use_filter_lowercase: true } } )"; @@ -135,16 +133,14 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { column: "text1" analyzers: { tokenizer: STANDARD - use_filter_ngram: true - filter_ngram_max_length: 42 + use_filter_lowercase: true } } columns: { column: "text2" analyzers: { tokenizer: STANDARD - use_filter_ngram: true - filter_ngram_max_length: 42 + use_filter_lowercase: true } } )"; @@ -188,8 +184,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { column: "text" analyzers: { tokenizer: STANDARD - use_filter_ngram: true - filter_ngram_max_length: 42 + use_filter_lowercase: true } } )"; @@ -232,8 +227,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { column: "text_wrong" analyzers: { tokenizer: STANDARD - use_filter_ngram: true - filter_ngram_max_length: 42 + use_filter_lowercase: true } } )"; @@ -300,4 +294,47 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { NLs::PathNotExist, }); } + + Y_UNIT_TEST(CreateTableUnsupportedSettings) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + analyzers: { + tokenizer: STANDARD + use_filter_edge_ngram: true + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } } diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index c30285aa6155..8aaa1c40515d 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -172,76 +172,76 @@ message FulltextIndexSettings { // Represents text analyzers settings message Analyzers { // See Tokenizer enum - Tokenizer tokenizer = 1; + optional Tokenizer tokenizer = 1; // Language used for language-sensitive operations like stopword filtering // Example: language = "english" // By default is not specified and no language-specific logic is applied - string language = 2; + optional string language = 2; // Whether to convert tokens to lowercase // Example: // Token: "Quick" // Output: "quick" - bool use_filter_lowercase = 100; + optional bool use_filter_lowercase = 100; // Whether to remove common stopwords like "the", "a", "is" // Example: language = "english" // Tokens: ["the", "quick", "brown"] // Output: ["quick", "brown"] - bool use_filter_stopwords = 110; + optional bool use_filter_stopwords = 110; // Whether to apply character n-gram indexing to each token // Must be used with filter_ngram_min_length and filter_ngram_max_length // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 // Token: "search" // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"] - bool use_filter_ngram = 120; + optional bool use_filter_ngram = 120; // Whether to apply edge n-gram indexing (prefix-based) to each token // Used with filter_ngram_min_length and filter_ngram_max_length // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 // Token: "search" // Output: ["sea", "sear"] - bool use_filter_edge_ngram = 121; + optional bool use_filter_edge_ngram = 121; // Minimum length of n-grams to generate (inclusive) // Must be used with use_filter_ngram or use_filter_edge_ngram // Default value is 3 - int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"]; + optional int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"]; // Maximum length of n-grams to generate (inclusive) // Must be used with use_filter_ngram or use_filter_edge_ngram // Default value is 4 - int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"]; + optional int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"]; // Whether to filter tokens by their length // Must be used with filter_length_min or filter_length_max // Example: filter_length_min = 4, filter_length_max = 6 // Tokens: ["foo", "fooba", "foobar", "foobarbaz"] // Output: ["fooba", "foobar"] - bool use_filter_length = 130; + optional bool use_filter_length = 130; // Minimum token length to keep (inclusive) // Must be used with use_filter_length - int32 filter_length_min = 131 [(Ydb.value) = ">= 0"]; + optional int32 filter_length_min = 131 [(Ydb.value) = ">= 0"]; // Maximum token length to keep (inclusive) // Must be used with use_filter_length - int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; + optional int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; } // Represents text analyzers settings for a specific column message ColumnAnalyzers { // Name of the column to be indexed - string column = 1; + optional string column = 1; // Analyzer settings specific to this column Analyzers analyzers = 2; } // See Layout enum - Layout layout = 1; + optional Layout layout = 1; // List of columns and their fulltext settings // Currently, this list should contain a single entry From 6e93e3e9034589f22d0a225b5e0f96365eaa834b Mon Sep 17 00:00:00 2001 From: kungasc Date: Wed, 10 Sep 2025 14:56:10 +0300 Subject: [PATCH 2/2] add tests --- ydb/core/base/fulltext.cpp | 48 +++++++++++++- ydb/core/base/fulltext.h | 2 + ydb/core/base/kmeans_clusters.cpp | 2 + ydb/core/base/ut/fulltext_ut.cpp | 105 ++++++++++++++++++++++++++++++ ydb/core/base/ut/ya.make | 5 +- 5 files changed, 159 insertions(+), 3 deletions(-) create mode 100644 ydb/core/base/ut/fulltext_ut.cpp diff --git a/ydb/core/base/fulltext.cpp b/ydb/core/base/fulltext.cpp index 032d979250cc..a23e83f2955d 100644 --- a/ydb/core/base/fulltext.cpp +++ b/ydb/core/base/fulltext.cpp @@ -1,4 +1,5 @@ #include "fulltext.h" +#include namespace NKikimr::NFulltext { @@ -42,6 +43,38 @@ namespace { return result; } + // Note: written by llm, can be optimized a lot later + TVector Tokenize(const TString& text, const Ydb::Table::FulltextIndexSettings::Tokenizer& tokenizer) { + TVector tokens; + switch (tokenizer) { + case Ydb::Table::FulltextIndexSettings::WHITESPACE: { + std::istringstream stream(text); + TString token; + while (stream >> token) { + tokens.push_back(token); + } + break; + } + case Ydb::Table::FulltextIndexSettings::STANDARD: { + std::regex word_regex(R"(\b\w+\b)"); // match alphanumeric words + std::sregex_iterator it(text.begin(), text.end(), word_regex); + std::sregex_iterator end; + while (it != end) { + tokens.push_back(it->str()); + ++it; + } + break; + } + case Ydb::Table::FulltextIndexSettings::KEYWORD: + tokens.push_back(text); + break; + default: + Y_ENSURE(TStringBuilder() << "Invalid tokenizer: " << static_cast(tokenizer)); + } + + return tokens; + } + bool ValidateSettings(const Ydb::Table::FulltextIndexSettings::Analyzers& settings, TString& error) { if (!settings.has_tokenizer() || settings.tokenizer() == Ydb::Table::FulltextIndexSettings::TOKENIZER_UNSPECIFIED) { error = "tokenizer should be set"; @@ -92,6 +125,18 @@ namespace { } } +TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings) { + TVector tokens = Tokenize(text, settings.tokenizer()); + + if (settings.use_filter_lowercase()) { + for (auto& token : tokens) { + token.to_lower(); + } + } + + return tokens; +} + bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error) { if (!settings.has_layout() || settings.layout() == Ydb::Table::FulltextIndexSettings::LAYOUT_UNSPECIFIED) { error = "layout should be set"; @@ -118,6 +163,7 @@ bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString } } + error = ""; return true; } @@ -164,7 +210,7 @@ Ydb::Table::FulltextIndexSettings FillSettings(const TString& column, const TVec // only single-columned index is supported for now auto columnAnalyzers = result.add_columns(); columnAnalyzers->set_column(column); - columnAnalyzers->CopyFrom(resultAnalyzers); + columnAnalyzers->mutable_analyzers()->CopyFrom(resultAnalyzers); } ValidateSettings(result, error); diff --git a/ydb/core/base/fulltext.h b/ydb/core/base/fulltext.h index fab441e92139..b65e3043622a 100644 --- a/ydb/core/base/fulltext.h +++ b/ydb/core/base/fulltext.h @@ -6,6 +6,8 @@ namespace NKikimr::NFulltext { +TVector Analyze(const TString& text, const Ydb::Table::FulltextIndexSettings::Analyzers& settings); + bool ValidateSettings(const Ydb::Table::FulltextIndexSettings& settings, TString& error); Ydb::Table::FulltextIndexSettings FillSettings(const TString& column, const TVector>& values, TString& error); diff --git a/ydb/core/base/kmeans_clusters.cpp b/ydb/core/base/kmeans_clusters.cpp index ebb58d94261e..97ccfb07f2fd 100644 --- a/ydb/core/base/kmeans_clusters.cpp +++ b/ydb/core/base/kmeans_clusters.cpp @@ -483,6 +483,7 @@ bool ValidateSettings(const Ydb::Table::KMeansTreeSettings& settings, TString& e return false; } + error = ""; return true; } @@ -516,6 +517,7 @@ bool ValidateSettings(const Ydb::Table::VectorIndexSettings& settings, TString& return false; } + error = ""; return true; } diff --git a/ydb/core/base/ut/fulltext_ut.cpp b/ydb/core/base/ut/fulltext_ut.cpp new file mode 100644 index 000000000000..0824748fe0de --- /dev/null +++ b/ydb/core/base/ut/fulltext_ut.cpp @@ -0,0 +1,105 @@ +#include "fulltext.h" + +#include + +namespace NKikimr::NFulltext { + +Y_UNIT_TEST_SUITE(NFulltext) { + + Y_UNIT_TEST(ValidateSettings) { + Ydb::Table::FulltextIndexSettings settings; + TString error; + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "layout should be set"); + settings.set_layout(Ydb::Table::FulltextIndexSettings::FLAT); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "fulltext index should have single column settings but have 0 of them"); + auto columnSettings = settings.add_columns(); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "column should be set"); + columnSettings->set_column("text"); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "column analyzers should be set"); + auto columnAnalyzers = columnSettings->mutable_analyzers(); + + UNIT_ASSERT(!ValidateSettings(settings, error)); + UNIT_ASSERT_VALUES_EQUAL(error, "tokenizer should be set"); + columnAnalyzers->set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); + + UNIT_ASSERT_C(ValidateSettings(settings, error), error); + UNIT_ASSERT_VALUES_EQUAL(error, ""); + } + + Y_UNIT_TEST(FillSettings) { + TVector> list{ + {"layout", "flat"}, + {"tokenizer", "standard"}, + {"use_filter_lowercase", "true"} + }; + + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, ""); + + UNIT_ASSERT_EQUAL(settings.layout(), Ydb::Table::FulltextIndexSettings::FLAT); + UNIT_ASSERT_VALUES_EQUAL(settings.columns().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(settings.columns().at(0).column(), "text"); + UNIT_ASSERT_EQUAL(settings.columns().at(0).analyzers().tokenizer(), Ydb::Table::FulltextIndexSettings::STANDARD); + UNIT_ASSERT_VALUES_EQUAL(settings.columns().at(0).analyzers().use_filter_lowercase(), true); + } + + Y_UNIT_TEST(FillSettingsInvalid) { + { + TVector> list{ + {"asdf", "qwer"} + }; + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, "Unknown index setting: asdf"); + } + + { + TVector> list{ + {"layout", "flat"}, + {"tokenizer", "standard"}, + {"use_filter_lowercase", "asdf"} + }; + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, "Invalid use_filter_lowercase: asdf"); + } + + { + TVector> list{ + {"layout", "flat"}, + }; + TString error; + auto settings = FillSettings("text", list, error); + UNIT_ASSERT_VALUES_EQUAL(error, "tokenizer should be set"); + } + } + + Y_UNIT_TEST(Analyze) { + Ydb::Table::FulltextIndexSettings::Analyzers analyzers; + TString text = "apple WaLLet spaced-dog"; + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "WaLLet", "spaced-dog"})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::STANDARD); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "WaLLet", "spaced", "dog"})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::KEYWORD); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{text})); + + analyzers.set_tokenizer(Ydb::Table::FulltextIndexSettings::WHITESPACE); + analyzers.set_use_filter_lowercase(true); + UNIT_ASSERT_VALUES_EQUAL(Analyze(text, analyzers), (TVector{"apple", "wallet", "spaced-dog"})); + } +} + +} diff --git a/ydb/core/base/ut/ya.make b/ydb/core/base/ut/ya.make index 82b340fde09c..cc1fd67d984e 100644 --- a/ydb/core/base/ut/ya.make +++ b/ydb/core/base/ut/ya.make @@ -9,13 +9,14 @@ PEERDIR( ) SRCS( - path_ut.cpp blobstorage_grouptype_ut.cpp + fulltext_ut.cpp localdb_ut.cpp logoblob_ut.cpp memory_stats_ut.cpp - statestorage_ut.cpp + path_ut.cpp statestorage_guardian_impl_ut.cpp + statestorage_ut.cpp table_index_ut.cpp )