diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.h b/ydb/core/tx/schemeshard/schemeshard_utils.h
index d23dfbdb89a2..cf299e6564ae 100644
--- a/ydb/core/tx/schemeshard/schemeshard_utils.h
+++ b/ydb/core/tx/schemeshard/schemeshard_utils.h
@@ -194,6 +194,20 @@ bool CommonCheck(const TTableDesc& tableDesc, const NKikimrSchemeOp::TIndexCreat
                 error = TStringBuilder() << "fulltext index can only have a single key text column";
                 return false;
             }
+            // Exactly one per-column settings entry is accepted for now
+            if (indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().size() != 1) {
+                status = NKikimrScheme::EStatus::StatusInvalidParameter;
+                error = TStringBuilder() << "fulltext index should have single '" << indexKeys.KeyColumns.at(0) << "' column settings"
+                    << " but have " << indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().size() << " of them";
+                return false;
+            }
+            // ... and that entry must describe the index's single key column
+            if (indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().at(0).Getcolumn() != indexKeys.KeyColumns.at(0)) {
+                status = NKikimrScheme::EStatus::StatusInvalidParameter;
+                error = TStringBuilder() << "fulltext index should have '" << indexKeys.KeyColumns.at(0) << "' column settings"
+                    << " but have '" << indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().at(0).Getcolumn() << "' column settings";
+                return false;
+            }
 
             const TString& indexColumnName = indexKeys.KeyColumns.back();
             Y_ABORT_UNLESS(baseColumnTypes.contains(indexColumnName));
diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp
index a75939072c02..cd06a10a1909 100644
--- a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp
+++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp
@@ -1,5 +1,6 @@
 #include "ls_checks.h"
 
+#include
 #include
 #include
 #include
@@ -914,20 +915,38 @@ TCheckFunc KMeansTreeDescription(Ydb::Table::VectorIndexSettings_Metric metric,
 
 TCheckFunc SpecializedIndexDescription(const TString& proto) {
     return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) {
-        TString actual;
+        // Parse the expected text-format settings and compare messages instead of comparing debug strings
         switch (record.GetPathDescription().GetTableIndex().GetSpecializedIndexDescriptionCase()) {
-            case NKikimrSchemeOp::TIndexDescription::kVectorIndexKmeansTreeDescription:
-                actual = record.GetPathDescription().GetTableIndex().GetVectorIndexKmeansTreeDescription().GetSettings().ShortDebugString();
-                break;
-            case NKikimrSchemeOp::TIndexDescription::kFulltextIndexDescription:
-                actual = record.GetPathDescription().GetTableIndex().GetFulltextIndexDescription().GetSettings().ShortDebugString();
-                break;
-            case NKikimrSchemeOp::TIndexDescription::SPECIALIZEDINDEXDESCRIPTION_NOT_SET:
-                actual = "SPECIALIZEDINDEXDESCRIPTION_NOT_SET";
-                break;
+            case NKikimrSchemeOp::TIndexDescription::kVectorIndexKmeansTreeDescription: {
+                const auto& actual = record.GetPathDescription().GetTableIndex().GetVectorIndexKmeansTreeDescription().GetSettings();
+                Ydb::Table::KMeansTreeSettings expected;
+                UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected));
+                UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected),
+                    TStringBuilder() << "Expected "
+                        << expected.ShortDebugString()
+                        << " but got "
+                        << actual.ShortDebugString());
+                break;
+            }
+            case NKikimrSchemeOp::TIndexDescription::kFulltextIndexDescription: {
+                const auto& actual = record.GetPathDescription().GetTableIndex().GetFulltextIndexDescription().GetSettings();
+                Ydb::Table::FulltextIndexSettings expected;
+                UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected));
+                UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected),
+                    TStringBuilder() << "Expected "
+                        << expected.ShortDebugString()
+                        << " but got "
+                        << actual.ShortDebugString());
+                break;
+            }
+            case NKikimrSchemeOp::TIndexDescription::SPECIALIZEDINDEXDESCRIPTION_NOT_SET: {
+                UNIT_ASSERT_C(proto == "SPECIALIZEDINDEXDESCRIPTION_NOT_SET",
+                    TStringBuilder() << "Expected "
+                        << proto
+                        << " but got SPECIALIZEDINDEXDESCRIPTION_NOT_SET");
+                break;
+            }
         }
-
-        UNIT_ASSERT_VALUES_EQUAL(actual, proto);
     };
 }
 
diff --git a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp
index e9f772e8e3fc..26de835d2c67 100644
--- a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp
+++ b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp
@@ -18,23 +18,38 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) {
         TTestEnv env(runtime);
         ui64 txId = 100;
 
-        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"(
+        TString fulltextSettings = R"(
+            layout: FLAT
+            columns: {
+                column: "text"
+                analyzers: {
+                    tokenizer: STANDARD
+                    use_filter_ngram: true
+                    filter_ngram_max_length: 42
+                }
+            }
+        )";
+        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
             TableDescription {
-              Name: "texts"
-              Columns { Name: "id" Type: "Uint64" }
-              Columns { Name: "text" Type: "String" }
-              Columns { Name: "covered" Type: "String" }
-              Columns { Name: "another" Type: "Uint64" }
-              KeyColumnNames: ["id"]
+                Name: "texts"
+                Columns { Name: "id" Type: "Uint64" }
+                Columns { Name: "text" Type: "String" }
+                Columns { Name: "covered" Type: "String" }
+                Columns { Name: "another" Type: "Uint64" }
+                KeyColumnNames: ["id"]
             }
             IndexDescription {
-              Name: "idx_fulltext"
-              KeyColumnNames: ["text"]
-              DataColumnNames: ["covered"]
-              Type: EIndexTypeGlobalFulltext
-              FulltextIndexDescription: { Settings: { layout: FLAT, tokenizer: STANDARD, use_filter_ngram: true, filter_ngram_max_length: 42 } }
+                Name: "idx_fulltext"
+                KeyColumnNames: ["text"]
+                DataColumnNames: ["covered"]
+                Type: EIndexTypeGlobalFulltext
+                FulltextIndexDescription: {
+                    Settings: {
+                        %s
+                    }
+                }
             }
-        )");
+        )", fulltextSettings.c_str()));
         env.TestWaitNotification(runtime, txId);
 
         NKikimrSchemeOp::TDescribeOptions opts;
@@ -49,7 +64,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) {
             NLs::IndexState(NKikimrSchemeOp::EIndexStateReady),
             NLs::IndexKeys({"text"}),
             NLs::IndexDataColumns({"covered"}),
-            NLs::SpecializedIndexDescription("layout: FLAT tokenizer: STANDARD use_filter_ngram: true filter_ngram_max_length: 42"),
+            NLs::SpecializedIndexDescription(fulltextSettings),
             NLs::ChildrenCount(1),
         });
 
@@ -70,23 +85,91 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) {
         TTestEnv env(runtime);
         ui64 txId = 100;
 
-        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"(
+        TString fulltextSettings = R"(
+            layout: FLAT
+            columns: {
+                column: "text"
+                analyzers: {
+                    tokenizer: STANDARD
+                    use_filter_ngram: true
+                    filter_ngram_max_length: 42
+                }
+            }
+        )";
+        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+            TableDescription {
+                Name: "texts"
+                Columns { Name: "id" Type: "Uint64" }
+                Columns { Name: "text" Type: "String" }
+                Columns { Name: "covered" Type: "String" }
+                Columns { Name: "another" Type: "Uint64" }
+                KeyColumnNames: ["id"]
+            }
+            IndexDescription {
+                Name: "idx_fulltext"
+                KeyColumnNames: [ "another", "text"]
+                DataColumnNames: ["covered"]
+                Type: EIndexTypeGlobalFulltext
+                FulltextIndexDescription: {
+                    Settings: {
+                        %s
+                    }
+                }
+            }
+        )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+        env.TestWaitNotification(runtime, txId);
+
+        TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+            NLs::PathNotExist,
+        });
+    }
+
+    Y_UNIT_TEST(CreateTableMultipleColumns) { // not supported for now, maybe later
+        TTestBasicRuntime runtime;
+        TTestEnv env(runtime);
+        ui64 txId = 100;
+
+        TString fulltextSettings = R"(
+            layout: FLAT
+            columns: {
+                column: "text1"
+                analyzers: {
+                    tokenizer: STANDARD
+                    use_filter_ngram: true
+                    filter_ngram_max_length: 42
+                }
+            }
+            columns: {
+                column: "text2"
+                analyzers: {
+                    tokenizer: STANDARD
+                    use_filter_ngram: true
+                    filter_ngram_max_length: 42
+                }
+            }
+        )";
+        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
             TableDescription {
-              Name: "texts"
-              Columns { Name: "id" Type: "Uint64" }
-              Columns { Name: "text" Type: "String" }
-              Columns { Name: "covered" Type: "String" }
-              Columns { Name: "another" Type: "Uint64" }
-              KeyColumnNames: [ "id"]
+                Name: "texts"
+                Columns { Name: "id" Type: "Uint64" }
+                Columns { Name: "text1" Type: "String" }
+                Columns { Name: "text2" Type: "String" }
+                Columns { Name: "covered" Type: "String" }
+                Columns { Name: "another" Type: "Uint64" }
+                KeyColumnNames: ["id"]
             }
             IndexDescription {
-              Name: "idx_fulltext"
-              KeyColumnNames: [ "another", "text"]
-              DataColumnNames: ["covered"]
-              Type: EIndexTypeGlobalFulltext
-              FulltextIndexDescription: { Settings: { layout: FLAT, tokenizer: STANDARD, use_filter_ngram: true, filter_ngram_max_length: 42 } }
+                Name: "idx_fulltext"
+                KeyColumnNames: ["text1", "text2"]
+                DataColumnNames: ["covered"]
+                Type: EIndexTypeGlobalFulltext
+                FulltextIndexDescription: {
+                    Settings: {
+                        %s
+                    }
+                }
             }
-        )", {NKikimrScheme::StatusInvalidParameter});
+        )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
         env.TestWaitNotification(runtime, txId);
 
         TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
@@ -99,23 +182,118 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) {
         TTestEnv env(runtime);
         ui64 txId = 100;
 
-        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"(
+        TString fulltextSettings = R"(
+            layout: FLAT
+            columns: {
+                column: "text"
+                analyzers: {
+                    tokenizer: STANDARD
+                    use_filter_ngram: true
+                    filter_ngram_max_length: 42
+                }
+            }
+        )";
+        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+            TableDescription {
+                Name: "texts"
+                Columns { Name: "id" Type: "Uint64" }
+                Columns { Name: "text" Type: "Uint64" }
+                Columns { Name: "covered" Type: "String" }
+                Columns { Name: "another" Type: "Uint64" }
+                KeyColumnNames: ["id"]
+            }
+            IndexDescription {
+                Name: "idx_fulltext"
+                KeyColumnNames: ["text"]
+                DataColumnNames: ["covered"]
+                Type: EIndexTypeGlobalFulltext
+                FulltextIndexDescription: {
+                    Settings: {
+                        %s
+                    }
+                }
+            }
+        )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+        env.TestWaitNotification(runtime, txId);
+
+        TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+            NLs::PathNotExist,
+        });
+    }
+
+    Y_UNIT_TEST(CreateTableColumnsMismatch) {
+        TTestBasicRuntime runtime;
+        TTestEnv env(runtime);
+        ui64 txId = 100;
+
+        TString fulltextSettings = R"(
+            layout: FLAT
+            columns: {
+                column: "text_wrong"
+                analyzers: {
+                    tokenizer: STANDARD
+                    use_filter_ngram: true
+                    filter_ngram_max_length: 42
+                }
+            }
+        )";
+        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
+            TableDescription {
+                Name: "texts"
+                Columns { Name: "id" Type: "Uint64" }
+                Columns { Name: "text" Type: "String" }
+                Columns { Name: "covered" Type: "String" }
+                Columns { Name: "another" Type: "Uint64" }
+                KeyColumnNames: ["id"]
+            }
+            IndexDescription {
+                Name: "idx_fulltext"
+                KeyColumnNames: ["text"]
+                DataColumnNames: ["covered"]
+                Type: EIndexTypeGlobalFulltext
+                FulltextIndexDescription: {
+                    Settings: {
+                        %s
+                    }
+                }
+            }
+        )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
+        env.TestWaitNotification(runtime, txId);
+
+        TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
+            NLs::PathNotExist,
+        });
+    }
+
+    Y_UNIT_TEST(CreateTableNoColumnsSettings) {
+        TTestBasicRuntime runtime;
+        TTestEnv env(runtime);
+        ui64 txId = 100;
+
+        TString fulltextSettings = R"(
+            layout: FLAT
+        )";
+        TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"(
             TableDescription {
-              Name: "texts"
-              Columns { Name: "id" Type: "Uint64" }
-              Columns { Name: "text" Type: "Uint64" }
-              Columns { Name: "covered" Type: "String" }
-              Columns { Name: "another" Type: "Uint64" }
-              KeyColumnNames: ["id"]
+                Name: "texts"
+                Columns { Name: "id" Type: "Uint64" }
+                Columns { Name: "text" Type: "String" }
+                Columns { Name: "covered" Type: "String" }
+                Columns { Name: "another" Type: "Uint64" }
+                KeyColumnNames: ["id"]
             }
             IndexDescription {
-              Name: "idx_fulltext"
-              KeyColumnNames: ["text"]
-              DataColumnNames: ["covered"]
-              Type: EIndexTypeGlobalFulltext
-              FulltextIndexDescription: { Settings: { layout: FLAT, tokenizer: STANDARD, use_filter_ngram: true, filter_ngram_max_length: 42 } }
+                Name: "idx_fulltext"
+                KeyColumnNames: ["text"]
+                DataColumnNames: ["covered"]
+                Type: EIndexTypeGlobalFulltext
+                FulltextIndexDescription: {
+                    Settings: {
+                        %s
+                    }
+                }
             }
-        )", {NKikimrScheme::StatusInvalidParameter});
+        )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter});
         env.TestWaitNotification(runtime, txId);
 
         TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{
diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto
index ea0bb46f00d5..5237514c8b3e 100644
--- a/ydb/public/api/protos/ydb_table.proto
+++ b/ydb/public/api/protos/ydb_table.proto
@@ -139,6 +139,7 @@ message FulltextIndexSettings {
         // │ "The"        │ 1  │
         // │ "The"        │ 2  │
         // └──────────────┴────┘
+        // Supports a single column only
         FLAT = 1;
     }
 
@@ -168,67 +169,85 @@ message FulltextIndexSettings {
         KEYWORD = 3;
     }
 
+    // Represents text analyzers settings
+    message Analyzers {
+        // See Tokenizer enum
+        Tokenizer tokenizer = 1;
+
+        // Language used for language-sensitive operations like stopword filtering
+        // Example: language = "english"
+        // By default is not specified and no language-specific logic is applied
+        string language = 2;
+
+        // Whether to convert tokens to lowercase
+        // Example:
+        // Token: "Quick"
+        // Output: "quick"
+        bool use_filter_lowercase = 100;
+
+        // Whether to remove common stopwords like "the", "a", "is"
+        // Example: language = "english"
+        // Tokens: ["the", "quick", "brown"]
+        // Output: ["quick", "brown"]
+        bool use_filter_stopwords = 110;
+
+        // Whether to apply character n-gram indexing to each token
+        // Must be used with filter_ngram_min_length and filter_ngram_max_length
+        // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+        // Token: "search"
+        // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
+        bool use_filter_ngram = 120;
+
+        // Whether to apply edge n-gram indexing (prefix-based) to each token
+        // Used with filter_ngram_min_length and filter_ngram_max_length
+        // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
+        // Token: "search"
+        // Output: ["sea", "sear"]
+        bool use_filter_edge_ngram = 121;
+
+        // Minimum length of n-grams to generate (inclusive)
+        // Must be used with use_filter_ngram or use_filter_edge_ngram
+        // Default value is 3
+        int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"];
+
+        // Maximum length of n-grams to generate (inclusive)
+        // Must be used with use_filter_ngram or use_filter_edge_ngram
+        // Default value is 4
+        int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"];
+
+        // Whether to filter tokens by their length
+        // Must be used with filter_length_min or filter_length_max
+        // Example: filter_length_min = 4, filter_length_max = 6
+        // Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
+        // Output: ["fooba", "foobar"]
+        bool use_filter_length = 130;
+
+        // Minimum token length to keep (inclusive)
+        // Must be used with use_filter_length
+        int32 filter_length_min = 131 [(Ydb.value) = ">= 0"];
+
+        // Maximum token length to keep (inclusive)
+        // Must be used with use_filter_length
+        int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
+    }
+
+    // Represents text analyzers settings for a specific column
+    message ColumnAnalyzers {
+        // Name of the column to be indexed
+        string column = 1;
+
+        // Analyzer settings specific to this column
+        Analyzers analyzers = 2;
+    }
+
     // See Layout enum
     Layout layout = 1;
 
-    // See Tokenizer enum
-    Tokenizer tokenizer = 2;
-
-    // Language used for language-sensitive operations like stopword filtering
-    // Example: language = "english"
-    // By default is not specified and no language-specific logic is applied
-    string language = 3;
-
-    // Whether to convert tokens to lowercase
-    // Example:
-    // Token: "Quick"
-    // Output: "quick"
-    bool use_filter_lowercase = 100;
-
-    // Whether to remove common stopwords like "the", "a", "is"
-    // Example: language = "english"
-    // Tokens: ["the", "quick", "brown"]
-    // Output: ["quick", "brown"]
-    bool use_filter_stopwords = 110;
-
-    // Whether to apply character n-gram indexing to each token
-    // Must be used with filter_ngram_min_length and filter_ngram_max_length
-    // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
-    // Token: "search"
-    // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"]
-    bool use_filter_ngram = 120;
-
-    // Whether to apply edge n-gram indexing (prefix-based) to each token
-    // Used with filter_ngram_min_length and filter_ngram_max_length
-    // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4
-    // Token: "search"
-    // Output: ["sea", "sear"]
-    bool use_filter_edge_ngram = 121;
-
-    // Minimum length of n-grams to generate (inclusive)
-    // Must be used with use_filter_ngram or use_filter_edge_ngram
-    // Default value is 3
-    int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"];
-
-    // Maximum length of n-grams to generate (inclusive)
-    // Must be used with use_filter_ngram or use_filter_edge_ngram
-    // Default value is 4
-    int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"];
-
-    // Whether to filter tokens by their length
-    // Must be used with filter_length_min or filter_length_max
-    // Example: filter_length_min = 4, filter_length_max = 6
-    // Tokens: ["foo", "fooba", "foobar", "foobarbaz"]
-    // Output: ["fooba", "foobar"]
-    bool use_filter_length = 130;
-
-    // Minimum token length to keep (inclusive)
-    // Must be used with use_filter_length
-    int32 filter_length_min = 131 [(Ydb.value) = ">= 0"];
-
-    // Maximum token length to keep (inclusive)
-    // Must be used with use_filter_length
-    int32 filter_length_max = 132 [(Ydb.value) = ">= 0"];
+    // Per-column fulltext analyzer settings
+    // Currently this list must contain exactly one entry,
+    // and its column must be the only one in the TableIndex.index_columns list
+    // NOTE(review): tag 2 previously belonged to `Tokenizer tokenizer`; confirm no persisted or wire data relies on the old layout
+    repeated ColumnAnalyzers columns = 2;
 }
 
 message GlobalFulltextIndex {