From 5a188c2510d38d096a565e6043a5c23a9c0c0442 Mon Sep 17 00:00:00 2001 From: kungasc Date: Mon, 8 Sep 2025 17:56:06 +0300 Subject: [PATCH 1/4] Support multiple columns in fulltext index --- ydb/core/tx/schemeshard/schemeshard_utils.h | 12 + .../tx/schemeshard/ut_helpers/ls_checks.cpp | 42 +++- .../ut_index/ut_fulltext_index.cpp | 205 ++++++++++++++---- ydb/public/api/protos/ydb_table.proto | 85 +++++--- 4 files changed, 258 insertions(+), 86 deletions(-) diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.h b/ydb/core/tx/schemeshard/schemeshard_utils.h index d23dfbdb89a2..cf299e6564ae 100644 --- a/ydb/core/tx/schemeshard/schemeshard_utils.h +++ b/ydb/core/tx/schemeshard/schemeshard_utils.h @@ -194,6 +194,18 @@ bool CommonCheck(const TTableDesc& tableDesc, const NKikimrSchemeOp::TIndexCreat error = TStringBuilder() << "fulltext index can only have a single key text column"; return false; } + if (indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().size() != 1) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + error = TStringBuilder() << "fulltext index should have single '" << indexKeys.KeyColumns.at(0) << "' column settings" + << " but have " << indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().size() << " of them"; + return false; + } + if (indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().at(0).Getcolumn() != indexKeys.KeyColumns.at(0)) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + error = TStringBuilder() << "fulltext index should have '" << indexKeys.KeyColumns.at(0) << "' column settings" + << " but have '" << indexDesc.GetFulltextIndexDescription().GetSettings().Getcolumns().at(0).Getcolumn() << "' column settings"; + return false; + } const TString& indexColumnName = indexKeys.KeyColumns.back(); Y_ABORT_UNLESS(baseColumnTypes.contains(indexColumnName)); diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp index a75939072c02..cd06a10a1909 100644 --- a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp +++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp @@ -1,5 +1,6 @@ #include "ls_checks.h" +#include #include #include #include @@ -914,20 +915,37 @@ TCheckFunc KMeansTreeDescription(Ydb::Table::VectorIndexSettings_Metric metric, TCheckFunc SpecializedIndexDescription(const TString& proto) { return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) { - TString actual; switch (record.GetPathDescription().GetTableIndex().GetSpecializedIndexDescriptionCase()) { - case NKikimrSchemeOp::TIndexDescription::kVectorIndexKmeansTreeDescription: - actual = record.GetPathDescription().GetTableIndex().GetVectorIndexKmeansTreeDescription().GetSettings().ShortDebugString(); - break; - case NKikimrSchemeOp::TIndexDescription::kFulltextIndexDescription: - actual = record.GetPathDescription().GetTableIndex().GetFulltextIndexDescription().GetSettings().ShortDebugString(); - break; - case NKikimrSchemeOp::TIndexDescription::SPECIALIZEDINDEXDESCRIPTION_NOT_SET: - actual = "SPECIALIZEDINDEXDESCRIPTION_NOT_SET"; - break; + case NKikimrSchemeOp::TIndexDescription::kVectorIndexKmeansTreeDescription: { + auto actual = record.GetPathDescription().GetTableIndex().GetVectorIndexKmeansTreeDescription().GetSettings(); + Ydb::Table::KMeansTreeSettings expected; + UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected)); + UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected), + TStringBuilder() << "Expected" + << expected.ShortDebugString() + << " but got " + << actual.ShortDebugString()); + break; + } + case NKikimrSchemeOp::TIndexDescription::kFulltextIndexDescription: { + auto actual = record.GetPathDescription().GetTableIndex().GetFulltextIndexDescription().GetSettings(); + Ydb::Table::FulltextIndexSettings expected; + UNIT_ASSERT(google::protobuf::TextFormat::ParseFromString(proto, &expected)); + UNIT_ASSERT_C(google::protobuf::util::MessageDifferencer::Equals(actual, expected), + TStringBuilder() << "Expected" + << expected.ShortDebugString() + << " but got " + << actual.ShortDebugString()); + break; + } + case NKikimrSchemeOp::TIndexDescription::SPECIALIZEDINDEXDESCRIPTION_NOT_SET: { + UNIT_ASSERT_C(proto == "SPECIALIZEDINDEXDESCRIPTION_NOT_SET", + TStringBuilder() << "Expected" + << proto + << " but got SPECIALIZEDINDEXDESCRIPTION_NOT_SET"); + break; + } } - - UNIT_ASSERT_VALUES_EQUAL(actual, proto); }; } diff --git a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp index e9f772e8e3fc..6c147d7af6ad 100644 --- a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp +++ b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp @@ -18,23 +18,38 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { TTestEnv env(runtime); ui64 txId = 100; - TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + settings: { + tokenizer: STANDARD + use_filter_ngram: true + filter_ngram_max_length: 42 + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( TableDescription { - Name: "texts" - Columns { Name: "id" Type: "Uint64" } - Columns { Name: "text" Type: "String" } - Columns { Name: "covered" Type: "String" } - Columns { Name: "another" Type: "Uint64" } - KeyColumnNames: ["id"] + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] } IndexDescription { - Name: "idx_fulltext" - KeyColumnNames: ["text"] - DataColumnNames: ["covered"] - Type: EIndexTypeGlobalFulltext - FulltextIndexDescription: { Settings: { layout: FLAT, tokenizer: STANDARD, use_filter_ngram: true, filter_ngram_max_length: 42 } } + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } } - )"); + )", fulltextSettings.c_str())); env.TestWaitNotification(runtime, txId); NKikimrSchemeOp::TDescribeOptions opts; @@ -49,7 +64,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { NLs::IndexState(NKikimrSchemeOp::EIndexStateReady), NLs::IndexKeys({"text"}), NLs::IndexDataColumns({"covered"}), - NLs::SpecializedIndexDescription("layout: FLAT tokenizer: STANDARD use_filter_ngram: true filter_ngram_max_length: 42"), + NLs::SpecializedIndexDescription(fulltextSettings), NLs::ChildrenCount(1), }); @@ -70,23 +85,38 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { TTestEnv env(runtime); ui64 txId = 100; - TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + settings: { + tokenizer: STANDARD + use_filter_ngram: true + filter_ngram_max_length: 42 + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( TableDescription { - Name: "texts" - Columns { Name: "id" Type: "Uint64" } - Columns { Name: "text" Type: "String" } - Columns { Name: "covered" Type: "String" } - Columns { Name: "another" Type: "Uint64" } - KeyColumnNames: [ "id"] + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] } IndexDescription { - Name: "idx_fulltext" - KeyColumnNames: [ "another", "text"] - DataColumnNames: ["covered"] - Type: EIndexTypeGlobalFulltext - FulltextIndexDescription: { Settings: { layout: FLAT, tokenizer: STANDARD, use_filter_ngram: true, filter_ngram_max_length: 42 } } + Name: "idx_fulltext" + KeyColumnNames: [ "another", "text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } } - )", {NKikimrScheme::StatusInvalidParameter}); + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); env.TestWaitNotification(runtime, txId); TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ @@ -99,23 +129,118 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { TTestEnv env(runtime); ui64 txId = 100; - TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text" + settings: { + tokenizer: STANDARD + use_filter_ngram: true + filter_ngram_max_length: 42 + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "Uint64" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + + Y_UNIT_TEST(CreateTableColumnsMismatch) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text_wrong" + settings: { + tokenizer: STANDARD + use_filter_ngram: true + filter_ngram_max_length: 42 + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + + Y_UNIT_TEST(CreateTableNoColumnsSettings) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( TableDescription { - Name: "texts" - Columns { Name: "id" Type: "Uint64" } - Columns { Name: "text" Type: "Uint64" } - Columns { Name: "covered" Type: "String" } - Columns { Name: "another" Type: "Uint64" } - KeyColumnNames: ["id"] + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] } IndexDescription { - Name: "idx_fulltext" - KeyColumnNames: ["text"] - DataColumnNames: ["covered"] - Type: EIndexTypeGlobalFulltext - FulltextIndexDescription: { Settings: { layout: FLAT, tokenizer: STANDARD, use_filter_ngram: true, filter_ngram_max_length: 42 } } + Name: "idx_fulltext" + KeyColumnNames: ["text"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } } - )", {NKikimrScheme::StatusInvalidParameter}); + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); env.TestWaitNotification(runtime, txId); TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index ea0bb46f00d5..5f784c654794 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -113,35 +113,7 @@ message GlobalVectorKMeansTreeIndex { KMeansTreeSettings vector_settings = 3; } -message FulltextIndexSettings { - // Specifies the layout strategy for storing and updating the full-text index - enum Layout { - LAYOUT_UNSPECIFIED = 0; - - // Uses a single flat inverted index table (indexImplTable) - // Example source table: - // ┌────┬────────────────────────────┐ - // │ id │ text │ - // ├────┼────────────────────────────┤ - // │ 1 │ "The quick brown fox" │ - // │ 2 │ "The quick blue hare" │ - // └────┴────────────────────────────┘ - // Example inverted index table (indexImplTable): - // ┌──────────────┬────┐ - // │ __ydb_token │ id │ - // ├──────────────┼────┤ - // │ "blue" │ 2 │ - // │ "brown" │ 1 │ - // │ "fox" │ 1 │ - // │ "hare" │ 2 │ - // │ "quick" │ 1 │ - // │ "quick" │ 2 │ - // │ "The" │ 1 │ - // │ "The" │ 2 │ - // └──────────────┴────┘ - FLAT = 1; - } - +message FulltextIndexAnalyzerSettings { // Specifies how text is tokenized during indexing enum Tokenizer { TOKENIZER_UNSPECIFIED = 0; @@ -168,16 +140,13 @@ message FulltextIndexSettings { KEYWORD = 3; } - // See Layout enum - Layout layout = 1; - // See Tokenizer enum - Tokenizer tokenizer = 2; + Tokenizer tokenizer = 1; // Language used for language-sensitive operations like stopword filtering // Example: language = "english" // By default is not specified and no language-specific logic is applied - string language = 3; + string language = 2; // Whether to convert tokens to lowercase // Example: @@ -231,6 +200,54 @@ message FulltextIndexSettings { int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; } +// Represents fulltext index settings for a single column +message FulltextIndexColumnSettings { + // Name of the column to be indexed + string column = 1; + + // Fulltext index analyzer settings specific to this column + FulltextIndexAnalyzerSettings settings = 2; +} + +message FulltextIndexSettings { + // Specifies the layout strategy for storing and updating the full-text index + enum Layout { + LAYOUT_UNSPECIFIED = 0; + + // Uses a single flat inverted index table (indexImplTable) + // Example source table: + // ┌────┬────────────────────────────┐ + // │ id │ text │ + // ├────┼────────────────────────────┤ + // │ 1 │ "The quick brown fox" │ + // │ 2 │ "The quick blue hare" │ + // └────┴────────────────────────────┘ + // Example inverted index table (indexImplTable): + // ┌──────────────┬────┐ + // │ __ydb_token │ id │ + // ├──────────────┼────┤ + // │ "blue" │ 2 │ + // │ "brown" │ 1 │ + // │ "fox" │ 1 │ + // │ "hare" │ 2 │ + // │ "quick" │ 1 │ + // │ "quick" │ 2 │ + // │ "The" │ 1 │ + // │ "The" │ 2 │ + // └──────────────┴────┘ + // Supports single column only + FLAT = 1; + } + + // See Layout enum + Layout layout = 1; + + // List of columns and their fulltext settings + // Currently, this list should contain a single entry + // And provided column should be the only one in the TableIndex.index_columns list + repeated FulltextIndexColumnSettings columns = 2; +} + message GlobalFulltextIndex { GlobalIndexSettings settings = 1; FulltextIndexSettings fulltext_settings = 2; From ca1ada5ea1dab9c7e8f79ec6f317335477f4f9b2 Mon Sep 17 00:00:00 2001 From: kungasc Date: Tue, 9 Sep 2025 11:55:22 +0300 Subject: [PATCH 2/4] better structure inside FulltextIndexSettings --- .../ut_index/ut_fulltext_index.cpp | 8 +- ydb/public/api/protos/ydb_table.proto | 195 +++++++++--------- 2 files changed, 102 insertions(+), 101 deletions(-) diff --git a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp index 6c147d7af6ad..4cf0e1cacc6f 100644 --- a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp +++ b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp @@ -22,7 +22,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { layout: FLAT columns: { column: "text" - settings: { + analyzers: { tokenizer: STANDARD use_filter_ngram: true filter_ngram_max_length: 42 @@ -89,7 +89,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { layout: FLAT columns: { column: "text" - settings: { + analyzers: { tokenizer: STANDARD use_filter_ngram: true filter_ngram_max_length: 42 @@ -133,7 +133,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { layout: FLAT columns: { column: "text" - settings: { + analyzers: { tokenizer: STANDARD use_filter_ngram: true filter_ngram_max_length: 42 @@ -177,7 +177,7 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { layout: FLAT columns: { column: "text_wrong" - settings: { + analyzers: { tokenizer: STANDARD use_filter_ngram: true filter_ngram_max_length: 42 diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index 5f784c654794..59dc37224a0f 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -113,102 +113,6 @@ message GlobalVectorKMeansTreeIndex { KMeansTreeSettings vector_settings = 3; } -message FulltextIndexAnalyzerSettings { - // Specifies how text is tokenized during indexing - enum Tokenizer { - TOKENIZER_UNSPECIFIED = 0; - - // Splits text only by whitespace - // Does not split on punctuation - // Example: - // Text: "foo-bar baz_lorem ipsum" - // Tokens: ["foo-bar", "baz_lorem", "ipsum"] - WHITESPACE = 1; - - // Applies general language-aware tokenization - // Splits text on whitespace and punctuation - // Example: - // Text: "foo-bar baz_lorem ipsum" - // Tokens: ["foo", "bar", "baz", "lorem", "ipsum"] - STANDARD = 2; - - // Treats the entire input as a single token - // No splitting is performed - // Example: - // Text: "Hello World!" - // Tokens: ["Hello World!"] - KEYWORD = 3; - } - - // See Tokenizer enum - Tokenizer tokenizer = 1; - - // Language used for language-sensitive operations like stopword filtering - // Example: language = "english" - // By default is not specified and no language-specific logic is applied - string language = 2; - - // Whether to convert tokens to lowercase - // Example: - // Token: "Quick" - // Output: "quick" - bool use_filter_lowercase = 100; - - // Whether to remove common stopwords like "the", "a", "is" - // Example: language = "english" - // Tokens: ["the", "quick", "brown"] - // Output: ["quick", "brown"] - bool use_filter_stopwords = 110; - - // Whether to apply character n-gram indexing to each token - // Must be used with filter_ngram_min_length and filter_ngram_max_length - // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 - // Token: "search" - // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"] - bool use_filter_ngram = 120; - - // Whether to apply edge n-gram indexing (prefix-based) to each token - // Used with filter_ngram_min_length and filter_ngram_max_length - // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 - // Token: "search" - // Output: ["sea", "sear"] - bool use_filter_edge_ngram = 121; - - // Minimum length of n-grams to generate (inclusive) - // Must be used with use_filter_ngram or use_filter_edge_ngram - // Default value is 3 - int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"]; - - // Maximum length of n-grams to generate (inclusive) - // Must be used with use_filter_ngram or use_filter_edge_ngram - // Default value is 4 - int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"]; - - // Whether to filter tokens by their length - // Must be used with filter_length_min or filter_length_max - // Example: filter_length_min = 4, filter_length_max = 6 - // Tokens: ["foo", "fooba", "foobar", "foobarbaz"] - // Output: ["fooba", "foobar"] - bool use_filter_length = 130; - - // Minimum token length to keep (inclusive) - // Must be used with use_filter_length - int32 filter_length_min = 131 [(Ydb.value) = ">= 0"]; - - // Maximum token length to keep (inclusive) - // Must be used with use_filter_length - int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; -} - -// Represents fulltext index settings for a single column -message FulltextIndexColumnSettings { - // Name of the column to be indexed - string column = 1; - - // Fulltext index analyzer settings specific to this column - FulltextIndexAnalyzerSettings settings = 2; -} - message FulltextIndexSettings { // Specifies the layout strategy for storing and updating the full-text index enum Layout { @@ -239,13 +143,110 @@ message FulltextIndexSettings { FLAT = 1; } + // Specifies how text is tokenized during indexing + enum Tokenizer { + TOKENIZER_UNSPECIFIED = 0; + + // Splits text only by whitespace + // Does not split on punctuation + // Example: + // Text: "foo-bar baz_lorem ipsum" + // Tokens: ["foo-bar", "baz_lorem", "ipsum"] + WHITESPACE = 1; + + // Applies general language-aware tokenization + // Splits text on whitespace and punctuation + // Example: + // Text: "foo-bar baz_lorem ipsum" + // Tokens: ["foo", "bar", "baz", "lorem", "ipsum"] + STANDARD = 2; + + // Treats the entire input as a single token + // No splitting is performed + // Example: + // Text: "Hello World!" + // Tokens: ["Hello World!"] + KEYWORD = 3; + } + + // Represents text analyzers settings + message Analyzers { + // See Tokenizer enum + Tokenizer tokenizer = 1; + + // Language used for language-sensitive operations like stopword filtering + // Example: language = "english" + // By default is not specified and no language-specific logic is applied + string language = 2; + + // Whether to convert tokens to lowercase + // Example: + // Token: "Quick" + // Output: "quick" + bool use_filter_lowercase = 100; + + // Whether to remove common stopwords like "the", "a", "is" + // Example: language = "english" + // Tokens: ["the", "quick", "brown"] + // Output: ["quick", "brown"] + bool use_filter_stopwords = 110; + + // Whether to apply character n-gram indexing to each token + // Must be used with filter_ngram_min_length and filter_ngram_max_length + // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 + // Token: "search" + // Output: ["sea", "ear", "arc", "rch", "sear", "earc", "arch"] + bool use_filter_ngram = 120; + + // Whether to apply edge n-gram indexing (prefix-based) to each token + // Used with filter_ngram_min_length and filter_ngram_max_length + // Example: filter_ngram_min_length = 3, filter_ngram_max_length = 4 + // Token: "search" + // Output: ["sea", "sear"] + bool use_filter_edge_ngram = 121; + + // Minimum length of n-grams to generate (inclusive) + // Must be used with use_filter_ngram or use_filter_edge_ngram + // Default value is 3 + int32 filter_ngram_min_length = 122 [(Ydb.value) = ">= 0"]; + + // Maximum length of n-grams to generate (inclusive) + // Must be used with use_filter_ngram or use_filter_edge_ngram + // Default value is 4 + int32 filter_ngram_max_length = 123 [(Ydb.value) = ">= 0"]; + + // Whether to filter tokens by their length + // Must be used with filter_length_min or filter_length_max + // Example: filter_length_min = 4, filter_length_max = 6 + // Tokens: ["foo", "fooba", "foobar", "foobarbaz"] + // Output: ["fooba", "foobar"] + bool use_filter_length = 130; + + // Minimum token length to keep (inclusive) + // Must be used with use_filter_length + int32 filter_length_min = 131 [(Ydb.value) = ">= 0"]; + + // Maximum token length to keep (inclusive) + // Must be used with use_filter_length + int32 filter_length_max = 132 [(Ydb.value) = ">= 0"]; + } + + // Represents text analyzers settings for a specific column + message ColumnAnalyzers { + // Name of the column to be indexed + string column = 1; + + // Analyzer settings specific to this column + Analyzers analyzers = 2; + } + // See Layout enum Layout layout = 1; // List of columns and their fulltext settings // Currently, this list should contain a single entry // And provided column should be the only one in the TableIndex.index_columns list - repeated FulltextIndexColumnSettings columns = 2; + repeated ColumnAnalyzers columns = 2; } message GlobalFulltextIndex { From ac8a824230f6724efc4adb43e96a5c08a8591e0c Mon Sep 17 00:00:00 2001 From: kungasc Date: Tue, 9 Sep 2025 11:59:43 +0300 Subject: [PATCH 3/4] fix typo --- ydb/public/api/protos/ydb_table.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index 59dc37224a0f..5237514c8b3e 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -139,7 +139,7 @@ message FulltextIndexSettings { // │ "The" │ 1 │ // │ "The" │ 2 │ // └──────────────┴────┘ - // Supports single column only + // Supports a single column only FLAT = 1; } From fb8f9d8fac7bb2f8f3fdb0a070b472bc9f676ef7 Mon Sep 17 00:00:00 2001 From: kungasc Date: Tue, 9 Sep 2025 14:40:30 +0300 Subject: [PATCH 4/4] cr: add multiple columns tests --- .../ut_index/ut_fulltext_index.cpp | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp index 4cf0e1cacc6f..26de835d2c67 100644 --- a/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp +++ b/ydb/core/tx/schemeshard/ut_index/ut_fulltext_index.cpp @@ -124,6 +124,59 @@ Y_UNIT_TEST_SUITE(TFulltextIndexTests) { }); } + Y_UNIT_TEST(CreateTableMultipleColumns) { // not supported for now, maybe later + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TString fulltextSettings = R"( + layout: FLAT + columns: { + column: "text1" + analyzers: { + tokenizer: STANDARD + use_filter_ngram: true + filter_ngram_max_length: 42 + } + } + columns: { + column: "text2" + analyzers: { + tokenizer: STANDARD + use_filter_ngram: true + filter_ngram_max_length: 42 + } + } + )"; + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", Sprintf(R"( + TableDescription { + Name: "texts" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "text1" Type: "String" } + Columns { Name: "text2" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "Uint64" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_fulltext" + KeyColumnNames: ["text1", "text2"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalFulltext + FulltextIndexDescription: { + Settings: { + %s + } + } + } + )", fulltextSettings.c_str()), {NKikimrScheme::StatusInvalidParameter}); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/texts/idx_fulltext"),{ + NLs::PathNotExist, + }); + } + Y_UNIT_TEST(CreateTableNotText) { TTestBasicRuntime runtime; TTestEnv env(runtime);