From 616b9cd948e9d2730fb7c7446537a5cc6bd3f39e Mon Sep 17 00:00:00 2001
From: Stanislav Tebloev
Date: Tue, 11 Nov 2025 12:15:57 +0000
Subject: [PATCH 1/2] Add n-gram kqp tests

---
Review note: AddIndexNGram formats its size_t n-gram length arguments with
%zu (previously %d); the two boolean flags are promoted to int and stay %d.

 .../ut/indexes/kqp_indexes_fulltext_ut.cpp    | 93 +++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp
index 229d5f5fa6c5..176aa2e91ff5 100644
--- a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp
+++ b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp
@@ -61,6 +61,24 @@ void AddIndex(NQuery::TQueryClient& db) {
     UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
 }
 
+void AddIndexNGram(NQuery::TQueryClient& db, const size_t nGramMinLength = 3, const size_t nGramMaxLength = 3, const bool edgeNGram = false) {
+    const TString query = Sprintf(R"sql(
+        ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx
+            GLOBAL USING fulltext
+            ON (Text)
+            WITH (
+                layout=flat,
+                tokenizer=standard,
+                use_filter_ngram=%d,
+                use_filter_edge_ngram=%d,
+                filter_ngram_min_length=%zu,
+                filter_ngram_max_length=%zu
+            );
+    )sql", !edgeNGram, edgeNGram, nGramMinLength, nGramMaxLength);
+    auto result = db.ExecuteQuery(query, NYdb::NQuery::TTxControl::NoTx()).ExtractValueSync();
+    UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
+}
+
 void AddIndexCovered(NQuery::TQueryClient& db) {
     TString query = R"sql(
         ALTER TABLE `/Root/Texts` ADD INDEX fulltext_idx
@@ -131,6 +149,81 @@ Y_UNIT_TEST(AddIndexCovered) {
     ])", NYdb::FormatResultSetYson(index));
 }
 
+Y_UNIT_TEST(AddIndexNGram) {
+    auto kikimr = Kikimr();
+    auto db = kikimr.GetQueryClient();
+
+    CreateTexts(db);
+    UpsertTexts(db);
+    AddIndexNGram(db);
+
+    const auto index = ReadIndex(db);
+    CompareYson(R"([
+        [[100u];"Cat"];
+        [[300u];"Cat"];
+        [[200u];"Dog"];
+        [[400u];"Fox"];
+        [[100u];"all"];
+        [[200u];"all"];
+        [[100u];"als"];
+        [[100u];"ani"];
+        [[100u];"ase"];
+        [[200u];"ase"];
+        [[100u];"ats"];
+        [[200u];"ats"];
+        [[300u];"ats"];
+        [[200u];"cat"];
+        [[300u];"cat"];
+        [[100u];"cha"];
+        [[200u];"cha"];
+        [[400u];"dog"];
+        [[100u];"has"];
+        [[200u];"has"];
+        [[100u];"ima"];
+        [[300u];"lov"];
+        [[400u];"lov"];
+        [[100u];"mal"];
+        [[200u];"mal"];
+        [[100u];"nim"];
+        [[200u];"ogs"];
+        [[400u];"ogs"];
+        [[300u];"ove"];
+        [[400u];"ove"];
+        [[400u];"oxe"];
+        [[100u];"sma"];
+        [[200u];"sma"];
+        [[400u];"xes"]
+    ])", NYdb::FormatResultSetYson(index));
+}
+
+Y_UNIT_TEST(AddIndexEdgeNGram) {
+    auto kikimr = Kikimr();
+    auto db = kikimr.GetQueryClient();
+
+    CreateTexts(db);
+    UpsertTexts(db);
+    AddIndexNGram(db, 3, 3, true);
+
+    const auto index = ReadIndex(db);
+    Cerr << NYdb::FormatResultSetYson(index) << Endl;
+    CompareYson(R"([
+        [[100u];"Cat"];
+        [[300u];"Cat"];
+        [[200u];"Dog"];
+        [[400u];"Fox"];
+        [[100u];"ani"];
+        [[200u];"cat"];
+        [[300u];"cat"];
+        [[100u];"cha"];
+        [[200u];"cha"];
+        [[400u];"dog"];
+        [[300u];"lov"];
+        [[400u];"lov"];
+        [[100u];"sma"];
+        [[200u];"sma"]
+    ])", NYdb::FormatResultSetYson(index));
+}
+
 Y_UNIT_TEST(InsertRow) {
     auto kikimr = Kikimr();
     auto db = kikimr.GetQueryClient();

From dbc8196c5c5b67d5e7de356ff3cf69a6eb3c0682 Mon Sep 17 00:00:00 2001
From: Stanislav Tebloev
Date: Thu, 13 Nov 2025 11:02:15 +0000
Subject: [PATCH 2/2] Use lowercase filter

---
 .../kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp
index 176aa2e91ff5..80d08eb29b3e 100644
--- a/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp
+++ b/ydb/core/kqp/ut/indexes/kqp_indexes_fulltext_ut.cpp
@@ -69,6 +69,7 @@ void AddIndexNGram(NQuery::TQueryClient& db, const size_t nGramMinLength = 3, co
             WITH (
                 layout=flat,
                 tokenizer=standard,
+                use_filter_lowercase=true,
                 use_filter_ngram=%d,
                 use_filter_edge_ngram=%d,
                 filter_ngram_min_length=%zu,
@@ -158,11 +159,8 @@ Y_UNIT_TEST(AddIndexNGram) {
     AddIndexNGram(db);
 
     const auto index = ReadIndex(db);
+
     CompareYson(R"([
-        [[100u];"Cat"];
-        [[300u];"Cat"];
-        [[200u];"Dog"];
-        [[400u];"Fox"];
         [[100u];"all"];
         [[200u];"all"];
         [[100u];"als"];
@@ -172,11 +170,14 @@ Y_UNIT_TEST(AddIndexNGram) {
         [[100u];"ats"];
         [[200u];"ats"];
         [[300u];"ats"];
+        [[100u];"cat"];
         [[200u];"cat"];
         [[300u];"cat"];
         [[100u];"cha"];
         [[200u];"cha"];
+        [[200u];"dog"];
         [[400u];"dog"];
+        [[400u];"fox"];
         [[100u];"has"];
         [[200u];"has"];
         [[100u];"ima"];
@@ -207,16 +208,15 @@ Y_UNIT_TEST(AddIndexEdgeNGram) {
     const auto index = ReadIndex(db);
     Cerr << NYdb::FormatResultSetYson(index) << Endl;
     CompareYson(R"([
-        [[100u];"Cat"];
-        [[300u];"Cat"];
-        [[200u];"Dog"];
-        [[400u];"Fox"];
         [[100u];"ani"];
+        [[100u];"cat"];
         [[200u];"cat"];
         [[300u];"cat"];
         [[100u];"cha"];
         [[200u];"cha"];
+        [[200u];"dog"];
         [[400u];"dog"];
+        [[400u];"fox"];
         [[300u];"lov"];
         [[400u];"lov"];
         [[100u];"sma"];