diff --git a/ydb/core/base/table_index.cpp b/ydb/core/base/table_index.cpp index fa63ed0f90c7..bc6bdea57e40 100644 --- a/ydb/core/base/table_index.cpp +++ b/ydb/core/base/table_index.cpp @@ -13,20 +13,24 @@ TVector::const_iterator IsUniq(const TVector& names) { return names.end(); } +bool Contains(const TVector& names, TString str) { + return std::find(names.begin(), names.end(), str) != names.end(); +} + namespace NKikimr { namespace NTableIndex { -TTableColumns CalcTableImplDescription(const TTableColumns& table, const TIndexColumns& index) { - { - TString explain; - Y_ABORT_UNLESS(IsCompatibleIndex(table, index, explain), "explain is %s", explain.c_str()); - } - +TTableColumns CalcTableImplDescription(const NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index) { TTableColumns result; - for (const auto& ik: index.KeyColumns) { - result.Keys.push_back(ik); - result.Columns.insert(ik); + if (indexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { + result.Keys.push_back(NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn); + result.Columns.insert(NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn); + } else { + for (const auto& ik: index.KeyColumns) { + result.Keys.push_back(ik); + result.Columns.insert(ik); + } } for (const auto& tk: table.Keys) { @@ -43,7 +47,9 @@ TTableColumns CalcTableImplDescription(const TTableColumns& table, const TIndexC return result; } -bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, TString& explain) { +bool IsCompatibleIndex(const NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain) { + const bool isVectorIndex = indexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree; + { auto brokenAt = IsUniq(table.Keys); if (brokenAt != table.Keys.end()) { @@ -71,6 +77,23 @@ bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, T } } + if 
(isVectorIndex) { + if (index.KeyColumns.size() != 1) { + explain = "Only single key column is supported for vector index"; + return false; + } + + if (Contains(index.KeyColumns, NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn)) { + explain = TStringBuilder() << "Key column should not have a reserved name: " << NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn; + return false; + } + + if (Contains(index.DataColumns, NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn)) { + explain = TStringBuilder() << "Data column should not have a reserved name: " << NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn; + return false; + } + } + THashSet indexKeys; for (const auto& tableKeyName: table.Keys) { @@ -84,7 +107,8 @@ bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, T } for (const auto& indexKeyName: index.KeyColumns) { - indexKeys.insert(indexKeyName); + if (!isVectorIndex) + indexKeys.insert(indexKeyName); if (!table.Columns.contains(indexKeyName)) { explain = TStringBuilder() << "all index keys should be in table columns" @@ -93,9 +117,9 @@ bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, T } } - if (index.KeyColumns == table.Keys) { + if (index.KeyColumns == table.Keys && !isVectorIndex) { explain = TStringBuilder() - << "table and index keys are the same"; + << "table and index keys are the same"; return false; } diff --git a/ydb/core/base/table_index.h b/ydb/core/base/table_index.h index ea1328cdb614..7d7af7b915d0 100644 --- a/ydb/core/base/table_index.h +++ b/ydb/core/base/table_index.h @@ -1,5 +1,8 @@ #pragma once +#include "table_vector_index.h" +#include + #include #include #include @@ -18,8 +21,8 @@ struct TIndexColumns { TVector DataColumns; }; -bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, TString& explain); -TTableColumns CalcTableImplDescription(const TTableColumns& table, const TIndexColumns &index); +bool IsCompatibleIndex(const 
NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain); +TTableColumns CalcTableImplDescription(const NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index); } } diff --git a/ydb/core/base/table_vector_index.h b/ydb/core/base/table_vector_index.h new file mode 100644 index 000000000000..1b6e24508314 --- /dev/null +++ b/ydb/core/base/table_vector_index.h @@ -0,0 +1,19 @@ +#pragma once + +namespace NKikimr::NTableIndex::NTableVectorKmeansTreeIndex { + +// Vector KmeansTree index tables description + +// Levels table +inline constexpr const char* LevelTable = "indexImplLevelTable"; +inline constexpr const char* LevelTable_ParentIdColumn = "-parent"; +inline constexpr const char* LevelTable_IdColumn = "-id"; +inline constexpr const char* LevelTable_EmbeddingColumn = "-embedding"; + +// Posting table +inline constexpr const char* PostingTable = "indexImplPostingTable"; +inline constexpr const char* PostingTable_ParentIdColumn = "-parent"; + + +} + diff --git a/ydb/core/kqp/host/kqp_gateway_proxy.cpp b/ydb/core/kqp/host/kqp_gateway_proxy.cpp index 3c717ee29c7c..61b587e96c10 100644 --- a/ydb/core/kqp/host/kqp_gateway_proxy.cpp +++ b/ydb/core/kqp/host/kqp_gateway_proxy.cpp @@ -572,18 +572,7 @@ class TKqpGatewayProxy : public IKikimrGateway { for (const auto& index : metadata->Indexes) { auto indexDesc = schemeTx.MutableCreateIndexedTable()->AddIndexDescription(); indexDesc->SetName(index.Name); - switch (index.Type) { - case NYql::TIndexDescription::EType::GlobalSync: - indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobal); - break; - case NYql::TIndexDescription::EType::GlobalAsync: - indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync); - break; - case NYql::TIndexDescription::EType::GlobalSyncUnique: - indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique); - break; - } - + 
indexDesc->SetType(TIndexDescription::ConvertIndexType(index.Type)); indexDesc->SetState(static_cast<::NKikimrSchemeOp::EIndexState>(index.State)); for (const auto& col : index.KeyColumns) { indexDesc->AddKeyColumnNames(col); diff --git a/ydb/core/kqp/provider/yql_kikimr_gateway.cpp b/ydb/core/kqp/provider/yql_kikimr_gateway.cpp index c0bf3edf6d61..897bbfc54190 100644 --- a/ydb/core/kqp/provider/yql_kikimr_gateway.cpp +++ b/ydb/core/kqp/provider/yql_kikimr_gateway.cpp @@ -115,9 +115,14 @@ void IKikimrGateway::BuildIndexMetadata(TTableMetadataResult& loadTableMetadataR for (size_t i = 0; i < indexesCount; i++) { const auto& index = tableMetadata->Indexes[i]; auto indexTablePath = NKikimr::NKqp::NSchemeHelpers::CreateIndexTablePath(tableName, index.Name); - NKikimr::NTableIndex::TTableColumns indexTableColumns = NKikimr::NTableIndex::CalcTableImplDescription( - tableColumns, - NKikimr::NTableIndex::TIndexColumns{index.KeyColumns, {}}); + + NKikimr::NTableIndex::TIndexColumns indexColumns{index.KeyColumns, {}}; + + TString error; + NKikimrSchemeOp::EIndexType indexType = TIndexDescription::ConvertIndexType(index.Type); + YQL_ENSURE(IsCompatibleIndex(indexType, tableColumns, indexColumns, error), "Index is not compatible: " << error); + + NKikimr::NTableIndex::TTableColumns indexTableColumns = NKikimr::NTableIndex::CalcTableImplDescription(indexType, tableColumns, indexColumns); TKikimrTableMetadataPtr indexTableMetadata = new TKikimrTableMetadata(cluster, indexTablePath); indexTableMetadata->DoesExist = true; diff --git a/ydb/core/kqp/provider/yql_kikimr_gateway.h b/ydb/core/kqp/provider/yql_kikimr_gateway.h index a5c666a23ab1..ca7cfecf0f3a 100644 --- a/ydb/core/kqp/provider/yql_kikimr_gateway.h +++ b/ydb/core/kqp/provider/yql_kikimr_gateway.h @@ -64,6 +64,7 @@ struct TIndexDescription { GlobalSync = 0, GlobalAsync = 1, GlobalSyncUnique = 2, + GlobalSyncVectorKMeansTree = 3 }; // Index states here must be in sync with NKikimrSchemeOp::EIndexState protobuf @@ -99,7 
+100,7 @@ struct TIndexDescription { : Name(index.GetName()) , KeyColumns(index.GetKeyColumnNames().begin(), index.GetKeyColumnNames().end()) , DataColumns(index.GetDataColumnNames().begin(), index.GetDataColumnNames().end()) - , Type(ConvertIndexType(index)) + , Type(ConvertIndexType(index.GetType())) , State(static_cast(index.GetState())) , SchemaVersion(index.GetSchemaVersion()) , LocalPathId(index.GetLocalPathId()) @@ -117,15 +118,32 @@ struct TIndexDescription { , PathOwnerId(message->GetPathOwnerId()) {} - static TIndexDescription::EType ConvertIndexType(const NKikimrSchemeOp::TIndexDescription& index) { - auto type = NYql::TIndexDescription::EType::GlobalSync; - if (index.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync) { - type = NYql::TIndexDescription::EType::GlobalAsync; - } else if (index.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique) { - type = NYql::TIndexDescription::EType::GlobalSyncUnique; + static TIndexDescription::EType ConvertIndexType(const NKikimrSchemeOp::EIndexType indexType) { + switch (indexType) { + case NKikimrSchemeOp::EIndexType::EIndexTypeGlobal: + return TIndexDescription::EType::GlobalSync; + case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync: + return TIndexDescription::EType::GlobalAsync; + case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique: + return TIndexDescription::EType::GlobalSyncUnique; + case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree: + return TIndexDescription::EType::GlobalSyncVectorKMeansTree; + default: + YQL_ENSURE(false, "Unexpected NKikimrSchemeOp::EIndexType::EIndexTypeInvalid"); } + } - return type; + static NKikimrSchemeOp::EIndexType ConvertIndexType(const TIndexDescription::EType indexType) { + switch (indexType) { + case TIndexDescription::EType::GlobalSync: + return NKikimrSchemeOp::EIndexType::EIndexTypeGlobal; + case TIndexDescription::EType::GlobalAsync: + return NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync; + case 
TIndexDescription::EType::GlobalSyncUnique: + return NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique; + case NYql::TIndexDescription::EType::GlobalSyncVectorKMeansTree: + return NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree; + } } void ToMessage(NKikimrKqp::TIndexDescriptionProto* message) const { @@ -160,6 +178,8 @@ struct TIndexDescription { return true; case EType::GlobalAsync: return false; + case EType::GlobalSyncVectorKMeansTree: + return true; } } }; diff --git a/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp b/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp index 4db4ac5dd9fb..7c5d2f282beb 100644 --- a/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp +++ b/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp @@ -2433,6 +2433,21 @@ Y_UNIT_TEST_SUITE(KqpScheme) { auto indexDesc = describe.GetTableDescription(); UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetPartitioningSettings().GetMinPartitionsCount(), minPartitionsCount); } + + constexpr int partitionSizeMb = 555; + { + auto result = session.ExecuteSchemeQuery(Sprintf(R"( + ALTER TABLE `/Root/SecondaryKeys` ALTER INDEX Index SET AUTO_PARTITIONING_PARTITION_SIZE_MB %d; + )", partitionSizeMb) + ).ExtractValueSync(); + UNIT_ASSERT_C(result.IsSuccess(), result.GetIssues().ToString()); + } + { + auto describe = session.DescribeTable("/Root/SecondaryKeys/Index/indexImplTable").GetValueSync(); + UNIT_ASSERT_C(describe.IsSuccess(), describe.GetIssues().ToString()); + auto indexDesc = describe.GetTableDescription(); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetPartitioningSettings().GetPartitionSizeMb(), partitionSizeMb); + } } Y_UNIT_TEST(AlterIndexImplTable) { @@ -2638,6 +2653,76 @@ Y_UNIT_TEST_SUITE(KqpScheme) { } } + Y_UNIT_TEST(CreateTableWithVectorIndexPublicApi) { + TKikimrRunner kikimr; + auto db = kikimr.GetTableClient(); + auto session = db.CreateSession().GetValueSync().GetSession(); + { + auto builder = TTableBuilder() + .AddNullableColumn("Key", EPrimitiveType::Uint64) + .AddNullableColumn("Embedding", EPrimitiveType::String) + 
.SetPrimaryKeyColumn("Key") + .AddVectorKMeansTreeSecondaryIndex("vector_idx", {"Embedding"}, + { NYdb::NTable::TVectorIndexSettings::EDistance::Cosine, + NYdb::NTable::TVectorIndexSettings::EVectorType::Float, + 1024}); + + auto result = session.CreateTable("/Root/TestTable", builder.Build()).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); + } + { + auto result = session.DescribeTable("/Root/TestTable").ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), NYdb::EStatus::SUCCESS); + + UNIT_ASSERT_VALUES_EQUAL(result.GetTableDescription().GetIndexDescriptions().size(), 1); + auto indexDesc = result.GetTableDescription().GetIndexDescriptions()[0]; + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexName(), "vector_idx"); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexType(), EIndexType::GlobalVectorKMeansTree); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns()[0], "Embedding"); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetDataColumns().size(), 0); + UNIT_ASSERT_VALUES_EQUAL(std::get(indexDesc.GetVectorIndexSettings()->Metric), NYdb::NTable::TVectorIndexSettings::EDistance::Cosine); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorType, NYdb::NTable::TVectorIndexSettings::EVectorType::Float); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorDimension, 1024); + } + } + + Y_UNIT_TEST(CreateTableWithVectorIndexCoveredPublicApi) { + TKikimrRunner kikimr; + auto db = kikimr.GetTableClient(); + auto session = db.CreateSession().GetValueSync().GetSession(); + { + auto builder = TTableBuilder() + .AddNullableColumn("Key", EPrimitiveType::Uint64) + .AddNullableColumn("Embedding", EPrimitiveType::String) + .AddNullableColumn("Covered", EPrimitiveType::String) + .SetPrimaryKeyColumn("Key") + .AddVectorKMeansTreeSecondaryIndex("vector_idx", {"Embedding"}, {"Covered"}, + { 
NYdb::NTable::TVectorIndexSettings::EDistance::Cosine, + NYdb::NTable::TVectorIndexSettings::EVectorType::Float, + 1024}); + + auto result = session.CreateTable("/Root/TestTable", builder.Build()).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); + } + { + auto result = session.DescribeTable("/Root/TestTable").ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), NYdb::EStatus::SUCCESS); + + UNIT_ASSERT_VALUES_EQUAL(result.GetTableDescription().GetIndexDescriptions().size(), 1); + auto indexDesc = result.GetTableDescription().GetIndexDescriptions()[0]; + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexName(), "vector_idx"); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexType(), EIndexType::GlobalVectorKMeansTree); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns()[0], "Embedding"); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetDataColumns().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetDataColumns()[0], "Covered"); + UNIT_ASSERT_VALUES_EQUAL(std::get(indexDesc.GetVectorIndexSettings()->Metric), NYdb::NTable::TVectorIndexSettings::EDistance::Cosine); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorType, NYdb::NTable::TVectorIndexSettings::EVectorType::Float); + UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorDimension, 1024); + } + } + Y_UNIT_TEST(AlterTableWithDecimalColumn) { TKikimrRunner kikimr; auto db = kikimr.GetTableClient(); diff --git a/ydb/core/protos/flat_scheme_op.proto b/ydb/core/protos/flat_scheme_op.proto index aa66c44d3ea0..481d7f5d3c6a 100644 --- a/ydb/core/protos/flat_scheme_op.proto +++ b/ydb/core/protos/flat_scheme_op.proto @@ -13,6 +13,7 @@ import "ydb/core/protos/follower_group.proto"; import "ydb/core/protos/blob_depot_config.proto"; import "ydb/public/api/protos/ydb_coordination.proto"; import "ydb/public/api/protos/ydb_export.proto"; +import 
"ydb/public/api/protos/ydb_table.proto"; import "ydb/public/api/protos/ydb_value.proto"; import "ydb/library/actors/protos/actors.proto"; import "ydb/library/mkql_proto/protos/minikql.proto"; @@ -973,6 +974,7 @@ enum EIndexType { EIndexTypeGlobal = 1; EIndexTypeGlobalAsync = 2; EIndexTypeGlobalUnique = 3; + EIndexTypeGlobalVectorKmeansTree = 4; } enum EIndexState { @@ -982,6 +984,10 @@ enum EIndexState { EIndexStateWriteOnly = 3; } +message TVectorIndexKmeansTreeDescription { + optional Ydb.Table.VectorIndexSettings Settings = 1; +} + message TIndexDescription { optional string Name = 1; optional uint64 LocalPathId = 2; @@ -1000,6 +1006,10 @@ message TIndexDescription { // DataSize + IndexSize of indexImplTable optional uint64 DataSize = 9; repeated TTableDescription IndexImplTableDescriptions = 10; + + oneof SpecializedIndexDescription { + TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 11; + } } message TIndexCreationConfig { @@ -1009,6 +1019,9 @@ message TIndexCreationConfig { repeated TTableDescription IndexImplTableDescriptions = 4; //description for index impl tables optional EIndexState State = 5; //state of index at the creation time repeated string DataColumnNames = 6; //columns to be denormalized to read data just from index + oneof SpecializedIndexDescription { + TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 7; + } } message TIndexAlteringConfig { @@ -1849,6 +1862,7 @@ enum EPathSubType { EPathSubTypeSyncIndexImplTable = 1; EPathSubTypeAsyncIndexImplTable = 2; EPathSubTypeStreamImpl = 3; + EPathSubTypeVectorKmeansTreeIndexImplTable = 4; } enum EPathState { diff --git a/ydb/core/tx/datashard/datashard_user_table.h b/ydb/core/tx/datashard/datashard_user_table.h index af7c9ab2a6d8..fc77adf9919e 100644 --- a/ydb/core/tx/datashard/datashard_user_table.h +++ b/ydb/core/tx/datashard/datashard_user_table.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git 
a/ydb/core/tx/scheme_board/cache.cpp b/ydb/core/tx/scheme_board/cache.cpp index 07062b4ce553..11ef95fdb7b6 100644 --- a/ydb/core/tx/scheme_board/cache.cpp +++ b/ydb/core/tx/scheme_board/cache.cpp @@ -876,6 +876,8 @@ class TSchemeCache: public TMonitorableActor { return TResolve::KindSyncIndexTable; case NKikimrSchemeOp::EPathSubTypeAsyncIndexImplTable: return TResolve::KindAsyncIndexTable; + case NKikimrSchemeOp::EPathSubTypeVectorKmeansTreeIndexImplTable: + return TResolve::KindVectorIndexTable; default: return TResolve::KindRegularTable; } @@ -887,6 +889,7 @@ class TSchemeCache: public TMonitorableActor { switch (subType) { case NKikimrSchemeOp::EPathSubTypeSyncIndexImplTable: case NKikimrSchemeOp::EPathSubTypeAsyncIndexImplTable: + case NKikimrSchemeOp::EPathSubTypeVectorKmeansTreeIndexImplTable: return true; default: return false; diff --git a/ydb/core/tx/scheme_cache/scheme_cache.h b/ydb/core/tx/scheme_cache/scheme_cache.h index b81736613daa..570f47e67f4d 100644 --- a/ydb/core/tx/scheme_cache/scheme_cache.h +++ b/ydb/core/tx/scheme_cache/scheme_cache.h @@ -359,7 +359,8 @@ struct TSchemeCacheRequest { KindUnknown = 0, KindRegularTable = 1, KindSyncIndexTable = 2, - KindAsyncIndexTable= 3, + KindAsyncIndexTable = 3, + KindVectorIndexTable = 4, }; struct TEntry { diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp index ade4b00ae014..02353a7f8e77 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_create_build_index.cpp @@ -10,6 +10,8 @@ namespace NKikimr::NSchemeShard { +using namespace NTableIndex; + TVector CreateBuildColumn(TOperationId opId, const TTxTransaction& tx, TOperationContext& context) { Y_ABORT_UNLESS(tx.GetOperationType() == NKikimrSchemeOp::EOperationType::ESchemeOpCreateColumnBuild); @@ -111,18 +113,33 @@ TVector CreateBuildIndex(TOperationId opId, const 
TTxTransa result.push_back(CreateInitializeBuildIndexMainTable(NextPartId(opId, result), outTx)); } - { + auto createIndexImplTable = [&] (NKikimrSchemeOp::TTableDescription&& implTableDesc) { auto outTx = TransactionTemplate(index.PathString(), NKikimrSchemeOp::EOperationType::ESchemeOpInitiateBuildIndexImplTable); - auto& indexImplTableDescription = *outTx.MutableCreateTable(); + implTableDesc.MutablePartitionConfig()->SetShadowData(true); // set before handing the description to outTx, otherwise the flag is lost - // This description provided by user to override partition policy - const auto& userIndexDesc = indexDesc.GetIndexImplTableDescriptions(0); - indexImplTableDescription = CalcImplTableDesc(tableInfo, implTableColumns, userIndexDesc); + *outTx.MutableCreateTable() = std::move(implTableDesc); - indexImplTableDescription.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true); - indexImplTableDescription.MutablePartitionConfig()->SetShadowData(true); + return CreateInitializeBuildIndexImplTable(NextPartId(opId, result), outTx); + }; - result.push_back(CreateInitializeBuildIndexImplTable(NextPartId(opId, result), outTx)); + if (indexDesc.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { + NKikimrSchemeOp::TTableDescription indexLevelTableDesc, indexPostingTableDesc; + // TODO After IndexImplTableDescriptions are persisted, this should be replaced with Y_ABORT_UNLESS + if (indexDesc.IndexImplTableDescriptionsSize() == 2) { + indexLevelTableDesc = indexDesc.GetIndexImplTableDescriptions(0); + indexPostingTableDesc = indexDesc.GetIndexImplTableDescriptions(1); // posting table is the second description, level table the first + } + result.push_back(createIndexImplTable(CalcVectorKmeansTreeLevelImplTableDesc(tableInfo->PartitionConfig(), indexLevelTableDesc))); + result.push_back(createIndexImplTable(CalcVectorKmeansTreePostingImplTableDesc(tableInfo, tableInfo->PartitionConfig(), implTableColumns, indexPostingTableDesc))); + } else { + NKikimrSchemeOp::TTableDescription indexTableDesc; + // TODO After IndexImplTableDescriptions are persisted, this should be
replaced with Y_ABORT_UNLESS + if (indexDesc.IndexImplTableDescriptionsSize() == 1) { + indexTableDesc = indexDesc.GetIndexImplTableDescriptions(0); + } + NKikimrSchemeOp::TTableDescription implTableDesc = CalcImplTableDesc(tableInfo, implTableColumns, indexTableDesc); + implTableDesc.MutablePartitionConfig()->MutableCompactionPolicy()->SetKeepEraseMarkers(true); + result.push_back(createIndexImplTable(std::move(implTableDesc))); } return result; diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp index e0eb084f8f65..10366c485db3 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_create_indexed_table.cpp @@ -238,25 +238,37 @@ TVector CreateIndexedTable(TOperationId nextId, const TTxTr result.push_back(CreateNewTableIndex(NextPartId(nextId, result), scheme)); } - { + auto createIndexImplTable = [&] (const NKikimrSchemeOp::TTableDescription&& implTableDesc) { auto scheme = TransactionTemplate( tx.GetWorkingDir() + "/" + baseTableDescription.GetName() + "/" + indexDescription.GetName(), NKikimrSchemeOp::EOperationType::ESchemeOpCreateTable); scheme.SetFailOnExist(tx.GetFailOnExist()); scheme.SetAllowCreateInTempDir(tx.GetAllowCreateInTempDir()); - const auto& implTableColumns = indexes.at(indexDescription.GetName()); + *scheme.MutableCreateTable() = implTableDesc; - auto& indexImplTableDescription = *scheme.MutableCreateTable(); + return CreateNewTable(NextPartId(nextId, result), scheme); + }; + const auto& implTableColumns = indexes.at(indexDescription.GetName()); + if (indexDescription.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { + NKikimrSchemeOp::TTableDescription userLevelDesc, userPostingDesc; + if (indexDescription.IndexImplTableDescriptionsSize() == 2) { + // This description provided by user to override partition policy + userLevelDesc 
= indexDescription.GetIndexImplTableDescriptions(0); + userPostingDesc = indexDescription.GetIndexImplTableDescriptions(1); + } + + result.push_back(createIndexImplTable(CalcVectorKmeansTreeLevelImplTableDesc(baseTableDescription.GetPartitionConfig(), userLevelDesc))); + result.push_back(createIndexImplTable(CalcVectorKmeansTreePostingImplTableDesc(baseTableDescription, baseTableDescription.GetPartitionConfig(), implTableColumns, userPostingDesc))); + } else { NKikimrSchemeOp::TTableDescription userIndexDesc; if (indexDescription.IndexImplTableDescriptionsSize()) { // This description provided by user to override partition policy userIndexDesc = indexDescription.GetIndexImplTableDescriptions(0); } - indexImplTableDescription = CalcImplTableDesc(baseTableDescription, implTableColumns, userIndexDesc); - result.push_back(CreateNewTable(NextPartId(nextId, result), scheme)); + result.push_back(createIndexImplTable(CalcImplTableDesc(baseTableDescription, implTableColumns, userIndexDesc))); } } diff --git a/ydb/core/tx/schemeshard/schemeshard__operation_drop_indexed_table.cpp b/ydb/core/tx/schemeshard/schemeshard__operation_drop_indexed_table.cpp index 5f32acec8c03..51bf6f69a0b9 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation_drop_indexed_table.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation_drop_indexed_table.cpp @@ -494,10 +494,12 @@ TVector CreateDropIndexedTable(TOperationId nextId, const T result.push_back(CreateDropCdcStreamImpl(NextPartId(nextId, result), dropStream)); } - Y_ABORT_UNLESS(child.Base()->GetChildren().size() == 1); for (auto& [implName, implPathId] : child.Base()->GetChildren()) { - Y_ABORT_UNLESS(implName == "indexImplTable" || implName == "streamImpl", - "unexpected name %s", implName.c_str()); + Y_ABORT_UNLESS(implName == "indexImplTable" + || implName == "streamImpl" + || implName == NTableIndex::NTableVectorKmeansTreeIndex::LevelTable + || implName == NTableIndex::NTableVectorKmeansTreeIndex::PostingTable + , "unexpected name 
%s", implName.c_str()); TPath implPath = child.Child(implName); { diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index.cpp index 08f72ca6c04f..39cd6c870eda 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index.cpp @@ -47,6 +47,8 @@ void TSchemeShard::PersistCreateBuildIndex(NIceDb::TNiceDb& db, const TIndexBuil NIceDb::TUpdate(info->Limits.MaxShards), NIceDb::TUpdate(info->Limits.MaxRetries), NIceDb::TUpdate(ui32(info->BuildKind)) + + // TODO save info->ImplTableDescriptions ); ui32 columnNo = 0; diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp index 1eef982c605c..0c0d5f3b25c3 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index__create.cpp @@ -226,6 +226,9 @@ class TSchemeShard::TIndexBuilder::TTxCreate: public TSchemeShard::TIndexBuilder case Ydb::Table::TableIndex::TypeCase::kGlobalUniqueIndex: explain = "unsupported index type to build"; return false; + case Ydb::Table::TableIndex::TypeCase::kGlobalVectorKmeansTreeIndex: + explain = "unsupported vector index type to build"; + return false; case Ydb::Table::TableIndex::TypeCase::TYPE_NOT_SET: explain = "invalid or unset index type"; return false; @@ -236,7 +239,7 @@ class TSchemeShard::TIndexBuilder::TTxCreate: public TSchemeShard::TIndexBuilder buildInfo->DataColumns.assign(index.data_columns().begin(), index.data_columns().end()); Ydb::StatusIds::StatusCode status; - if (!FillIndexTablePartitioning(buildInfo->ImplTableDescription, index, status, explain)) { + if (!FillIndexTablePartitioning(buildInfo->ImplTableDescriptions, index, status, explain)) { return false; } } diff --git a/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp b/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp index 
fe94174526d7..b766761905e8 100644 --- a/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_build_index_tx_base.cpp @@ -274,6 +274,9 @@ void TSchemeShard::TIndexBuilder::TTxBase::Fill(NKikimrIndexBuilder::TIndexBuild case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync: *index.mutable_global_async_index() = Ydb::Table::GlobalAsyncIndex(); break; + case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree: + *index.mutable_global_vector_kmeans_tree_index() = Ydb::Table::GlobalVectorKMeansTreeIndex(); + break; default: Y_ABORT("Unreachable"); } diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp index 4d1338e6e254..c54f1ae38f40 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.cpp @@ -507,6 +507,13 @@ TTableInfo::TAlterDataPtr TTableInfo::CreateAlterData( } if (op.HasTTLSettings()) { + for (const auto& indexDescription : op.GetTableIndexes()) { + if (indexDescription.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { + errStr = "Table with vector indexes doesn't support TTL"; + return nullptr; + } + } + const auto& ttl = op.GetTTLSettings(); if (!ValidateTtlSettings(ttl, source ? 
source->Columns : THashMap(), alterData->Columns, colName2Id, subDomain, errStr)) { @@ -2114,7 +2121,9 @@ void TIndexBuildInfo::SerializeToProto(TSchemeShard* ss, NKikimrSchemeOp::TIndex *index.AddDataColumnNames() = x; } - *index.AddIndexImplTableDescriptions() = ImplTableDescription; + for (const auto& implTableDescription : ImplTableDescriptions) { + *index.AddIndexImplTableDescriptions() = implTableDescription; + } } void TIndexBuildInfo::SerializeToProto(TSchemeShard* ss, NKikimrIndexBuilder::TColumnBuildSettings* result) const { diff --git a/ydb/core/tx/schemeshard/schemeshard_info_types.h b/ydb/core/tx/schemeshard/schemeshard_info_types.h index b5297bf20037..298bac3eaeeb 100644 --- a/ydb/core/tx/schemeshard/schemeshard_info_types.h +++ b/ydb/core/tx/schemeshard/schemeshard_info_types.h @@ -2365,8 +2365,13 @@ struct TTableIndexInfo : public TSimpleRefCount { alterData->IndexKeys.assign(config.GetKeyColumnNames().begin(), config.GetKeyColumnNames().end()); Y_ABORT_UNLESS(alterData->IndexKeys.size()); alterData->IndexDataColumns.assign(config.GetDataColumnNames().begin(), config.GetDataColumnNames().end()); + alterData->State = config.HasState() ? 
config.GetState() : EState::EIndexStateReady; + if (config.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { + alterData->SpecializedIndexDescription = config.GetVectorIndexKmeansTreeDescription(); + } + return result; } @@ -2378,6 +2383,8 @@ struct TTableIndexInfo : public TSimpleRefCount { TVector IndexDataColumns; TTableIndexInfo::TPtr AlterData = nullptr; + + std::variant SpecializedIndexDescription; }; struct TCdcStreamInfo : public TSimpleRefCount { @@ -2915,7 +2922,7 @@ struct TIndexBuildInfo: public TSimpleRefCount { TString ImplTablePath; NTableIndex::TTableColumns ImplTableColumns; - NKikimrSchemeOp::TTableDescription ImplTableDescription; + TVector ImplTableDescriptions; EState State = EState::Invalid; TString Issue; @@ -3056,6 +3063,8 @@ struct TIndexBuildInfo: public TSimpleRefCount { indexInfo->IndexName = row.template GetValue(); indexInfo->IndexType = row.template GetValue(); + // TODO load indexInfo->ImplTableDescriptions + indexInfo->State = TIndexBuildInfo::EState( row.template GetValue()); indexInfo->Issue = diff --git a/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp b/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp index c1259db27f56..080212788ebc 100644 --- a/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_path_describer.cpp @@ -131,10 +131,16 @@ TPathElement::EPathSubType TPathDescriber::CalcPathSubType(const TPath& path) { auto indexInfo = Self->Indexes.at(pathId); switch (indexInfo->Type) { - case NKikimrSchemeOp::EIndexTypeGlobalAsync: - return TPathElement::EPathSubType::EPathSubTypeAsyncIndexImplTable; - default: - return TPathElement::EPathSubType::EPathSubTypeSyncIndexImplTable; + case NKikimrSchemeOp::EIndexTypeGlobalAsync: + return TPathElement::EPathSubType::EPathSubTypeAsyncIndexImplTable; + case NKikimrSchemeOp::EIndexTypeGlobal: + case NKikimrSchemeOp::EIndexTypeGlobalUnique: + return 
TPathElement::EPathSubType::EPathSubTypeSyncIndexImplTable; + case NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree: + return TPathElement::EPathSubType::EPathSubTypeVectorKmeansTreeIndexImplTable; + default: + Y_DEBUG_ABORT("%s", (TStringBuilder() << "unexpected indexInfo->Type# " << indexInfo->Type).data()); + return TPathElement::EPathSubType::EPathSubTypeEmpty; } } else if (parentPath.IsCdcStream()) { return TPathElement::EPathSubType::EPathSubTypeStreamImpl; @@ -1240,22 +1246,44 @@ void TSchemeShard::DescribeTableIndex(const TPathId& pathId, const TString& name auto* indexPath = PathsById.FindPtr(pathId); Y_ABORT_UNLESS(indexPath); - Y_ABORT_UNLESS((*indexPath)->GetChildren().size() == 1); - const auto& indexImplTablePathId = (*indexPath)->GetChildren().begin()->second; + const ui8 expectedIndexImplTableCount = indexInfo->Type == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree ? 2 : 1; + Y_ABORT_UNLESS((*indexPath)->GetChildren().size() == expectedIndexImplTableCount); - auto* tableInfo = Tables.FindPtr(indexImplTablePathId); - Y_ABORT_UNLESS(tableInfo); + ui64 dataSize = 0; + for (const auto& indexImplTablePathId : (*indexPath)->GetChildren()) { + auto* tableInfo = Tables.FindPtr(indexImplTablePathId.second); + Y_ABORT_UNLESS(tableInfo); - const auto& tableStats = (*tableInfo)->GetStats().Aggregated; - entry.SetDataSize(tableStats.DataSize + tableStats.IndexSize); + const auto& tableStats = (*tableInfo)->GetStats().Aggregated; + dataSize += tableStats.DataSize + tableStats.IndexSize; - auto* tableDescription = entry.AddIndexImplTableDescriptions(); - if (fillConfig) { - FillPartitionConfig((*tableInfo)->PartitionConfig(), *tableDescription->MutablePartitionConfig()); + auto* tableDescription = entry.AddIndexImplTableDescriptions(); + if (fillConfig) { + FillPartitionConfig((*tableInfo)->PartitionConfig(), *tableDescription->MutablePartitionConfig()); + } + if (fillBoundaries) { + FillTableBoundaries(*tableInfo, 
*tableDescription->MutableSplitBoundary()); + } } - if (fillBoundaries) { - FillTableBoundaries(*tableInfo, *tableDescription->MutableSplitBoundary()); + entry.SetDataSize(dataSize); + + if (indexInfo->Type == NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree) { + if (const auto* vectorIndexKmeansTreeDescription = std::get_if(&indexInfo->SpecializedIndexDescription)) { + const auto& indexInfoSettings = vectorIndexKmeansTreeDescription->GetSettings(); + auto entrySettings = entry.MutableVectorIndexKmeansTreeDescription()->MutableSettings(); + if (indexInfoSettings.has_distance()) + entrySettings->set_distance(indexInfoSettings.distance()); + else if (indexInfoSettings.has_similarity()) + entrySettings->set_similarity(indexInfoSettings.similarity()); + else + Y_FAIL_S("Either distance or similarity should be set in index settings: " << indexInfoSettings); + entrySettings->set_vector_type(indexInfoSettings.vector_type()); + entrySettings->set_vector_dimension(indexInfoSettings.vector_dimension()); + } else { + Y_FAIL_S("SpecializedIndexDescription should be set"); + } } + } void TSchemeShard::DescribeCdcStream(const TPathId& pathId, const TString& name, diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.cpp b/ydb/core/tx/schemeshard/schemeshard_utils.cpp index 13642fc1054e..b94b0647b866 100644 --- a/ydb/core/tx/schemeshard/schemeshard_utils.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_utils.cpp @@ -255,25 +255,104 @@ TTableColumns ExtractInfo(const NSchemeShard::TTableInfo::TPtr &tableInfo) { return result; } -NKikimrSchemeOp::TTableDescription CalcImplTableDesc( - const NSchemeShard::TTableInfo::TPtr& baseTableInfo, - const NTableIndex::TTableColumns& implTableColumns, - const NKikimrSchemeOp::TTableDescription& indexTableDesc) +NKikimrSchemeOp::TPartitionConfig PartitionConfigForIndexes( + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const NKikimrSchemeOp::TTableDescription& indexTableDesc) { - NKikimrSchemeOp::TTableDescription result; 
+ // KIKIMR-6687 + NKikimrSchemeOp::TPartitionConfig result; - result.SetName("indexImplTable"); + if (baseTablePartitionConfig.HasNamedCompactionPolicy()) { + result.SetNamedCompactionPolicy(baseTablePartitionConfig.GetNamedCompactionPolicy()); + } + if (baseTablePartitionConfig.HasCompactionPolicy()) { + result.MutableCompactionPolicy()->CopyFrom(baseTablePartitionConfig.GetCompactionPolicy()); + } + // skip optional uint64 FollowerCount = 3; + if (baseTablePartitionConfig.HasExecutorCacheSize()) { + result.SetExecutorCacheSize(baseTablePartitionConfig.GetExecutorCacheSize()); + } + // skip optional bool AllowFollowerPromotion = 5 [default = true]; + if (baseTablePartitionConfig.HasTxReadSizeLimit()) { + result.SetTxReadSizeLimit(baseTablePartitionConfig.GetTxReadSizeLimit()); + } + // skip optional uint32 CrossDataCenterFollowerCount = 8; + if (baseTablePartitionConfig.HasChannelProfileId()) { + result.SetChannelProfileId(baseTablePartitionConfig.GetChannelProfileId()); + } + if (indexTableDesc.GetPartitionConfig().HasPartitioningPolicy()) { + result.MutablePartitioningPolicy()->CopyFrom(indexTableDesc.GetPartitionConfig().GetPartitioningPolicy()); + } else { + result.MutablePartitioningPolicy()->SetSizeToSplit(2_GB); + result.MutablePartitioningPolicy()->SetMinPartitionsCount(1); + } + if (baseTablePartitionConfig.HasPipelineConfig()) { + result.MutablePipelineConfig()->CopyFrom(baseTablePartitionConfig.GetPipelineConfig()); + } + if (baseTablePartitionConfig.ColumnFamiliesSize()) { + // Indexes don't need column families unless it's the default column family + for (const auto& family : baseTablePartitionConfig.GetColumnFamilies()) { + const bool isDefaultFamily = ( + (!family.HasId() && !family.HasName()) || + (family.HasId() && family.GetId() == 0) || + (family.HasName() && family.GetName() == "default")); + if (isDefaultFamily) { + result.AddColumnFamilies()->CopyFrom(family); + } + } + } + if (baseTablePartitionConfig.HasResourceProfile()) { + 
result.SetResourceProfile(baseTablePartitionConfig.GetResourceProfile()); + } + if (baseTablePartitionConfig.HasDisableStatisticsCalculation()) { + result.SetDisableStatisticsCalculation(baseTablePartitionConfig.GetDisableStatisticsCalculation()); + } + if (baseTablePartitionConfig.HasEnableFilterByKey()) { + result.SetEnableFilterByKey(baseTablePartitionConfig.GetEnableFilterByKey()); + } + if (baseTablePartitionConfig.HasExecutorFastLogPolicy()) { + result.SetExecutorFastLogPolicy(baseTablePartitionConfig.GetExecutorFastLogPolicy()); + } + if (baseTablePartitionConfig.HasEnableEraseCache()) { + result.SetEnableEraseCache(baseTablePartitionConfig.GetEnableEraseCache()); + } + if (baseTablePartitionConfig.HasEraseCacheMinRows()) { + result.SetEraseCacheMinRows(baseTablePartitionConfig.GetEraseCacheMinRows()); + } + if (baseTablePartitionConfig.HasEraseCacheMaxBytes()) { + result.SetEraseCacheMaxBytes(baseTablePartitionConfig.GetEraseCacheMaxBytes()); + } + if (baseTablePartitionConfig.HasKeepSnapshotTimeout()) { + result.SetKeepSnapshotTimeout(baseTablePartitionConfig.GetKeepSnapshotTimeout()); + } + // skip repeated NKikimrStorageSettings.TStorageRoom StorageRooms = 17; + // skip optional NKikimrHive.TFollowerGroup FollowerGroup = 23; + + return result; +} + +void SetImplTablePartitionConfig( + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const NKikimrSchemeOp::TTableDescription& indexTableDesc, + NKikimrSchemeOp::TTableDescription& tableDescription) +{ if (indexTableDesc.HasUniformPartitionsCount()) { - result.SetUniformPartitionsCount(indexTableDesc.GetUniformPartitionsCount()); + tableDescription.SetUniformPartitionsCount(indexTableDesc.GetUniformPartitionsCount()); } if (indexTableDesc.SplitBoundarySize()) { - result.MutableSplitBoundary()->CopyFrom(indexTableDesc.GetSplitBoundary()); + tableDescription.MutableSplitBoundary()->CopyFrom(indexTableDesc.GetSplitBoundary()); } - *result.MutablePartitionConfig() = 
PartitionConfigForIndexes(baseTableInfo, indexTableDesc); + *tableDescription.MutablePartitionConfig() = PartitionConfigForIndexes(baseTablePartitionConfig, indexTableDesc); +} + void FillIndexImplTableColumns( + const THashMap& baseTableColumns, + const NTableIndex::TTableColumns& implTableColumns, + NKikimrSchemeOp::TTableDescription& implTableDesc) +{ //Columns and KeyColumnNames order is really important //the order of implTableColumns.Keys is the right one @@ -282,15 +361,14 @@ NKikimrSchemeOp::TTableDescription CalcImplTableDesc( implKeyToImplColumn[implTableColumns.Keys[keyId]] = keyId; } - result.ClearColumns(); - for (auto& iter: baseTableInfo->Columns) { + for (auto& iter: baseTableColumns) { const NSchemeShard::TTableInfo::TColumn& column = iter.second; if (column.IsDropped()) { continue; } if (implTableColumns.Columns.contains(column.Name)) { - auto item = result.AddColumns(); + auto item = implTableDesc.AddColumns(); item->SetName(column.Name); item->SetType(NScheme::TypeName(column.PType, column.PTypeMod)); item->SetNotNull(column.NotNull); @@ -302,43 +380,26 @@ NKikimrSchemeOp::TTableDescription CalcImplTableDesc( } } - std::sort(result.MutableColumns()->begin(), - result.MutableColumns()->end(), + std::sort(implTableDesc.MutableColumns()->begin(), + implTableDesc.MutableColumns()->end(), [] (auto& left, auto& right) { return left.GetId() < right.GetId(); }); - for (auto& column: *result.MutableColumns()) { + for (auto& column: *implTableDesc.MutableColumns()) { column.ClearId(); } - result.ClearKeyColumnNames(); for (auto& keyName: implTableColumns.Keys) { - result.AddKeyColumnNames(keyName); + implTableDesc.AddKeyColumnNames(keyName); } - - return result; } -NKikimrSchemeOp::TTableDescription CalcImplTableDesc( - const NKikimrSchemeOp::TTableDescription &baseTableDescr, - const TTableColumns &implTableColumns, - const NKikimrSchemeOp::TTableDescription &indexTableDesc) +void FillIndexImplTableColumns( + const ::google::protobuf::RepeatedPtrField& 
baseTableColumns, + const NTableIndex::TTableColumns& implTableColumns, + NKikimrSchemeOp::TTableDescription& implTableDesc) { - NKikimrSchemeOp::TTableDescription result; - - result.SetName("indexImplTable"); - - if (indexTableDesc.HasUniformPartitionsCount()) { - result.SetUniformPartitionsCount(indexTableDesc.GetUniformPartitionsCount()); - } - - if (indexTableDesc.SplitBoundarySize()) { - result.MutableSplitBoundary()->CopyFrom(indexTableDesc.GetSplitBoundary()); - } - - *result.MutablePartitionConfig() = PartitionConfigForIndexes(baseTableDescr, indexTableDesc); - //Columns and KeyColumnNames order is really important //the order of implTableColumns.Keys is the right one @@ -347,11 +408,10 @@ NKikimrSchemeOp::TTableDescription CalcImplTableDesc( implKeyToImplColumn[implTableColumns.Keys[keyId]] = keyId; } - result.ClearColumns(); - for (auto& column: baseTableDescr.GetColumns()) { + for (auto& column: baseTableColumns) { auto& columnName = column.GetName(); if (implTableColumns.Columns.contains(columnName)) { - auto item = result.AddColumns(); + auto item = implTableDesc.AddColumns(); *item = column; // Indexes don't use column families @@ -369,115 +429,143 @@ NKikimrSchemeOp::TTableDescription CalcImplTableDesc( } } - std::sort(result.MutableColumns()->begin(), - result.MutableColumns()->end(), + std::sort(implTableDesc.MutableColumns()->begin(), + implTableDesc.MutableColumns()->end(), [] (auto& left, auto& right) { return left.GetId() < right.GetId(); }); - for (auto& column: *result.MutableColumns()) { + for (auto& column: *implTableDesc.MutableColumns()) { column.ClearId(); } - result.ClearKeyColumnNames(); for (auto& keyName: implTableColumns.Keys) { - result.AddKeyColumnNames(keyName); + implTableDesc.AddKeyColumnNames(keyName); } +} - return result; +NKikimrSchemeOp::TTableDescription CalcImplTableDesc( + const NSchemeShard::TTableInfo::TPtr& baseTableInfo, + const NTableIndex::TTableColumns& implTableColumns, + const 
NKikimrSchemeOp::TTableDescription& indexTableDesc) +{ + NKikimrSchemeOp::TTableDescription implTableDesc; + + implTableDesc.SetName("indexImplTable"); + + SetImplTablePartitionConfig(baseTableInfo->PartitionConfig(), indexTableDesc, implTableDesc); + + FillIndexImplTableColumns(baseTableInfo->Columns, implTableColumns, implTableDesc); + + return implTableDesc; } -NKikimrSchemeOp::TPartitionConfig PartitionConfigForIndexes( - const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, - const NKikimrSchemeOp::TTableDescription& indexTableDesc) +NKikimrSchemeOp::TTableDescription CalcImplTableDesc( + const NKikimrSchemeOp::TTableDescription& baseTableDescr, + const TTableColumns& implTableColumns, + const NKikimrSchemeOp::TTableDescription& indexTableDesc) { - // KIKIMR-6687 - NKikimrSchemeOp::TPartitionConfig result; + NKikimrSchemeOp::TTableDescription implTableDesc; - if (baseTablePartitionConfig.HasNamedCompactionPolicy()) { - result.SetNamedCompactionPolicy(baseTablePartitionConfig.GetNamedCompactionPolicy()); - } - if (baseTablePartitionConfig.HasCompactionPolicy()) { - result.MutableCompactionPolicy()->CopyFrom(baseTablePartitionConfig.GetCompactionPolicy()); - } - // skip optional uint64 FollowerCount = 3; - if (baseTablePartitionConfig.HasExecutorCacheSize()) { - result.SetExecutorCacheSize(baseTablePartitionConfig.GetExecutorCacheSize()); - } - // skip optional bool AllowFollowerPromotion = 5 [default = true]; - if (baseTablePartitionConfig.HasTxReadSizeLimit()) { - result.SetTxReadSizeLimit(baseTablePartitionConfig.GetTxReadSizeLimit()); - } - // skip optional uint32 CrossDataCenterFollowerCount = 8; - if (baseTablePartitionConfig.HasChannelProfileId()) { - result.SetChannelProfileId(baseTablePartitionConfig.GetChannelProfileId()); - } + implTableDesc.SetName("indexImplTable"); - if (indexTableDesc.GetPartitionConfig().HasPartitioningPolicy()) { - 
result.MutablePartitioningPolicy()->CopyFrom(indexTableDesc.GetPartitionConfig().GetPartitioningPolicy()); - } else { - result.MutablePartitioningPolicy()->SetSizeToSplit(2_GB); - result.MutablePartitioningPolicy()->SetMinPartitionsCount(1); - } - if (baseTablePartitionConfig.HasPipelineConfig()) { - result.MutablePipelineConfig()->CopyFrom(baseTablePartitionConfig.GetPipelineConfig()); - } - if (baseTablePartitionConfig.ColumnFamiliesSize()) { - // Indexes don't need column families unless it's the default column family - for (const auto& family : baseTablePartitionConfig.GetColumnFamilies()) { - const bool isDefaultFamily = ( - (!family.HasId() && !family.HasName()) || - (family.HasId() && family.GetId() == 0) || - (family.HasName() && family.GetName() == "default")); - if (isDefaultFamily) { - result.AddColumnFamilies()->CopyFrom(family); - } - } - } - if (baseTablePartitionConfig.HasResourceProfile()) { - result.SetResourceProfile(baseTablePartitionConfig.GetResourceProfile()); - } - if (baseTablePartitionConfig.HasDisableStatisticsCalculation()) { - result.SetDisableStatisticsCalculation(baseTablePartitionConfig.GetDisableStatisticsCalculation()); - } - if (baseTablePartitionConfig.HasEnableFilterByKey()) { - result.SetEnableFilterByKey(baseTablePartitionConfig.GetEnableFilterByKey()); - } - if (baseTablePartitionConfig.HasExecutorFastLogPolicy()) { - result.SetExecutorFastLogPolicy(baseTablePartitionConfig.GetExecutorFastLogPolicy()); - } - if (baseTablePartitionConfig.HasEnableEraseCache()) { - result.SetEnableEraseCache(baseTablePartitionConfig.GetEnableEraseCache()); - } - if (baseTablePartitionConfig.HasEraseCacheMinRows()) { - result.SetEraseCacheMinRows(baseTablePartitionConfig.GetEraseCacheMinRows()); + SetImplTablePartitionConfig(baseTableDescr.GetPartitionConfig(), indexTableDesc, implTableDesc); + + FillIndexImplTableColumns(baseTableDescr.GetColumns(), implTableColumns, implTableDesc); + + return implTableDesc; +} + 
+NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreeLevelImplTableDesc( + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const NKikimrSchemeOp::TTableDescription& indexTableDesc) +{ + NKikimrSchemeOp::TTableDescription implTableDesc; + + implTableDesc.SetName(NTableVectorKmeansTreeIndex::LevelTable); + + SetImplTablePartitionConfig(baseTablePartitionConfig, indexTableDesc, implTableDesc); + + { + auto parentIdColumn = implTableDesc.AddColumns(); + parentIdColumn->SetName(NTableVectorKmeansTreeIndex::LevelTable_ParentIdColumn); + parentIdColumn->SetType("Uint32"); + parentIdColumn->SetTypeId(NScheme::NTypeIds::Uint32); + parentIdColumn->SetId(0); } - if (baseTablePartitionConfig.HasEraseCacheMaxBytes()) { - result.SetEraseCacheMaxBytes(baseTablePartitionConfig.GetEraseCacheMaxBytes()); + { + auto idColumn = implTableDesc.AddColumns(); + idColumn->SetName(NTableVectorKmeansTreeIndex::LevelTable_IdColumn); + idColumn->SetType("Uint32"); + idColumn->SetTypeId(NScheme::NTypeIds::Uint32); + idColumn->SetId(1); } - if (baseTablePartitionConfig.HasKeepSnapshotTimeout()) { - result.SetKeepSnapshotTimeout(baseTablePartitionConfig.GetKeepSnapshotTimeout()); + { + auto centroidColumn = implTableDesc.AddColumns(); + centroidColumn->SetName(NTableVectorKmeansTreeIndex::LevelTable_EmbeddingColumn); + centroidColumn->SetType("String"); + centroidColumn->SetTypeId(NScheme::NTypeIds::String); + centroidColumn->SetId(2); } - // skip repeated NKikimrStorageSettings.TStorageRoom StorageRooms = 17; - // skip optional NKikimrHive.TFollowerGroup FollowerGroup = 23; - return result; + implTableDesc.AddKeyColumnNames(NTableVectorKmeansTreeIndex::LevelTable_ParentIdColumn); + implTableDesc.AddKeyColumnNames(NTableVectorKmeansTreeIndex::LevelTable_IdColumn); + + return implTableDesc; } -NKikimrSchemeOp::TPartitionConfig PartitionConfigForIndexes( +NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePostingImplTableDesc( const NSchemeShard::TTableInfo::TPtr& 
baseTableInfo, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const NTableIndex::TTableColumns& implTableColumns, const NKikimrSchemeOp::TTableDescription& indexTableDesc) { - return PartitionConfigForIndexes(baseTableInfo->PartitionConfig(), indexTableDesc); + NKikimrSchemeOp::TTableDescription implTableDesc; + + implTableDesc.SetName(NTableVectorKmeansTreeIndex::PostingTable); + + SetImplTablePartitionConfig(baseTablePartitionConfig, indexTableDesc, implTableDesc); + + { + auto parentIdColumn = implTableDesc.AddColumns(); + parentIdColumn->SetName(NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn); + parentIdColumn->SetType("Uint32"); + parentIdColumn->SetTypeId(NScheme::NTypeIds::Uint32); + parentIdColumn->SetId(0); + } + + FillIndexImplTableColumns(baseTableInfo->Columns, implTableColumns, implTableDesc); + + return implTableDesc; } -NKikimrSchemeOp::TPartitionConfig PartitionConfigForIndexes( - const NKikimrSchemeOp::TTableDescription& baseTableDescr, +NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePostingImplTableDesc( + const NKikimrSchemeOp::TTableDescription &baseTableDescr, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const NTableIndex::TTableColumns& implTableColumns, const NKikimrSchemeOp::TTableDescription& indexTableDesc) { - return PartitionConfigForIndexes(baseTableDescr.GetPartitionConfig(), indexTableDesc); + NKikimrSchemeOp::TTableDescription implTableDesc; + + implTableDesc.SetName(NTableVectorKmeansTreeIndex::PostingTable); + + SetImplTablePartitionConfig(baseTablePartitionConfig, indexTableDesc, implTableDesc); + + { + auto parentIdColumn = implTableDesc.AddColumns(); + parentIdColumn->SetName(NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn); + parentIdColumn->SetType("Uint32"); + parentIdColumn->SetTypeId(NScheme::NTypeIds::Uint32); + parentIdColumn->SetId(0); + } + + FillIndexImplTableColumns(baseTableDescr.GetColumns(), implTableColumns, implTableDesc); + + return 
implTableDesc; } + + bool ExtractTypes(const NKikimrSchemeOp::TTableDescription& baseTableDescr, TColumnTypes& columnTypes, TString& explain) { const NScheme::TTypeRegistry* typeRegistry = AppData()->TypeRegistry; Y_ABORT_UNLESS(typeRegistry); diff --git a/ydb/core/tx/schemeshard/schemeshard_utils.h b/ydb/core/tx/schemeshard/schemeshard_utils.h index b74161e931ef..b63b33b1a4af 100644 --- a/ydb/core/tx/schemeshard/schemeshard_utils.h +++ b/ydb/core/tx/schemeshard/schemeshard_utils.h @@ -146,14 +146,22 @@ NKikimrSchemeOp::TTableDescription CalcImplTableDesc( const NTableIndex::TTableColumns& implTableColumns, const NKikimrSchemeOp::TTableDescription& indexTableDesc); -NKikimrSchemeOp::TPartitionConfig PartitionConfigForIndexes( - const NSchemeShard::TTableInfo::TPtr& baseTableInfo, +NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreeLevelImplTableDesc( + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, const NKikimrSchemeOp::TTableDescription& indexTableDesc); -NKikimrSchemeOp::TPartitionConfig PartitionConfigForIndexes( - const NKikimrSchemeOp::TTableDescription& baseTableDesc, +NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePostingImplTableDesc( + const NSchemeShard::TTableInfo::TPtr& baseTableInfo, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const NTableIndex::TTableColumns& implTableColumns, const NKikimrSchemeOp::TTableDescription& indexTableDesc); +NKikimrSchemeOp::TTableDescription CalcVectorKmeansTreePostingImplTableDesc( + const NKikimrSchemeOp::TTableDescription &baseTableDescr, + const NKikimrSchemeOp::TPartitionConfig& baseTablePartitionConfig, + const NTableIndex::TTableColumns& implTableColumns, + const NKikimrSchemeOp::TTableDescription& indexTableDesc); + TTableColumns ExtractInfo(const NSchemeShard::TTableInfo::TPtr& tableInfo); TTableColumns ExtractInfo(const NKikimrSchemeOp::TTableDescription& tableDesc); TIndexColumns ExtractInfo(const NKikimrSchemeOp::TIndexCreationConfig& indexDesc); @@ 
-189,21 +197,37 @@ bool CommonCheck(const TTableDesc& tableDesc, const NKikimrSchemeOp::TIndexCreat return false; } - if (!IsCompatibleIndex(baseTableColumns, indexKeys, error)) { + if (!IsCompatibleIndex(indexDesc.GetType(), baseTableColumns, indexKeys, error)) { status = NKikimrScheme::EStatus::StatusInvalidParameter; return false; } - TColumnTypes columnsTypes; - if (!ExtractTypes(tableDesc, columnsTypes, error)) { + TColumnTypes baseColumnTypes; + if (!ExtractTypes(tableDesc, baseColumnTypes, error)) { status = NKikimrScheme::EStatus::StatusInvalidParameter; return false; } - implTableColumns = CalcTableImplDescription(baseTableColumns, indexKeys); - if (!IsCompatibleKeyTypes(columnsTypes, implTableColumns, uniformTable, error)) { - status = NKikimrScheme::EStatus::StatusInvalidParameter; - return false; + implTableColumns = CalcTableImplDescription(indexDesc.GetType(), baseTableColumns, indexKeys); + + if (indexDesc.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) { + //We have already checked this in IsCompatibleIndex + Y_ABORT_UNLESS(indexKeys.KeyColumns.size() == 1); + + const TString& indexColumnName = indexKeys.KeyColumns[0]; + Y_ABORT_UNLESS(baseColumnTypes.contains(indexColumnName)); + auto typeInfo = baseColumnTypes.at(indexColumnName); + + if (typeInfo.GetTypeId() != NScheme::NTypeIds::String) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + error = TStringBuilder() << "Index column '" << indexColumnName << "' expected type 'String' but got " << NScheme::TypeName(typeInfo); + return false; + } + } else { + if (!IsCompatibleKeyTypes(baseColumnTypes, implTableColumns, uniformTable, error)) { + status = NKikimrScheme::EStatus::StatusInvalidParameter; + return false; + } } if (implTableColumns.Keys.size() > schemeLimits.MaxTableKeyColumns) { diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp index 8a415d0cad6c..59b41331ffb2 100644 --- 
a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp +++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.cpp @@ -471,8 +471,7 @@ void IsResourcePool(const NKikimrScheme::TEvDescribeSchemeResult& record) { UNIT_ASSERT_VALUES_EQUAL(selfPath.GetPathType(), NKikimrSchemeOp::EPathTypeResourcePool); } -TCheckFunc CheckColumns(const TString& name, const TSet& columns, const TSet& droppedColumns, const TSet keyColumns, - NKikimrSchemeOp::EPathState pathState) { +TCheckFunc CheckColumns(const TString& name, const TSet& columns, const TSet& droppedColumns, const TSet keyColumns, bool strictCount) { return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) { UNIT_ASSERT(record.HasPathDescription()); NKikimrSchemeOp::TPathDescription descr = record.GetPathDescription(); @@ -483,12 +482,13 @@ TCheckFunc CheckColumns(const TString& name, const TSet& columns, const TString curName = self.GetName(); ui32 curPathState = self.GetPathState(); UNIT_ASSERT_STRINGS_EQUAL(curName, name); - UNIT_ASSERT_VALUES_EQUAL(curPathState, (ui32)pathState); + UNIT_ASSERT_VALUES_EQUAL(curPathState, (ui32)NKikimrSchemeOp::EPathState::EPathStateNoChanges); UNIT_ASSERT(descr.HasTable()); NKikimrSchemeOp::TTableDescription table = descr.GetTable(); UNIT_ASSERT(table.ColumnsSize()); + UNIT_ASSERT(!strictCount || columns.size() - droppedColumns.size() == table.ColumnsSize()); for (auto& col : table.GetColumns()) { UNIT_ASSERT(col.HasName()); UNIT_ASSERT(col.HasId()); @@ -499,6 +499,7 @@ TCheckFunc CheckColumns(const TString& name, const TSet& columns, const UNIT_ASSERT(!droppedColumns.contains(name)); } + UNIT_ASSERT(!strictCount || keyColumns.size() == table.KeyColumnNamesSize()); for (auto& keyName : table.GetKeyColumnNames()) { UNIT_ASSERT(keyColumns.contains(keyName)); } @@ -830,6 +831,26 @@ TCheckFunc IndexDataColumns(const TVector& dataColumnNames) { }; } +TCheckFunc VectorIndexDescription(Ydb::Table::VectorIndexSettings_Distance dist, + Ydb::Table::VectorIndexSettings_Similarity similarity, + 
Ydb::Table::VectorIndexSettings_VectorType vectorType, + ui32 vectorDimension + ) { + return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) { + if (record.GetPathDescription().GetTableIndex().HasVectorIndexKmeansTreeDescription()) { + const auto& settings = record.GetPathDescription().GetTableIndex().GetVectorIndexKmeansTreeDescription().GetSettings(); + UNIT_ASSERT_VALUES_EQUAL(settings.distance(), dist); + UNIT_ASSERT_VALUES_EQUAL(settings.similarity(), similarity); + UNIT_ASSERT_VALUES_EQUAL(settings.vector_type(), vectorType); + UNIT_ASSERT_VALUES_EQUAL(settings.vector_dimension(), vectorDimension); + } else { + UNIT_FAIL("oneof SpecializedIndexDescription should be set."); + } + }; + +} + + TCheckFunc SequenceName(const TString& name) { return [=] (const NKikimrScheme::TEvDescribeSchemeResult& record) { UNIT_ASSERT_VALUES_EQUAL(record.GetPathDescription().GetSequenceDescription().GetName(), name); diff --git a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h index 3cb8f39c5754..363422c3a12e 100644 --- a/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h +++ b/ydb/core/tx/schemeshard/ut_helpers/ls_checks.h @@ -9,6 +9,8 @@ #include #include +#include + #include namespace NSchemeShardUT_Private { @@ -94,8 +96,7 @@ namespace NLs { void IsExternalDataSource(const NKikimrScheme::TEvDescribeSchemeResult& record); void IsView(const NKikimrScheme::TEvDescribeSchemeResult& record); void IsResourcePool(const NKikimrScheme::TEvDescribeSchemeResult& record); - TCheckFunc CheckColumns(const TString& name, const TSet& columns, const TSet& droppedColumns, const TSet keyColumns, - NKikimrSchemeOp::EPathState pathState = NKikimrSchemeOp::EPathState::EPathStateNoChanges); + TCheckFunc CheckColumns(const TString& name, const TSet& columns, const TSet& droppedColumns, const TSet keyColumns, bool strictCount = false); void CheckBoundaries(const NKikimrScheme::TEvDescribeSchemeResult& record); TCheckFunc PartitionCount(ui32 
count); TCheckFunc PartitionKeys(TVector lastShardKeys); @@ -138,6 +139,12 @@ namespace NLs { TCheckFunc IndexState(NKikimrSchemeOp::EIndexState state); TCheckFunc IndexKeys(const TVector& keyNames); TCheckFunc IndexDataColumns(const TVector& dataColumnNames); + + TCheckFunc VectorIndexDescription(Ydb::Table::VectorIndexSettings_Distance dist, + Ydb::Table::VectorIndexSettings_Similarity similarity, + Ydb::Table::VectorIndexSettings_VectorType vectorType, + ui32 vectorDimension + ); TCheckFunc SequenceName(const TString& name); TCheckFunc SequenceIncrement(i64 increment); diff --git a/ydb/core/tx/schemeshard/ut_index/ut_vector_index.cpp b/ydb/core/tx/schemeshard/ut_index/ut_vector_index.cpp new file mode 100644 index 000000000000..c3f6914b961f --- /dev/null +++ b/ydb/core/tx/schemeshard/ut_index/ut_vector_index.cpp @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace NKikimr; +using namespace NSchemeShard; +using namespace NSchemeShardUT_Private; +using namespace NKikimr::NTableIndex::NTableVectorKmeansTreeIndex; + +Y_UNIT_TEST_SUITE(TVectorIndexTests) { + Y_UNIT_TEST(CreateTable) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TableDescription { + Name: "vectors" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "embedding" Type: "String" } + Columns { Name: "covered" Type: "String" } + Columns { Name: "another" Type: "String" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_vector" + KeyColumnNames: ["embedding"] + DataColumnNames: ["covered"] + Type: EIndexTypeGlobalVectorKmeansTree + VectorIndexKmeansTreeDescription: { Settings : { distance: DISTANCE_COSINE, vector_type: VECTOR_TYPE_FLOAT, vector_dimension: 1024 } } + } + )"); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector"), + { NLs::PathExist, + 
NLs::IndexType(NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree), + NLs::IndexState(NKikimrSchemeOp::EIndexStateReady), + NLs::IndexKeys({"embedding"}), + NLs::IndexDataColumns({"covered"}), + NLs::VectorIndexDescription(Ydb::Table::VectorIndexSettings::DISTANCE_COSINE, + Ydb::Table::VectorIndexSettings::SIMILARITY_UNSPECIFIED, + Ydb::Table::VectorIndexSettings::VECTOR_TYPE_FLOAT, + 1024 + ), + }); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector/indexImplLevelTable"), + { NLs::PathExist, + NLs::CheckColumns(LevelTable, {LevelTable_ParentIdColumn, LevelTable_IdColumn, LevelTable_EmbeddingColumn}, {}, {LevelTable_ParentIdColumn, LevelTable_IdColumn}, true) }); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector/indexImplPostingTable"), + { NLs::PathExist, + NLs::CheckColumns(PostingTable, {PostingTable_ParentIdColumn, "id", "covered"}, {}, {PostingTable_ParentIdColumn, "id"}, true) }); + + + TVector dropTxIds; + TestDropTable(runtime, dropTxIds.emplace_back(++txId), "/MyRoot", "vectors"); + env.TestWaitNotification(runtime, dropTxIds); + } + + Y_UNIT_TEST(CreateTableCoveredEmbedding) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TableDescription { + Name: "vectors" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "embedding" Type: "String" } + Columns { Name: "another" Type: "String" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_vector" + KeyColumnNames: ["embedding"] + DataColumnNames: ["embedding"] + Type: EIndexTypeGlobalVectorKmeansTree + VectorIndexKmeansTreeDescription: { Settings : { distance: DISTANCE_COSINE, vector_type: VECTOR_TYPE_FLOAT, vector_dimension: 1024 } } + } + )"); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector"), + { NLs::PathExist, + 
NLs::IndexType(NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree), + NLs::IndexState(NKikimrSchemeOp::EIndexStateReady), + NLs::IndexKeys({"embedding"}), + NLs::IndexDataColumns({"embedding"}), + NLs::VectorIndexDescription(Ydb::Table::VectorIndexSettings::DISTANCE_COSINE, + Ydb::Table::VectorIndexSettings::SIMILARITY_UNSPECIFIED, + Ydb::Table::VectorIndexSettings::VECTOR_TYPE_FLOAT, + 1024 + ), + }); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector/indexImplLevelTable"), + { NLs::PathExist, + NLs::CheckColumns(LevelTable, {LevelTable_ParentIdColumn, LevelTable_IdColumn, LevelTable_EmbeddingColumn}, {}, {LevelTable_ParentIdColumn, LevelTable_IdColumn}, true) }); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector/indexImplPostingTable"), + { NLs::PathExist, + NLs::CheckColumns(PostingTable, {PostingTable_ParentIdColumn, "id", "embedding"}, {}, {PostingTable_ParentIdColumn, "id"}, true) }); + } + + Y_UNIT_TEST(CreateTableMultiColumn) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TableDescription { + Name: "vectors" + Columns { Name: "id1" Type: "String" } + Columns { Name: "id2" Type: "String" } + Columns { Name: "embedding" Type: "String" } + Columns { Name: "covered1" Type: "String" } + Columns { Name: "covered2" Type: "String" } + Columns { Name: "another1" Type: "String" } + Columns { Name: "another2" Type: "String" } + KeyColumnNames: ["id1", "id2"] + } + IndexDescription { + Name: "idx_vector" + KeyColumnNames: ["embedding"] + DataColumnNames: ["covered1", "covered2"] + Type: EIndexTypeGlobalVectorKmeansTree + VectorIndexKmeansTreeDescription: { Settings : { distance: DISTANCE_COSINE, vector_type: VECTOR_TYPE_FLOAT, vector_dimension: 1024 } } + } + )"); + env.TestWaitNotification(runtime, txId); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector"), + { NLs::PathExist, + 
NLs::IndexType(NKikimrSchemeOp::EIndexTypeGlobalVectorKmeansTree), + NLs::IndexState(NKikimrSchemeOp::EIndexStateReady), + NLs::IndexKeys({"embedding"}), + NLs::IndexDataColumns({"covered1", "covered2"}), + }); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector/indexImplLevelTable"), + { NLs::PathExist, + NLs::CheckColumns(LevelTable, {LevelTable_ParentIdColumn, LevelTable_IdColumn, LevelTable_EmbeddingColumn}, {}, {LevelTable_ParentIdColumn, LevelTable_IdColumn}, true) }); + + TestDescribeResult(DescribePrivatePath(runtime, "/MyRoot/vectors/idx_vector/indexImplPostingTable"), + { NLs::PathExist, + NLs::CheckColumns(PostingTable, {PostingTable_ParentIdColumn, "id1", "id2", "covered1", "covered2"}, {}, {PostingTable_ParentIdColumn, "id1", "id2"}, true) }); + } + + + Y_UNIT_TEST(CreateTableWithError) { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + ui64 txId = 100; + + // base table column should not contains reserved name '-parent' + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TableDescription { + Name: "vectors" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "-parent" Type: "String" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_vector" + KeyColumnNames: ["-parent"] + Type: EIndexTypeGlobalVectorKmeansTree + VectorIndexKmeansTreeDescription: { Settings : { distance: DISTANCE_COSINE, vector_type: VECTOR_TYPE_FLOAT, vector_dimension: 1024 } } + } + )", {NKikimrScheme::StatusInvalidParameter}); + + // pk should not be covered + TestCreateIndexedTable(runtime, ++txId, "/MyRoot", R"( + TableDescription { + Name: "vectors" + Columns { Name: "id" Type: "Uint64" } + Columns { Name: "embedding" Type: "String" } + KeyColumnNames: ["id"] + } + IndexDescription { + Name: "idx_vector" + KeyColumnNames: ["embedding"] + DataColumnNames: ["id"] + Type: EIndexTypeGlobalVectorKmeansTree + VectorIndexKmeansTreeDescription: { Settings : { distance: DISTANCE_COSINE, vector_type: VECTOR_TYPE_FLOAT, 
vector_dimension: 1024 } } + } + )", {NKikimrScheme::StatusInvalidParameter}); + } +} diff --git a/ydb/core/tx/schemeshard/ut_index/ya.make b/ydb/core/tx/schemeshard/ut_index/ya.make index ed4a5fdd9f7b..3c3a129aef5f 100644 --- a/ydb/core/tx/schemeshard/ut_index/ya.make +++ b/ydb/core/tx/schemeshard/ut_index/ya.make @@ -24,6 +24,7 @@ PEERDIR( SRCS( ut_async_index.cpp ut_unique_index.cpp + ut_vector_index.cpp ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/ydb_convert/table_description.cpp b/ydb/core/ydb_convert/table_description.cpp index a1a0eb6e0ccb..8952209ddeeb 100644 --- a/ydb/core/ydb_convert/table_description.cpp +++ b/ydb/core/ydb_convert/table_description.cpp @@ -822,13 +822,8 @@ void FillPartitioningSettingsImpl(TYdbProto& out, } void FillGlobalIndexSettings(Ydb::Table::GlobalIndexSettings& settings, - const google::protobuf::RepeatedPtrField& indexImplTables + const NKikimrSchemeOp::TTableDescription& indexImplTableDescription ) { - if (indexImplTables.empty()) { - return; - } - const auto& indexImplTableDescription = indexImplTables.Get(0); - if (indexImplTableDescription.SplitBoundarySize()) { NKikimrMiniKQL::TType splitKeyType; Ydb::Table::DescribeTableResult unused; @@ -865,20 +860,33 @@ void FillIndexDescriptionImpl(TYdbProto& out, const NKikimrSchemeOp::TTableDescr case NKikimrSchemeOp::EIndexType::EIndexTypeGlobal: FillGlobalIndexSettings( *index->mutable_global_index()->mutable_settings(), - tableIndex.GetIndexImplTableDescriptions() + tableIndex.GetIndexImplTableDescriptions(0) ); break; case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync: FillGlobalIndexSettings( *index->mutable_global_async_index()->mutable_settings(), - tableIndex.GetIndexImplTableDescriptions() + tableIndex.GetIndexImplTableDescriptions(0) ); break; case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique: FillGlobalIndexSettings( *index->mutable_global_unique_index()->mutable_settings(), - tableIndex.GetIndexImplTableDescriptions() + 
tableIndex.GetIndexImplTableDescriptions(0) + ); + break; + case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree: + FillGlobalIndexSettings( + *index->mutable_global_vector_kmeans_tree_index()->mutable_level_table_settings(), + tableIndex.GetIndexImplTableDescriptions(0) ); + FillGlobalIndexSettings( + *index->mutable_global_vector_kmeans_tree_index()->mutable_posting_table_settings(), + tableIndex.GetIndexImplTableDescriptions(1) + ); + + *index->mutable_global_vector_kmeans_tree_index()->mutable_vector_settings() = tableIndex.GetVectorIndexKmeansTreeDescription().GetSettings(); + break; default: break; @@ -933,6 +941,7 @@ bool FillIndexDescription(NKikimrSchemeOp::TIndexedTableCreationConfig& out, } // specific fields + std::vector indexImplTableDescriptionsVector; switch (index.type_case()) { case Ydb::Table::TableIndex::kGlobalIndex: indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobal); @@ -946,15 +955,21 @@ bool FillIndexDescription(NKikimrSchemeOp::TIndexedTableCreationConfig& out, indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique); break; + case Ydb::Table::TableIndex::kGlobalVectorKmeansTreeIndex: + indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree); + *indexDesc->MutableVectorIndexKmeansTreeDescription()->MutableSettings() = index.global_vector_kmeans_tree_index().vector_settings(); + break; + default: // pass through // TODO: maybe return BAD_REQUEST? 
break; } - if (!FillIndexTablePartitioning(*indexDesc->AddIndexImplTableDescriptions(), index, status, error)) { + if (!FillIndexTablePartitioning(indexImplTableDescriptionsVector, index, status, error)) { return false; } + *indexDesc->MutableIndexImplTableDescriptions() = {indexImplTableDescriptionsVector.begin(), indexImplTableDescriptionsVector.end()}; } return true; diff --git a/ydb/core/ydb_convert/table_settings.cpp b/ydb/core/ydb_convert/table_settings.cpp index 9ee281f3f648..324fefbca031 100644 --- a/ydb/core/ydb_convert/table_settings.cpp +++ b/ydb/core/ydb_convert/table_settings.cpp @@ -405,18 +405,21 @@ bool FillAlterTableSettingsDesc(NKikimrSchemeOp::TTableDescription& tableDesc, } bool FillIndexTablePartitioning( - NKikimrSchemeOp::TTableDescription& out, + std::vector& indexImplTableDescriptions, const Ydb::Table::TableIndex& index, Ydb::StatusIds::StatusCode& code, TString& error ) { - auto fillIndexPartitioning = [&](const Ydb::Table::GlobalIndexSettings& settings) { + auto fillIndexPartitioning = [&](const Ydb::Table::GlobalIndexSettings& settings, std::vector& indexImplTableDescriptions) { + indexImplTableDescriptions.push_back({}); + auto& indexImplTableDescription = indexImplTableDescriptions.back(); + if (settings.has_partitioning_settings()) { - if (!FillPartitioningPolicy(*out.MutablePartitionConfig(), settings, code, error)) { + if (!FillPartitioningPolicy(*indexImplTableDescription.MutablePartitionConfig(), settings, code, error)) { return false; } } if (settings.partitions_case() != Ydb::Table::GlobalIndexSettings::PARTITIONS_NOT_SET) { - if (!FillPartitions(out, settings, code, error)) { + if (!FillPartitions(indexImplTableDescription, settings, code, error)) { return false; } } @@ -425,25 +428,34 @@ bool FillIndexTablePartitioning( switch (index.type_case()) { case Ydb::Table::TableIndex::kGlobalIndex: - if (!fillIndexPartitioning(index.global_index().settings())) { + if (!fillIndexPartitioning(index.global_index().settings(), 
indexImplTableDescriptions)) { return false; } break; case Ydb::Table::TableIndex::kGlobalAsyncIndex: - if (!fillIndexPartitioning(index.global_async_index().settings())) { + if (!fillIndexPartitioning(index.global_async_index().settings(), indexImplTableDescriptions)) { return false; } break; case Ydb::Table::TableIndex::kGlobalUniqueIndex: - if (!fillIndexPartitioning(index.global_unique_index().settings())) { + if (!fillIndexPartitioning(index.global_unique_index().settings(), indexImplTableDescriptions)) { + return false; + } + break; + + case Ydb::Table::TableIndex::kGlobalVectorKmeansTreeIndex: + if (!fillIndexPartitioning(index.global_vector_kmeans_tree_index().level_table_settings(), indexImplTableDescriptions)) { + return false; + } + if (!fillIndexPartitioning(index.global_vector_kmeans_tree_index().posting_table_settings(), indexImplTableDescriptions)) { return false; } break; case Ydb::Table::TableIndex::TYPE_NOT_SET: - break; + break; } return true; diff --git a/ydb/core/ydb_convert/table_settings.h b/ydb/core/ydb_convert/table_settings.h index 46713ceefa36..49c36c0f6820 100644 --- a/ydb/core/ydb_convert/table_settings.h +++ b/ydb/core/ydb_convert/table_settings.h @@ -70,7 +70,7 @@ bool FillTtlSettings(TTtlSettingsEnabled& out, const Ydb::Table::TtlSettings& in } bool FillIndexTablePartitioning( - NKikimrSchemeOp::TTableDescription& out, + std::vector& indexImplTableDescriptions, const Ydb::Table::TableIndex& index, Ydb::StatusIds::StatusCode& code, TString& error); diff --git a/ydb/public/api/protos/out/out.cpp b/ydb/public/api/protos/out/out.cpp index c8c5c7abfaff..c169fef4c329 100644 --- a/ydb/public/api/protos/out/out.cpp +++ b/ydb/public/api/protos/out/out.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -29,3 +30,15 @@ Y_DECLARE_OUT_SPEC(, Ydb::Export::ExportProgress::Progress, stream, value) { Y_DECLARE_OUT_SPEC(, Ydb::Import::ImportProgress::Progress, stream, value) { stream << 
Ydb::Import::ImportProgress_Progress_Name(value); } + +Y_DECLARE_OUT_SPEC(, Ydb::Table::VectorIndexSettings::Distance, stream, value) { + stream << Ydb::Table::VectorIndexSettings::Distance_Name(value); +} + +Y_DECLARE_OUT_SPEC(, Ydb::Table::VectorIndexSettings::Similarity, stream, value) { + stream << Ydb::Table::VectorIndexSettings::Similarity_Name(value); +} + +Y_DECLARE_OUT_SPEC(, Ydb::Table::VectorIndexSettings::VectorType, stream, value) { + stream << Ydb::Table::VectorIndexSettings::VectorType_Name(value); +} diff --git a/ydb/public/api/protos/ydb_table.proto b/ydb/public/api/protos/ydb_table.proto index a698b71c8d66..bd2ce670da82 100644 --- a/ydb/public/api/protos/ydb_table.proto +++ b/ydb/public/api/protos/ydb_table.proto @@ -59,6 +59,37 @@ message GlobalIndexSettings { PartitioningSettings partitioning_settings = 3; } +message VectorIndexSettings { + enum Distance { + DISTANCE_UNSPECIFIED = 0; + DISTANCE_COSINE = 1; + DISTANCE_MANHATTAN = 2; + DISTANCE_EUCLIDEAN = 3; + } + + enum Similarity { + SIMILARITY_UNSPECIFIED = 0; + SIMILARITY_COSINE = 1; + SIMILARITY_INNER_PRODUCT = 2; + } + + enum VectorType { + VECTOR_TYPE_UNSPECIFIED = 0; + VECTOR_TYPE_FLOAT = 1; + VECTOR_TYPE_UINT8 = 2; + VECTOR_TYPE_INT8 = 3; + VECTOR_TYPE_BIT = 4; + } + + oneof metric { + Distance distance = 1; + Similarity similarity = 2; + } + VectorType vector_type = 3; + + uint32 vector_dimension = 4; +} + message GlobalIndex { GlobalIndexSettings settings = 1; } @@ -71,6 +102,12 @@ message GlobalUniqueIndex { GlobalIndexSettings settings = 1; } +message GlobalVectorKMeansTreeIndex { + GlobalIndexSettings level_table_settings = 1; + GlobalIndexSettings posting_table_settings = 2; + VectorIndexSettings vector_settings = 3; +} + // Represent secondary index message TableIndex { // Name of index @@ -82,6 +119,7 @@ message TableIndex { GlobalIndex global_index = 3; GlobalAsyncIndex global_async_index = 4; GlobalUniqueIndex global_unique_index = 6; + GlobalVectorKMeansTreeIndex 
global_vector_kmeans_tree_index = 7; } // list of columns content to be copied in to index table repeated string data_columns = 5; @@ -105,6 +143,7 @@ message TableIndexDescription { GlobalIndex global_index = 3; GlobalAsyncIndex global_async_index = 5; GlobalUniqueIndex global_unique_index = 8; + GlobalVectorKMeansTreeIndex global_vector_kmeans_tree_index = 9; } Status status = 4; // list of columns content to be copied in to index table diff --git a/ydb/public/sdk/cpp/client/ydb_table/out.cpp b/ydb/public/sdk/cpp/client/ydb_table/out.cpp index 1f16dc74bf6c..b00212884f79 100644 --- a/ydb/public/sdk/cpp/client/ydb_table/out.cpp +++ b/ydb/public/sdk/cpp/client/ydb_table/out.cpp @@ -23,3 +23,69 @@ Y_DECLARE_OUT_SPEC(, NYdb::NTable::TCreateSessionResult, o, x) { Y_DECLARE_OUT_SPEC(, NYdb::NTable::TDescribeTableResult, o, x) { return x.Out(o); } + +Y_DECLARE_OUT_SPEC(, NYdb::NTable::TVectorIndexSettings::EDistance, stream, value) { + auto convertDistance = [] (auto value) -> auto { + switch (value) { + case NYdb::NTable::TVectorIndexSettings::EDistance::Cosine: + return "COSINE"; + case NYdb::NTable::TVectorIndexSettings::EDistance::Manhattan: + return "MANHATTAN"; + case NYdb::NTable::TVectorIndexSettings::EDistance::Euclidean: + return "EUCLIDEAN"; + case NYdb::NTable::TVectorIndexSettings::EDistance::Unknown: + return "UNKNOWN"; + } + }; + + stream << convertDistance(value); +} + +Y_DECLARE_OUT_SPEC(, NYdb::NTable::TVectorIndexSettings::ESimilarity, stream, value) { + auto convertSimilarity = [] (auto value) -> auto { + switch (value) { + case NYdb::NTable::TVectorIndexSettings::ESimilarity::Cosine: + return "COSINE"; + case NYdb::NTable::TVectorIndexSettings::ESimilarity::InnerProduct: + return "INNER_PRODUCT"; + case NYdb::NTable::TVectorIndexSettings::ESimilarity::Unknown: + return "UNKNOWN"; + } + }; + + stream << convertSimilarity(value); +} + +Y_DECLARE_OUT_SPEC(, NYdb::NTable::TVectorIndexSettings::EVectorType, stream, value) { + auto convertVectorType = [] 
(auto value) -> auto { + switch (value) { + case NYdb::NTable::TVectorIndexSettings::EVectorType::Float: + return "FLOAT"; + case NYdb::NTable::TVectorIndexSettings::EVectorType::Uint8: + return "UINT8"; + case NYdb::NTable::TVectorIndexSettings::EVectorType::Int8: + return "INT8"; + case NYdb::NTable::TVectorIndexSettings::EVectorType::Bit: + return "BIT"; + case NYdb::NTable::TVectorIndexSettings::EVectorType::Unknown: + return "UNKNOWN"; + } + }; + + stream << convertVectorType(value); +} + +Y_DECLARE_OUT_SPEC(, NYdb::NTable::TVectorIndexSettings, stream, value) { + stream << "{"; + + if (const auto* distance = std::get_if(&value.Metric)) { + stream << " distance: " << *distance << ""; + } else if (const auto* similarity = std::get_if(&value.Metric)) { + stream << " similarity: " << *similarity << ""; + } + + stream << ", vector_type: " << value.VectorType << ""; + stream << ", vector_dimension: " << value.VectorDimension << ""; + + stream << " }"; +} \ No newline at end of file diff --git a/ydb/public/sdk/cpp/client/ydb_table/table.cpp b/ydb/public/sdk/cpp/client/ydb_table/table.cpp index 7067585f5663..a340d5dbbf84 100644 --- a/ydb/public/sdk/cpp/client/ydb_table/table.cpp +++ b/ydb/public/sdk/cpp/client/ydb_table/table.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -468,6 +469,14 @@ class TTableDescription::TImpl { Indexes_.emplace_back(TIndexDescription(indexName, type, indexColumns, dataColumns)); } + void AddVectorIndex(const TString& indexName, EIndexType type, const TVector& indexColumns, const TVectorIndexSettings& vectorIndexSettings) { + Indexes_.emplace_back(TIndexDescription(indexName, type, indexColumns, {}, {}, vectorIndexSettings)); + } + + void AddVectorIndex(const TString& indexName, EIndexType type, const TVector& indexColumns, const TVector& dataColumns, const TVectorIndexSettings& vectorIndexSettings) { + Indexes_.emplace_back(TIndexDescription(indexName, type, indexColumns, dataColumns, {}, vectorIndexSettings)); + } 
+ void AddChangefeed(const TString& name, EChangefeedMode mode, EChangefeedFormat format) { Changefeeds_.emplace_back(name, mode, format); } @@ -764,6 +773,14 @@ void TTableDescription::AddUniqueSecondaryIndex(const TString& indexName, const AddSecondaryIndex(indexName, EIndexType::GlobalUnique, indexColumns, dataColumns); } +void TTableDescription::AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVectorIndexSettings& vectorIndexSettings) { + Impl_->AddVectorIndex(indexName, EIndexType::GlobalVectorKMeansTree, indexColumns, vectorIndexSettings); +} + +void TTableDescription::AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVector& dataColumns, const TVectorIndexSettings& vectorIndexSettings) { + Impl_->AddVectorIndex(indexName, EIndexType::GlobalVectorKMeansTree, indexColumns, dataColumns, vectorIndexSettings); +} + void TTableDescription::AddSecondaryIndex(const TString& indexName, const TVector& indexColumns) { AddSyncSecondaryIndex(indexName, indexColumns); } @@ -1211,6 +1228,16 @@ TTableBuilder& TTableBuilder::AddUniqueSecondaryIndex(const TString& indexName, return AddSecondaryIndex(indexName, EIndexType::GlobalUnique, indexColumns); } +TTableBuilder& TTableBuilder::AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVector& dataColumns, const TVectorIndexSettings& vectorIndexSettings) { + TableDescription_.AddVectorKMeansTreeSecondaryIndex(indexName, indexColumns, dataColumns, vectorIndexSettings); + return *this; +} + +TTableBuilder& TTableBuilder::AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVectorIndexSettings& vectorIndexSettings) { + TableDescription_.AddVectorKMeansTreeSecondaryIndex(indexName, indexColumns, vectorIndexSettings); + return *this; +} + TTableBuilder& TTableBuilder::AddSecondaryIndex(const TString& indexName, const TString& indexColumn) { return 
AddSyncSecondaryIndex(indexName, indexColumn); } @@ -2215,20 +2242,22 @@ TIndexDescription::TIndexDescription( EIndexType type, const TVector& indexColumns, const TVector& dataColumns, - const TGlobalIndexSettings& settings + const TVector& globalIndexSettings, + const std::optional& vectorIndexSettings ) : IndexName_(name) , IndexType_(type) , IndexColumns_(indexColumns) , DataColumns_(dataColumns) - , GlobalIndexSettings_(settings) + , GlobalIndexSettings_(globalIndexSettings) + , VectorIndexSettings_(vectorIndexSettings) {} TIndexDescription::TIndexDescription( const TString& name, const TVector& indexColumns, const TVector& dataColumns, - const TGlobalIndexSettings& settings -) : TIndexDescription(name, EIndexType::GlobalSync, indexColumns, dataColumns, settings) + const TVector& globalIndexSettings +) : TIndexDescription(name, EIndexType::GlobalSync, indexColumns, dataColumns, globalIndexSettings) {} TIndexDescription::TIndexDescription(const Ydb::Table::TableIndex& tableIndex) @@ -2255,6 +2284,10 @@ const TVector& TIndexDescription::GetDataColumns() const { return DataColumns_; } +const std::optional& TIndexDescription::GetVectorIndexSettings() const { + return VectorIndexSettings_; +} + ui64 TIndexDescription::GetSizeBytes() const { return SizeBytes; } @@ -2292,12 +2325,128 @@ void TGlobalIndexSettings::SerializeTo(Ydb::Table::GlobalIndexSettings& settings std::visit(std::move(variantVisitor), Partitions); } +template +TVectorIndexSettings TVectorIndexSettings::FromProto(const TProto& proto) { + auto convertDistance = [] (auto distance) -> auto { + switch (distance) { + case Ydb::Table::VectorIndexSettings::DISTANCE_COSINE: + return EDistance::Cosine; + case Ydb::Table::VectorIndexSettings::DISTANCE_MANHATTAN: + return EDistance::Manhattan; + case Ydb::Table::VectorIndexSettings::DISTANCE_EUCLIDEAN: + return EDistance::Euclidean; + default: + return EDistance::Unknown; + } + }; + + auto convertSimilarity = [] (auto similarity) -> auto { + switch (similarity) 
{ + case Ydb::Table::VectorIndexSettings::SIMILARITY_COSINE: + return ESimilarity::Cosine; + case Ydb::Table::VectorIndexSettings::SIMILARITY_INNER_PRODUCT: + return ESimilarity::InnerProduct; + default: + return ESimilarity::Unknown; + } + }; + + auto convertVectorType = [] (auto vectorType) -> auto { + switch (vectorType) { + case Ydb::Table::VectorIndexSettings::VECTOR_TYPE_FLOAT: + return EVectorType::Float; + case Ydb::Table::VectorIndexSettings::VECTOR_TYPE_UINT8: + return EVectorType::Uint8; + case Ydb::Table::VectorIndexSettings::VECTOR_TYPE_INT8: + return EVectorType::Int8; + case Ydb::Table::VectorIndexSettings::VECTOR_TYPE_BIT: + return EVectorType::Bit; + default: + return EVectorType::Unknown; + } + }; + + + auto metricFromProto = [&](const auto& proto) -> TVectorIndexSettings::TMetric { + switch (proto.metric_case()) { + case TProto::kDistance: + return convertDistance(proto.distance()); + case TProto::kSimilarity: + return convertSimilarity(proto.similarity()); + default: + return {}; + } + }; + + return { + .Metric = metricFromProto(proto), + .VectorType = convertVectorType(proto.vector_type()), + .VectorDimension = proto.vector_dimension() + }; +} + +void TVectorIndexSettings::SerializeTo(Ydb::Table::VectorIndexSettings& settings) const { + auto convertDistance = [] (auto distance) -> auto { + switch (distance) { + case EDistance::Cosine: + return Ydb::Table::VectorIndexSettings::DISTANCE_COSINE; + case EDistance::Manhattan: + return Ydb::Table::VectorIndexSettings::DISTANCE_MANHATTAN; + case EDistance::Euclidean: + return Ydb::Table::VectorIndexSettings::DISTANCE_EUCLIDEAN; + case EDistance::Unknown: + return Ydb::Table::VectorIndexSettings::DISTANCE_UNSPECIFIED; + } + }; + + auto convertSimilarity = [] (auto similarity) -> auto { + switch (similarity) { + case ESimilarity::Cosine: + return Ydb::Table::VectorIndexSettings::SIMILARITY_COSINE; + case ESimilarity::InnerProduct: + return Ydb::Table::VectorIndexSettings::SIMILARITY_INNER_PRODUCT; + 
case ESimilarity::Unknown: + return Ydb::Table::VectorIndexSettings::SIMILARITY_UNSPECIFIED; + } + }; + + auto convertVectorType = [] (auto vectorType) -> auto { + switch (vectorType) { + case EVectorType::Float: + return Ydb::Table::VectorIndexSettings::VECTOR_TYPE_FLOAT; + case EVectorType::Uint8: + return Ydb::Table::VectorIndexSettings::VECTOR_TYPE_UINT8; + case EVectorType::Int8: + return Ydb::Table::VectorIndexSettings::VECTOR_TYPE_INT8; + case EVectorType::Bit: + return Ydb::Table::VectorIndexSettings::VECTOR_TYPE_BIT; + case EVectorType::Unknown: + return Ydb::Table::VectorIndexSettings::VECTOR_TYPE_UNSPECIFIED; + } + }; + + + if (const auto* distance = std::get_if(&Metric)) { + settings.set_distance(convertDistance(*distance)); + } else if (const auto* similarity = std::get_if(&Metric)) { + settings.set_similarity(convertSimilarity(*similarity)); + } + + settings.set_vector_type(convertVectorType(VectorType)); + settings.set_vector_dimension(VectorDimension); +} + +void TVectorIndexSettings::Out(IOutputStream& o) const { + o << *this; +} + template TIndexDescription TIndexDescription::FromProto(const TProto& proto) { EIndexType type; TVector indexColumns; TVector dataColumns; - TGlobalIndexSettings globalIndexSettings; + TVector globalIndexSettings; + std::optional vectorIndexSettings; indexColumns.assign(proto.index_columns().begin(), proto.index_columns().end()); dataColumns.assign(proto.data_columns().begin(), proto.data_columns().end()); @@ -2305,22 +2454,31 @@ TIndexDescription TIndexDescription::FromProto(const TProto& proto) { switch (proto.type_case()) { case TProto::kGlobalIndex: type = EIndexType::GlobalSync; - globalIndexSettings = TGlobalIndexSettings::FromProto(proto.global_index().settings()); + globalIndexSettings.emplace_back(TGlobalIndexSettings::FromProto(proto.global_index().settings())); break; case TProto::kGlobalAsyncIndex: type = EIndexType::GlobalAsync; - globalIndexSettings = 
TGlobalIndexSettings::FromProto(proto.global_async_index().settings()); + globalIndexSettings.emplace_back(TGlobalIndexSettings::FromProto(proto.global_async_index().settings())); break; case TProto::kGlobalUniqueIndex: type = EIndexType::GlobalUnique; - globalIndexSettings = TGlobalIndexSettings::FromProto(proto.global_unique_index().settings()); + globalIndexSettings.emplace_back(TGlobalIndexSettings::FromProto(proto.global_unique_index().settings())); + break; + case TProto::kGlobalVectorKmeansTreeIndex: { + type = EIndexType::GlobalVectorKMeansTree; + const auto &vectorProto = proto.global_vector_kmeans_tree_index(); + globalIndexSettings.emplace_back(TGlobalIndexSettings::FromProto(vectorProto.level_table_settings())); + globalIndexSettings.emplace_back(TGlobalIndexSettings::FromProto(vectorProto.posting_table_settings())); + vectorIndexSettings = TVectorIndexSettings::FromProto(vectorProto.vector_settings()); break; + } default: // fallback to global sync type = EIndexType::GlobalSync; + globalIndexSettings.resize(1); break; } - auto result = TIndexDescription(proto.name(), type, indexColumns, dataColumns, globalIndexSettings); + auto result = TIndexDescription(proto.name(), type, indexColumns, dataColumns, globalIndexSettings, vectorIndexSettings); if constexpr (std::is_same_v) { result.SizeBytes = proto.size_bytes(); } @@ -2337,15 +2495,38 @@ void TIndexDescription::SerializeTo(Ydb::Table::TableIndex& proto) const { *proto.mutable_data_columns() = {DataColumns_.begin(), DataColumns_.end()}; switch (IndexType_) { - case EIndexType::GlobalSync: - GlobalIndexSettings_.SerializeTo(*proto.mutable_global_index()->mutable_settings()); + case EIndexType::GlobalSync: { + auto& settings = *proto.mutable_global_index()->mutable_settings(); + if (GlobalIndexSettings_.size() == 1) + GlobalIndexSettings_[0].SerializeTo(settings); break; - case EIndexType::GlobalAsync: - GlobalIndexSettings_.SerializeTo(*proto.mutable_global_async_index()->mutable_settings()); + } + case 
EIndexType::GlobalAsync: { + auto& settings = *proto.mutable_global_async_index()->mutable_settings(); + if (GlobalIndexSettings_.size() == 1) + GlobalIndexSettings_[0].SerializeTo(settings); break; - case EIndexType::GlobalUnique: - GlobalIndexSettings_.SerializeTo(*proto.mutable_global_unique_index()->mutable_settings()); + } + case EIndexType::GlobalUnique: { + auto& settings = *proto.mutable_global_unique_index()->mutable_settings(); + if (GlobalIndexSettings_.size() == 1) + GlobalIndexSettings_[0].SerializeTo(settings); break; + } + case EIndexType::GlobalVectorKMeansTree: { + auto* global_vector_kmeans_tree_index = proto.mutable_global_vector_kmeans_tree_index(); + auto& level_settings = *global_vector_kmeans_tree_index->mutable_level_table_settings(); + auto& posting_settings = *global_vector_kmeans_tree_index->mutable_posting_table_settings(); + auto& vector_settings = *global_vector_kmeans_tree_index->mutable_vector_settings(); + if (GlobalIndexSettings_.size() == 2) { + GlobalIndexSettings_[0].SerializeTo(level_settings); + GlobalIndexSettings_[1].SerializeTo(posting_settings); + } + if (VectorIndexSettings_) { + VectorIndexSettings_->SerializeTo(vector_settings); + } + break; + } case EIndexType::Unknown: break; } @@ -2367,6 +2548,9 @@ void TIndexDescription::Out(IOutputStream& o) const { o << ", data_columns: [" << JoinSeq(", ", DataColumns_) << "]"; } + if (VectorIndexSettings_) { + o << ", vector_settings: " << *VectorIndexSettings_ << ""; + } o << " }"; } diff --git a/ydb/public/sdk/cpp/client/ydb_table/table.h b/ydb/public/sdk/cpp/client/ydb_table/table.h index 4e20f5259f38..9284665cd76b 100644 --- a/ydb/public/sdk/cpp/client/ydb_table/table.h +++ b/ydb/public/sdk/cpp/client/ydb_table/table.h @@ -25,6 +25,7 @@ class ChangefeedDescription; class DescribeTableResult; class ExplicitPartitions; class GlobalIndexSettings; +class VectorIndexSettings; class PartitioningSettings; class DateTypeColumnModeSettings; class TtlSettings; @@ -192,6 +193,45 @@ 
struct TGlobalIndexSettings { void SerializeTo(Ydb::Table::GlobalIndexSettings& proto) const; }; +struct TVectorIndexSettings { +public: + enum class EDistance { + Cosine, + Manhattan, + Euclidean, + + Unknown = std::numeric_limits::max() + }; + + enum class ESimilarity { + Cosine, + InnerProduct, + + Unknown = std::numeric_limits::max() + }; + + enum class EVectorType { + Float, + Uint8, + Int8, + Bit, + + Unknown = std::numeric_limits::max() + }; + using TMetric = std::variant; + + TMetric Metric; + EVectorType VectorType; + ui32 VectorDimension; + + template + static TVectorIndexSettings FromProto(const TProto& proto); + + void SerializeTo(Ydb::Table::VectorIndexSettings& settings) const; + + void Out(IOutputStream &o) const; +}; + //! Represents index description class TIndexDescription { friend class NYdb::TProtoAccessor; @@ -202,20 +242,22 @@ class TIndexDescription { EIndexType type, const TVector& indexColumns, const TVector& dataColumns = {}, - const TGlobalIndexSettings& settings = {} + const TVector& globalIndexSettings = {}, + const std::optional& vectorIndexSettings = {} ); TIndexDescription( const TString& name, const TVector& indexColumns, const TVector& dataColumns = {}, - const TGlobalIndexSettings& settings = {} + const TVector& globalIndexSettings = {} ); const TString& GetIndexName() const; EIndexType GetIndexType() const; const TVector& GetIndexColumns() const; const TVector& GetDataColumns() const; + const std::optional& GetVectorIndexSettings() const; ui64 GetSizeBytes() const; void SerializeTo(Ydb::Table::TableIndex& proto) const; @@ -234,7 +276,8 @@ class TIndexDescription { EIndexType IndexType_; TVector IndexColumns_; TVector DataColumns_; - TGlobalIndexSettings GlobalIndexSettings_; + TVector GlobalIndexSettings_; + std::optional VectorIndexSettings_; ui64 SizeBytes = 0; }; @@ -608,6 +651,9 @@ class TTableDescription { // unique void AddUniqueSecondaryIndex(const TString& indexName, const TVector& indexColumns); void 
AddUniqueSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVector& dataColumns); + // vector KMeansTree + void AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVectorIndexSettings& vectorIndexSettings); + void AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVector& dataColumns, const TVectorIndexSettings& vectorIndexSettings); // default void AddSecondaryIndex(const TString& indexName, const TVector& indexColumns); @@ -827,6 +873,10 @@ class TTableBuilder { TTableBuilder& AddUniqueSecondaryIndex(const TString& indexName, const TVector& indexColumns); TTableBuilder& AddUniqueSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVector& dataColumns); + // vector KMeansTree + TTableBuilder& AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVectorIndexSettings& vectorIndexSettings); + TTableBuilder& AddVectorKMeansTreeSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVector& dataColumns, const TVectorIndexSettings& vectorIndexSettings); + // default TTableBuilder& AddSecondaryIndex(const TString& indexName, const TVector& indexColumns, const TVector& dataColumns); TTableBuilder& AddSecondaryIndex(const TString& indexName, const TVector& indexColumns); diff --git a/ydb/public/sdk/cpp/client/ydb_table/table_enum.h b/ydb/public/sdk/cpp/client/ydb_table/table_enum.h index 25b57b005b79..1660706f57a7 100644 --- a/ydb/public/sdk/cpp/client/ydb_table/table_enum.h +++ b/ydb/public/sdk/cpp/client/ydb_table/table_enum.h @@ -28,6 +28,7 @@ enum class EIndexType { GlobalSync, GlobalAsync, GlobalUnique, + GlobalVectorKMeansTree, Unknown = std::numeric_limits::max() };