From 1d72949cc1c9cc2fca1b740798c83ec857107745 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 24 Oct 2025 18:42:24 +0300 Subject: [PATCH 01/25] Rename files --- ydb/core/formats/arrow/arrow_batch_builder.cpp | 2 +- ydb/core/formats/arrow/arrow_helpers_minikql.cpp | 2 +- .../{kqp_result_set_arrow.cpp => kqp_formats_arrow.cpp} | 2 +- .../{kqp_result_set_arrow.h => kqp_formats_arrow.h} | 0 .../kqp/common/result_set_format/kqp_result_set_builders.cpp | 2 +- .../{kqp_result_set_arrow_ut.cpp => kqp_formats_arrow_ut.cpp} | 2 +- ydb/core/kqp/common/result_set_format/ut/ya.make | 2 +- ydb/core/kqp/common/result_set_format/ya.make | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename ydb/core/kqp/common/result_set_format/{kqp_result_set_arrow.cpp => kqp_formats_arrow.cpp} (99%) rename ydb/core/kqp/common/result_set_format/{kqp_result_set_arrow.h => kqp_formats_arrow.h} (100%) rename ydb/core/kqp/common/result_set_format/ut/{kqp_result_set_arrow_ut.cpp => kqp_formats_arrow_ut.cpp} (99%) diff --git a/ydb/core/formats/arrow/arrow_batch_builder.cpp b/ydb/core/formats/arrow/arrow_batch_builder.cpp index cdb7d79913f8..b0cc3cd4ceff 100644 --- a/ydb/core/formats/arrow/arrow_batch_builder.cpp +++ b/ydb/core/formats/arrow/arrow_batch_builder.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/ydb/core/formats/arrow/arrow_helpers_minikql.cpp b/ydb/core/formats/arrow/arrow_helpers_minikql.cpp index 6d846347e599..351a47cfd8e6 100644 --- a/ydb/core/formats/arrow/arrow_helpers_minikql.cpp +++ b/ydb/core/formats/arrow/arrow_helpers_minikql.cpp @@ -1,6 +1,6 @@ #include "arrow_helpers_minikql.h" -#include +#include #include namespace NKikimr::NArrow { diff --git a/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp similarity index 99% rename from ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.cpp rename to ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index 51db4942a44a..d92ee90ff8fd 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -1,4 +1,4 @@ -#include "kqp_result_set_arrow.h" +#include "kqp_formats_arrow.h" #include #include diff --git a/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h similarity index 100% rename from ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.h rename to ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h diff --git a/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp b/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp index e67360a0f4e6..8dd8edf4b2c5 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_result_set_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp similarity index 99% rename from ydb/core/kqp/common/result_set_format/ut/kqp_result_set_arrow_ut.cpp rename to ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 2339e1fdd98e..16cc83a8eb83 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_result_set_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -2,7 +2,7 @@ #include -#include +#include #include #include diff --git a/ydb/core/kqp/common/result_set_format/ut/ya.make b/ydb/core/kqp/common/result_set_format/ut/ya.make index e53b7c288dd1..34fff6a3c22f 100644 --- a/ydb/core/kqp/common/result_set_format/ut/ya.make +++ b/ydb/core/kqp/common/result_set_format/ut/ya.make @@ -5,7 +5,7 @@ FORK_SUBTESTS() SIZE(MEDIUM) SRCS( - kqp_result_set_arrow_ut.cpp + kqp_formats_arrow_ut.cpp ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/kqp/common/result_set_format/ya.make b/ydb/core/kqp/common/result_set_format/ya.make index 4c390964fe9d..6fc718eb9b67 100644 --- a/ydb/core/kqp/common/result_set_format/ya.make +++ b/ydb/core/kqp/common/result_set_format/ya.make @@ -1,7 +1,7 @@ LIBRARY() SRCS( - kqp_result_set_arrow.cpp + kqp_formats_arrow.cpp kqp_result_set_builders.cpp ) From f52671ae806e71da1610537abe4727cca75d0552 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Sat, 25 Oct 2025 00:09:21 +0300 Subject: [PATCH 02/25] Global refactor for test functions --- .../result_set_format/kqp_formats_arrow.cpp | 607 +++--------------- .../result_set_format/kqp_formats_arrow.h | 162 +++-- .../ut/kqp_formats_arrow_ut.cpp | 100 ++- .../ut/kqp_formats_ut_helpers.cpp | 413 ++++++++++++ .../ut/kqp_formats_ut_helpers.h | 54 ++ .../kqp/common/result_set_format/ut/ya.make | 1 + 6 files changed, 697 insertions(+), 640 deletions(-) create mode 100644 ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp create mode 100644 ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index d92ee90ff8fd..4af78e195c2c 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -1,7 +1,9 @@ #include "kqp_formats_arrow.h" #include -#include + +#include +#include #include #include #include @@ -11,107 +13,6 @@ namespace NKikimr::NKqp::NFormats { namespace { -template -struct TTypeWrapper { - using T = TArrowType; -}; - -/** - * @brief Function to switch MiniKQL DataType correctly and uniformly converting - * it to arrow type using callback - * - * @tparam TFunc Callback type - * @param typeId Type callback work with. - * @param callback Template function of signature (TTypeWrapper) -> bool - * @return Result of execution of callback or false if the type typeId is not - * supported. - */ -template -bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot type, TFunc &&callback) { - switch (type) { - case NUdf::EDataSlot::Int8: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Uint8: - case NUdf::EDataSlot::Bool: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Int16: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Date: - case NUdf::EDataSlot::Uint16: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Int32: - case NUdf::EDataSlot::Date32: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Datetime: - case NUdf::EDataSlot::Uint32: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Int64: - case NUdf::EDataSlot::Interval: - case NUdf::EDataSlot::Datetime64: - case NUdf::EDataSlot::Timestamp64: - case NUdf::EDataSlot::Interval64: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Uint64: - case NUdf::EDataSlot::Timestamp: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Float: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Double: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Utf8: - case NUdf::EDataSlot::Json: - case NUdf::EDataSlot::DyNumber: - case NUdf::EDataSlot::JsonDocument: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::String: - case NUdf::EDataSlot::Yson: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Decimal: - case NUdf::EDataSlot::Uuid: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::TzDate: - case NUdf::EDataSlot::TzDatetime: - case NUdf::EDataSlot::TzTimestamp: - case NUdf::EDataSlot::TzDate32: - case NUdf::EDataSlot::TzDatetime64: - case NUdf::EDataSlot::TzTimestamp64: - return callback(TTypeWrapper()); - } -} - -bool NeedWrapByExternalOptional(const NMiniKQL::TType* type) { - switch (type->GetKind()) { - case NMiniKQL::TType::EKind::Void: - case NMiniKQL::TType::EKind::Null: - case NMiniKQL::TType::EKind::Variant: - case NMiniKQL::TType::EKind::Optional: - case NMiniKQL::TType::EKind::EmptyList: - case NMiniKQL::TType::EKind::EmptyDict: - return true; - case NMiniKQL::TType::EKind::Data: - case NMiniKQL::TType::EKind::Struct: - case NMiniKQL::TType::EKind::Tuple: - case NMiniKQL::TType::EKind::List: - case NMiniKQL::TType::EKind::Dict: - case NMiniKQL::TType::EKind::Tagged: - return false; - case NMiniKQL::TType::EKind::Type: - case NMiniKQL::TType::EKind::Stream: - case NMiniKQL::TType::EKind::Callable: - case NMiniKQL::TType::EKind::Any: - case NMiniKQL::TType::EKind::Resource: - case NMiniKQL::TType::EKind::Flow: - case NMiniKQL::TType::EKind::ReservedKind: - case NMiniKQL::TType::EKind::Block: - case NMiniKQL::TType::EKind::Pg: - case NMiniKQL::TType::EKind::Multi: - case NMiniKQL::TType::EKind::Linear: - YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); - return false; - } - return false; -} - template std::shared_ptr CreateEmptyArrowImpl(NUdf::EDataSlot slot) { Y_UNUSED(slot); @@ -212,7 +113,7 @@ std::shared_ptr GetArrowType(const NMiniKQL::TDictType* dictTyp auto keyArrowType = NFormats::GetArrowType(keyType); auto payloadArrowType = NFormats::GetArrowType(payloadType); - auto custom =std::make_shared("custom", arrow::uint64(), false); + auto custom = std::make_shared("custom", arrow::uint64(), false); if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { std::vector> items; @@ -240,7 +141,6 @@ std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* vari tupleType = static_cast(innerType); } - // Create Union of unions if there are more types then arrow::dense_union supports. if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { ui32 numberOfGroups = (variantType->GetAlternativesCount() - 1) / arrow::UnionType::kMaxTypeCode + 1; types.reserve(numberOfGroups); @@ -271,7 +171,6 @@ std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* vari return arrow::dense_union(types); } - // Else put all types in one arrow::dense_union types.reserve(variantType->GetAlternativesCount()); for (ui32 index = 0; index < variantType->GetAlternativesCount(); ++index) { auto itemName = (structType == nullptr) @@ -476,14 +375,22 @@ void AppendDataValue(arrow::ArrayBuilder* builder, N if (!value.HasValue()) { status = typedBuilder->AppendNull(); } else { - if (dataSlot == NUdf::EDataSlot::Uuid) { - auto data = value.AsStringRef(); - status = typedBuilder->Append(data.Data()); - } else if (dataSlot == NUdf::EDataSlot::Decimal) { - auto intVal = value.GetInt128(); - status = typedBuilder->Append(reinterpret_cast(&intVal)); - } else { - YQL_ENSURE(false, "Unexpected data slot"); + switch (dataSlot) { + case NUdf::EDataSlot::Uuid: { + auto data = value.AsStringRef(); + status = typedBuilder->Append(data.Data()); + break; + } + + case NUdf::EDataSlot::Decimal: { + auto intVal = value.GetInt128(); + status = typedBuilder->Append(reinterpret_cast(&intVal)); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); + } } } YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); @@ -491,6 +398,43 @@ void AppendDataValue(arrow::ArrayBuilder* builder, N } // namespace +bool NeedWrapByExternalOptional(const NMiniKQL::TType* type) { + switch (type->GetKind()) { + case NMiniKQL::TType::EKind::Void: + case NMiniKQL::TType::EKind::Null: + case NMiniKQL::TType::EKind::Variant: + case NMiniKQL::TType::EKind::Optional: + case NMiniKQL::TType::EKind::EmptyList: + case NMiniKQL::TType::EKind::EmptyDict: { + return true; + } + + case NMiniKQL::TType::EKind::Data: + case NMiniKQL::TType::EKind::Struct: + case NMiniKQL::TType::EKind::Tuple: + case NMiniKQL::TType::EKind::List: + case NMiniKQL::TType::EKind::Dict: + case NMiniKQL::TType::EKind::Tagged: { + return false; + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { + YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); + } + } + return false; +} + std::shared_ptr GetArrowType(const NMiniKQL::TType* type) { switch (type->GetKind()) { case NMiniKQL::TType::EKind::Null: { @@ -538,7 +482,22 @@ std::shared_ptr GetArrowType(const NMiniKQL::TType* type) { return GetArrowType(variantType); } - default: { + case NMiniKQL::TType::EKind::Tagged: { + auto taggedType = static_cast(type); + return GetArrowType(taggedType->GetBaseType()); + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); } } @@ -827,8 +786,6 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons } case NMiniKQL::TType::EKind::Variant: { - // TODO Need to properly convert variants containing more than 127*127 - // types? auto variantType = static_cast(type); YQL_ENSURE(builder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); @@ -872,415 +829,25 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons break; } - default: { - YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); - } - } -} - -namespace NTestUtils { - -namespace { - -template -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - using TArrayType = typename arrow::TypeTraits::ArrayType; - auto array = std::static_pointer_cast(column); - return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); -} - -template <> // For darwin build -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - auto array = std::static_pointer_cast(column); - return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); -} - -template <> // For darwin build -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - auto array = std::static_pointer_cast(column); - return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - auto array = std::static_pointer_cast(column); - YQL_ENSURE(array->num_fields() == 2, "StructArray of some TzDate type should have 2 fields"); - - auto datetimeArray = array->field(0); - auto timezoneArray = std::static_pointer_cast(array->field(1)); - - NUdf::TUnboxedValuePod value; - auto typeId = datetimeArray->type_id(); - - switch (dataSlot) { - case NUdf::EDataSlot::TzDate: { - YQL_ENSURE(typeId == arrow::Type::UINT16); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzDatetime: { - YQL_ENSURE(typeId == arrow::Type::UINT32); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzTimestamp: { - YQL_ENSURE(typeId == arrow::Type::UINT64); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzDate32: { - YQL_ENSURE(typeId == arrow::Type::INT32); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzDatetime64: - case NUdf::EDataSlot::TzTimestamp64: { - YQL_ENSURE(typeId == arrow::Type::INT64); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - default: { - YQL_ENSURE(false, "Unexpected timezone datetime data type"); - return NUdf::TUnboxedValuePod(); - } - } - - auto view = timezoneArray->Value(row); - value.SetTimezoneId(NMiniKQL::GetTimezoneId(NUdf::TStringRef(view.data(), view.size()))); - return value; -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - auto array = std::static_pointer_cast(column); - auto data = array->GetView(row); - return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - auto array = std::static_pointer_cast(column); - auto data = array->GetView(row); - - switch (dataSlot) { - case NUdf::EDataSlot::Utf8: - case NUdf::EDataSlot::Json: { - return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); - } - - case NUdf::EDataSlot::JsonDocument: { - auto variant = NBinaryJson::SerializeToBinaryJson(TStringBuf(data.data(), data.size())); - if (std::holds_alternative(variant)) { - const auto& json = std::get(variant); - return NMiniKQL::MakeString(NUdf::TStringRef(json.Data(), json.Size())); - } - - YQL_ENSURE(false, "Cannot serialize to binary json"); - break; - } - - case NUdf::EDataSlot::DyNumber: { - auto number = NDyNumber::ParseDyNumberString(TStringBuf(data.data(), data.size())); - if (number.Defined()) { - return NMiniKQL::MakeString(*number); - } - - YQL_ENSURE(false, "Failed to convert string to DyNumber"); - break; - } - - default: { - YQL_ENSURE(false, "Unexpected data slot"); - } - } - return NUdf::TUnboxedValuePod(); -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - auto array = std::static_pointer_cast(column); - auto data = array->GetView(row); - - switch (dataSlot) { - case NUdf::EDataSlot::Uuid: { - return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); - } - - case NUdf::EDataSlot::Decimal: { - NYql::NDecimal::TInt128 value; - std::memcpy(&value, data.data(), data.size()); - return NUdf::TUnboxedValuePod(value); - } - - default: { - YQL_ENSURE(false, "Unexpected data slot"); - } - } - return NUdf::TUnboxedValuePod(); -} - -} // namespace - -std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type) { - auto arrayType = GetArrowType(type); - std::unique_ptr builder; - auto status = arrow::MakeBuilder(arrow::default_memory_pool(), arrayType, &builder); - YQL_ENSURE(status.ok(), "Failed to make arrow builder: " << status.ToString()); - return builder; -} - -std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType) { - auto builder = MakeArrowBuilder(itemType); - auto status = builder->Reserve(values.size()); - YQL_ENSURE(status.ok(), "Failed to reserve space for array: " << status.ToString()); - for (auto& value : values) { - AppendElement(value, builder.get(), itemType); - } - std::shared_ptr result; - status = builder->Finish(&result); - YQL_ENSURE(status.ok(), "Failed to finish array: " << status.ToString()); - return result; -} - -NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, const NMiniKQL::TType* itemType, - const NMiniKQL::THolderFactory& holderFactory) -{ - if (array->IsNull(row)) { - return NUdf::TUnboxedValuePod(); - } - - switch (itemType->GetKind()) { - case NMiniKQL::TType::EKind::Void: - case NMiniKQL::TType::EKind::Null: - case NMiniKQL::TType::EKind::EmptyList: - case NMiniKQL::TType::EKind::EmptyDict: { + case NMiniKQL::TType::EKind::Tagged: { + // TODO: Support Tagged type break; } - case NMiniKQL::TType::EKind::Data: { - auto dataType = static_cast(itemType); - NUdf::TUnboxedValue result; - auto dataSlot = *dataType->GetDataSlot().Get(); - bool success = SwitchMiniKQLDataTypeToArrowType(dataSlot, - [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); - result = GetUnboxedValue(array, row, dataSlot); - return true; - }); - YQL_ENSURE(success, "Failed to extract unboxed value from arrow array"); - return result; - } - - case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - YQL_ENSURE(static_cast(typedArray->num_fields()) == structType->GetMembersCount(), "Unexpected count of fields"); - - NUdf::TUnboxedValue* itemsPtr = nullptr; - auto result = holderFactory.CreateDirectArrayHolder(structType->GetMembersCount(), itemsPtr); - - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto memberType = structType->GetMemberType(index); - itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, memberType, holderFactory); - } - return result; - } - - case NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - YQL_ENSURE(static_cast(typedArray->num_fields()) == tupleType->GetElementsCount(), "Unexpected count of fields"); - - NUdf::TUnboxedValue* itemsPtr = nullptr; - auto result = holderFactory.CreateDirectArrayHolder(tupleType->GetElementsCount(), itemsPtr); - - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementType = tupleType->GetElementType(index); - itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, elementType, holderFactory); - } - return result; - } - - case NMiniKQL::TType::EKind::Optional: { - auto optionalType = static_cast(itemType); - auto innerOptionalType = optionalType->GetItemType(); - - if (NeedWrapByExternalOptional(innerOptionalType)) { - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - - auto innerArray = array; - auto innerType = itemType; - - NUdf::TUnboxedValue value; - int depth = 0; - - while (innerArray->type_id() == arrow::Type::STRUCT) { - auto structArray = static_pointer_cast(innerArray); - YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); - - if (structArray->IsNull(row)) { - value = NUdf::TUnboxedValuePod(); - break; - } - - innerType = static_cast(innerType)->GetItemType(); - innerArray = structArray->field(0); - ++depth; - } - - auto wrap = NeedWrapByExternalOptional(innerType); - if (wrap || !innerArray->IsNull(row)) { - value = ExtractUnboxedValue(innerArray, row, innerType, holderFactory); - if (wrap) { - --depth; - } - } - - for (int i = 0; i < depth; ++i) { - value = value.MakeOptional(); - } - return value; - } - - return ExtractUnboxedValue(array, row, innerOptionalType, holderFactory).Release().MakeOptional(); - } - - case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - - auto arraySlice = typedArray->value_slice(row); - auto itemType = listType->GetItemType(); - const auto len = arraySlice->length(); - - NUdf::TUnboxedValue* items = nullptr; - auto list = holderFactory.CreateDirectArrayHolder(len, items); - for (ui64 i = 0; i < static_cast(len); ++i) { - *items++ = ExtractUnboxedValue(arraySlice, i, itemType, holderFactory); - } - return list; - } - - case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(itemType); - - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - auto dictBuilder = holderFactory.NewDict(dictType, NUdf::TDictFlags::EDictKind::Hashed); - - std::shared_ptr keyArray = nullptr; - std::shared_ptr payloadArray = nullptr; - ui64 dictLength = 0; - ui64 offset = 0; - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto wrapArray = static_pointer_cast(array); - YQL_ENSURE(wrapArray->num_fields() == 2, "Unexpected count of fields"); - - auto dictSlice = wrapArray->field(0); - - if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { - YQL_ENSURE(dictSlice->type_id() == arrow::Type::LIST, "Unexpected array type"); - auto listArray = static_pointer_cast(dictSlice); - - auto arraySlice = listArray->value_slice(row); - YQL_ENSURE(arraySlice->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto structArray = static_pointer_cast(arraySlice); - YQL_ENSURE(structArray->num_fields() == 2, "Unexpected count of fields"); - - dictLength = arraySlice->length(); - keyArray = structArray->field(0); - payloadArray = structArray->field(1); - } else { - YQL_ENSURE(dictSlice->type_id() == arrow::Type::MAP, "Unexpected array type"); - auto mapArray = static_pointer_cast(dictSlice); - - dictLength = mapArray->value_length(row); - offset = mapArray->value_offset(row); - keyArray = mapArray->keys(); - payloadArray = mapArray->items(); - } - - for (ui64 i = offset; i < offset + static_cast(dictLength); ++i) { - auto key = ExtractUnboxedValue(keyArray, i, keyType, holderFactory); - auto payload = ExtractUnboxedValue(payloadArray, i, payloadType, holderFactory); - dictBuilder->Add(std::move(key), std::move(payload)); - } - return dictBuilder->Build(); - } - - case NMiniKQL::TType::EKind::Variant: { - // TODO Need to properly convert variants containing more than 127*127 - // types? - auto variantType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); - auto unionArray = static_pointer_cast(array); - - auto variantIndex = unionArray->child_id(row); - auto rowInChild = unionArray->value_offset(row); - std::shared_ptr valuesArray = unionArray->field(variantIndex); - - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - // Go one step deeper - YQL_ENSURE(valuesArray->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); - auto innerUnionArray = static_pointer_cast(valuesArray); - auto innerVariantIndex = innerUnionArray->child_id(rowInChild); - - rowInChild = innerUnionArray->value_offset(rowInChild); - valuesArray = innerUnionArray->field(innerVariantIndex); - variantIndex =variantIndex * arrow::UnionType::kMaxTypeCode + innerVariantIndex; - } - - NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - if (innerType->IsStruct()) { - innerType =static_cast(innerType)->GetMemberType(variantIndex); - } else { - YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - innerType = static_cast(innerType)->GetElementType(variantIndex); - } - - NUdf::TUnboxedValue value = ExtractUnboxedValue(valuesArray, rowInChild, innerType, holderFactory); - return holderFactory.CreateVariantHolder(value.Release(), variantIndex); - } - default: { - YQL_ENSURE(false, "Unsupported type: " << itemType->GetKindAsStr()); + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { + YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); } } - return NUdf::TUnboxedValuePod(); } -NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const std::shared_ptr& array, const NMiniKQL::TType* itemType, - const NMiniKQL::THolderFactory& holderFactory) -{ - NMiniKQL::TUnboxedValueVector values; - values.reserve(array->length()); - for (auto i = 0; i < array->length(); ++i) { - values.push_back(ExtractUnboxedValue(array, i, itemType, holderFactory)); - } - return values; -} - -} // namespace NTestUtils - } // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h index 0c3c62a0e2da..8e41d1cf9142 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h @@ -2,67 +2,93 @@ #include -#include #include namespace NKikimr::NKqp::NFormats { +namespace { + +template +struct TTypeWrapper { + using T = TArrowType; +}; + +} // namespace + /** - * @brief Convert TType to the arrow::DataType object - * - * The logic of this conversion is from YQL-15332: - * - * Void, Null => NullType - * Bool => Uint8 - * Integral => Uint8..Uint64, Int8..Int64 - * Floats => Float, Double - * Date => Uint16 - * Datetime => Uint32 - * Timestamp => Uint64 - * Interval => Int64 - * Date32 => Int32 - * Interval64, Timestamp64, Datetime64 => Int64 - * Utf8, Json => String - * String, Yson, JsonDocument => Binary - * Decimal, UUID => FixedSizeBinary(16) - * Timezone datetime type => StructArray - * DyNumber => BinaryArray - * - * Struct, Tuple, EmptyList, EmptyDict => StructArray - * Names of fields constructed from tuple are just empty strings. - * - * List => ListArray + * @brief Function to switch MiniKQL DataType correctly and uniformly converting + * it to arrow type using callback * - * Variant => DenseUnionArray - * If variant contains more than 127 items then we map - * Variant => DenseUnionArray - * TODO Implement convertion of data to DenseUnionArray and - * back - * - * Optional => StructArray if T is Variant - * Because DenseUnionArray does not have validity bitmap - * Optional => T for other types - * By default, other types have a validity bitmap - * - * Optional...>> => - * StructArray...>> For example: - * - Optional> => StructArray - * Int32 has validity bitmap, so we wrap it in StructArray N - 1 times, where - * N is the number of Optional levels - * - Optional>> => - * StructArray>> DenseUnionArray does - * not have validity bitmap, so we wrap it in StructArray N times, where N is - * the number of Optional levels - * - * Dict => StructArray, - * Uint64Array (on demand, default: 0)> We do not use arrow::DictArray because - * it must be used for encoding not for mapping keys to values. - * (https://arrow.apache.org/docs/cpp/api/array.html#classarrow_1_1_dictionary_array) - * If the type of dict key is optional then we map - * Dict, ValueType> => - * StructArray, Uint64Array (on - * demand, default: 0)> because keys of MapArray can not be nullable + * @tparam TFunc Callback type + * @param typeId Type callback work with. + * @param callback Template function of signature (TTypeWrapper) -> bool + * @return Result of execution of callback or false if the type typeId is not + * supported. + */ +template +bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot typeId, TFunc&& callback) { + switch (typeId) { + case NUdf::EDataSlot::Int8: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Uint8: + case NUdf::EDataSlot::Bool: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Int16: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Date: + case NUdf::EDataSlot::Uint16: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Int32: + case NUdf::EDataSlot::Date32: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Datetime: + case NUdf::EDataSlot::Uint32: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Int64: + case NUdf::EDataSlot::Interval: + case NUdf::EDataSlot::Datetime64: + case NUdf::EDataSlot::Timestamp64: + case NUdf::EDataSlot::Interval64: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Uint64: + case NUdf::EDataSlot::Timestamp: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Float: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Double: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Utf8: + case NUdf::EDataSlot::Json: + case NUdf::EDataSlot::DyNumber: + case NUdf::EDataSlot::JsonDocument: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::String: + case NUdf::EDataSlot::Yson: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::Decimal: + case NUdf::EDataSlot::Uuid: + return callback(TTypeWrapper()); + case NUdf::EDataSlot::TzDate: + case NUdf::EDataSlot::TzDatetime: + case NUdf::EDataSlot::TzTimestamp: + case NUdf::EDataSlot::TzDate32: + case NUdf::EDataSlot::TzDatetime64: + case NUdf::EDataSlot::TzTimestamp64: + return callback(TTypeWrapper()); + } +} + +/** + * @brief Check if the type needs to be wrapped by external optional. + * For example, some types does not have validity bitmap. * + * @param type Yql type to check + * @return true if the type needs to be wrapped by external optional, false otherwise + */ +bool NeedWrapByExternalOptional(const NMiniKQL::TType* type); + +/** + * @brief Convert TType to the arrow::DataType object * * @param type Yql type to parse * @return std::shared_ptr arrow type of the same structure as @@ -70,22 +96,22 @@ namespace NKikimr::NKqp::NFormats { */ std::shared_ptr GetArrowType(const NMiniKQL::TType* type); +/** + * @brief Check if the type can be converted to arrow type. + * + * @param type Yql type to check + * @return true if the type is compatible with arrow, false otherwise + */ bool IsArrowCompatible(const NMiniKQL::TType* type); +/** + * @brief Append UnboxedValue to arrow Array via arrow Builder. + * This function is used in TArrowBatchBuilder. + * + * @param value value to append + * @param builder arrow Builder with proper type used to append converted value array + * @param type Yql type of the element + */ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type); -namespace NTestUtils { - -std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type); - -std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType); - -NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, - const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); - -NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const std::shared_ptr& array, - const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); - -} // namespace NTestUtils - } // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 16cc83a8eb83..1cf4128a073a 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -2,14 +2,11 @@ #include +#include #include - -#include -#include #include - #include -#include + #include #include #include @@ -20,7 +17,6 @@ #include using namespace NKikimr::NMiniKQL; -using namespace NKikimr::NArrow; using namespace NYql; inline static constexpr size_t TEST_ARRAY_SIZE = 1 << 16; @@ -782,7 +778,7 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { values.emplace_back(GetValueOfBasicType(type, i)); } - auto array = NTestUtils::MakeArray(values, type); + auto array = MakeArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -809,7 +805,7 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { auto val = NBinaryJson::SerializeToJson(values[i].AsStringRef()); UNIT_ASSERT(static_cast(typedArray->Value(i)) == val); } else { - auto value = NTestUtils::ExtractUnboxedValue(array, i, type, context.HolderFactory); + auto value = ExtractUnboxedValue(array, i, type, context.HolderFactory); AssertUnboxedValuesAreEqual(value, values[i], type); } } else { @@ -843,7 +839,7 @@ void TestFixedSizeBinaryDataTypeConversion() { values.emplace_back(GetValueOfBasicType(type, i)); } - auto array = NTestUtils::MakeArray(values, type); + auto array = MakeArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -882,7 +878,7 @@ void TestSingularTypeConversion() { values.emplace_back(); } - auto array = NTestUtils::MakeArray(values, type); + auto array = MakeArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT(array->length() == static_cast(TEST_ARRAY_SIZE)); @@ -1062,7 +1058,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { UNIT_ASSERT(NFormats::IsArrowCompatible(structType)); auto values = context.CreateStructs(100); - auto array = NFormats::NTestUtils::MakeArray(values, structType); + auto array = NFormats::MakeArray(values, structType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -1103,7 +1099,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { UNIT_ASSERT(NFormats::IsArrowCompatible(tupleType)); auto values = context.CreateTuples(100); - auto array = NFormats::NTestUtils::MakeArray(values, tupleType); + auto array = NFormats::MakeArray(values, tupleType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -1143,7 +1139,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); auto values = context.CreateListOfJsons(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); + auto array = NFormats::MakeArray(values, listType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::LIST); @@ -1176,7 +1172,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); auto values = context.CreateOptionalListOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); + auto array = NFormats::MakeArray(values, listType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::LIST); @@ -1218,7 +1214,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); // auto values = context.CreateVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); + // auto array = NFormats::MakeArray(values, variantType); // UNIT_ASSERT(array->ValidateFull().ok()); // UNIT_ASSERT(static_cast(array->length()) == values.size()); // UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); @@ -1268,7 +1264,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); + // auto array = NFormats::MakeArray(values, variantType); // UNIT_ASSERT(array->ValidateFull().ok()); // UNIT_ASSERT(static_cast(array->length()) == values.size()); // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1331,7 +1327,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); + // auto array = NFormats::MakeArray(values, variantType); // UNIT_ASSERT(array->ValidateFull().ok()); // UNIT_ASSERT(static_cast(array->length()) == values.size()); // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1403,7 +1399,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); + auto array = NFormats::MakeArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); @@ -1461,7 +1457,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); + auto array = NFormats::MakeArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1531,7 +1527,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); auto values = context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); + auto array = NFormats::MakeArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1610,7 +1606,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); auto values = context.CreateDictUtf8ToInterval(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); + auto array = NFormats::MakeArray(values, dictType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1655,7 +1651,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); auto values = context.CreateDictOptionalToTuple(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); + auto array = NFormats::MakeArray(values, dictType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::STRUCT); @@ -1710,7 +1706,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalType)); auto values = context.CreateOptionalOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalType); + auto array = NFormats::MakeArray(values, doubleOptionalType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); @@ -1764,7 +1760,7 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); auto values = context.CreateLargeVariant(1000); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); + auto array = NFormats::MakeArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::DENSE_UNION); @@ -1786,8 +1782,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ auto structType = context.GetStructType(); auto values = context.CreateStructs(100); - auto array = NFormats::NTestUtils::MakeArray(values, structType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, structType, context.HolderFactory); + auto array = NFormats::MakeArray(values, structType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, structType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], structType); @@ -1801,8 +1797,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(tupleType)); auto values = context.CreateTuples(100); - auto array = NFormats::NTestUtils::MakeArray(values, tupleType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, tupleType, context.HolderFactory); + auto array = NFormats::MakeArray(values, tupleType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, tupleType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], tupleType); @@ -1816,8 +1812,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); auto values = context.CreateDictUtf8ToInterval(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, dictType, context.HolderFactory); + auto array = NFormats::MakeArray(values, dictType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, dictType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); @@ -1831,8 +1827,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); auto values = context.CreateListOfJsons(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, listType, context.HolderFactory); + auto array = NFormats::MakeArray(values, listType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, listType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); @@ -1846,8 +1842,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); auto values = context.CreateOptionalListOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, listType, context.HolderFactory); + auto array = NFormats::MakeArray(values, listType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, listType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); @@ -1861,8 +1857,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); // auto values = context.CreateVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); - // auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, variantType, context.HolderFactory); + // auto array = NFormats::MakeArray(values, variantType); + // auto restoredValues = NFormats::ExtractUnboxedValues(array, variantType, context.HolderFactory); // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); // for (ui64 index = 0; index < values.size(); ++index) { // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); @@ -1876,8 +1872,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ // UNIT_ASSERT(NFormats::IsArrowCompatible(optionalVariantType)); // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, optionalVariantType); - // auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); + // auto array = NFormats::MakeArray(values, optionalVariantType); + // auto restoredValues = NFormats::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); // for (ui64 index = 0; index < values.size(); ++index) { // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); @@ -1891,8 +1887,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ // UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalVariantType)); // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalVariantType); - // auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); + // auto array = NFormats::MakeArray(values, doubleOptionalVariantType); + // auto restoredValues = NFormats::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); // for (ui64 index = 0; index < values.size(); ++index) { // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); @@ -1906,8 +1902,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, variantType, context.HolderFactory); + auto array = NFormats::MakeArray(values, variantType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, variantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); @@ -1921,8 +1917,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(optionalVariantType)); auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, optionalVariantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); + auto array = NFormats::MakeArray(values, optionalVariantType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); @@ -1936,8 +1932,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalVariantType)); auto values = context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalVariantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); + auto array = NFormats::MakeArray(values, doubleOptionalVariantType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); @@ -1951,8 +1947,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); auto values = context.CreateDictOptionalToTuple(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, dictType, context.HolderFactory); + auto array = NFormats::MakeArray(values, dictType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, dictType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); @@ -1966,8 +1962,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalType)); auto values = context.CreateOptionalOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, doubleOptionalType, context.HolderFactory); + auto array = NFormats::MakeArray(values, doubleOptionalType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, doubleOptionalType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalType); @@ -1981,8 +1977,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); auto values = context.CreateLargeVariant(1000); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, variantType, context.HolderFactory); + auto array = NFormats::MakeArray(values, variantType); + auto restoredValues = NFormats::ExtractUnboxedValues(array, variantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp new file mode 100644 index 000000000000..7d09b3e9469a --- /dev/null +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -0,0 +1,413 @@ +#include "kqp_formats_ut_helpers.h" + +#include + +#include +#include +#include +#include + +namespace NKikimr::NKqp::NFormats { + +namespace { + +template +NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + using TArrayType = typename arrow::TypeTraits::ArrayType; + auto array = std::static_pointer_cast(column); + return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); +} + +template <> // For darwin build +NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + auto array = std::static_pointer_cast(column); + return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); +} + +template <> // For darwin build +NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + auto array = std::static_pointer_cast(column); + return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); +} + +template <> +NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + auto array = std::static_pointer_cast(column); + YQL_ENSURE(array->num_fields() == 2, "StructArray of some TzDate type should have 2 fields"); + + auto datetimeArray = array->field(0); + auto timezoneArray = std::static_pointer_cast(array->field(1)); + + NUdf::TUnboxedValuePod value; + auto typeId = datetimeArray->type_id(); + + switch (dataSlot) { + case NUdf::EDataSlot::TzDate: { + YQL_ENSURE(typeId == arrow::Type::UINT16); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzDatetime: { + YQL_ENSURE(typeId == arrow::Type::UINT32); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzTimestamp: { + YQL_ENSURE(typeId == arrow::Type::UINT64); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzDate32: { + YQL_ENSURE(typeId == arrow::Type::INT32); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzDatetime64: + case NUdf::EDataSlot::TzTimestamp64: { + YQL_ENSURE(typeId == arrow::Type::INT64); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected timezone datetime data type"); + return NUdf::TUnboxedValuePod(); + } + } + + auto view = timezoneArray->Value(row); + value.SetTimezoneId(NMiniKQL::GetTimezoneId(NUdf::TStringRef(view.data(), view.size()))); + return value; +} + +template <> +NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + auto array = std::static_pointer_cast(column); + auto data = array->GetView(row); + return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); +} + +template <> +NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + auto array = std::static_pointer_cast(column); + auto data = array->GetView(row); + + switch (dataSlot) { + case NUdf::EDataSlot::Utf8: + case NUdf::EDataSlot::Json: { + return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); + } + + case NUdf::EDataSlot::JsonDocument: { + auto variant = NBinaryJson::SerializeToBinaryJson(TStringBuf(data.data(), data.size())); + if (std::holds_alternative(variant)) { + const auto& json = std::get(variant); + return NMiniKQL::MakeString(NUdf::TStringRef(json.Data(), json.Size())); + } + + YQL_ENSURE(false, "Cannot serialize to binary json"); + break; + } + + case NUdf::EDataSlot::DyNumber: { + auto number = NDyNumber::ParseDyNumberString(TStringBuf(data.data(), data.size())); + if (number.Defined()) { + return NMiniKQL::MakeString(*number); + } + + YQL_ENSURE(false, "Failed to convert string to DyNumber"); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); + } + } + return NUdf::TUnboxedValuePod(); +} + +template <> +NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + auto array = std::static_pointer_cast(column); + auto data = array->GetView(row); + + switch (dataSlot) { + case NUdf::EDataSlot::Uuid: { + return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); + } + + case NUdf::EDataSlot::Decimal: { + NYql::NDecimal::TInt128 value; + std::memcpy(&value, data.data(), data.size()); + return NUdf::TUnboxedValuePod(value); + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); + } + } + return NUdf::TUnboxedValuePod(); +} + +} // namespace + +std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type) { + auto arrayType = GetArrowType(type); + std::unique_ptr builder; + auto status = arrow::MakeBuilder(arrow::default_memory_pool(), arrayType, &builder); + YQL_ENSURE(status.ok(), "Failed to make arrow builder: " << status.ToString()); + return builder; +} + +std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType) { + auto builder = MakeArrowBuilder(itemType); + auto status = builder->Reserve(values.size()); + YQL_ENSURE(status.ok(), "Failed to reserve space for array: " << status.ToString()); + for (auto& value : values) { + AppendElement(value, builder.get(), itemType); + } + std::shared_ptr result; + status = builder->Finish(&result); + YQL_ENSURE(status.ok(), "Failed to finish array: " << status.ToString()); + return result; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, const NMiniKQL::TType* itemType, + const NMiniKQL::THolderFactory& holderFactory) +{ + if (array->IsNull(row)) { + return NUdf::TUnboxedValuePod(); + } + + switch (itemType->GetKind()) { + case NMiniKQL::TType::EKind::Void: + case NMiniKQL::TType::EKind::Null: + case NMiniKQL::TType::EKind::EmptyList: + case NMiniKQL::TType::EKind::EmptyDict: { + break; + } + + case NMiniKQL::TType::EKind::Data: { + auto dataType = static_cast(itemType); + NUdf::TUnboxedValue result; + auto dataSlot = *dataType->GetDataSlot().Get(); + bool success = SwitchMiniKQLDataTypeToArrowType(dataSlot, + [&](TTypeWrapper typeHolder) { + Y_UNUSED(typeHolder); + result = GetUnboxedValue(array, row, dataSlot); + return true; + }); + YQL_ENSURE(success, "Failed to extract unboxed value from arrow array"); + return result; + } + + case NMiniKQL::TType::EKind::Struct: { + auto structType = static_cast(itemType); + + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + YQL_ENSURE(static_cast(typedArray->num_fields()) == structType->GetMembersCount(), "Unexpected count of fields"); + + NUdf::TUnboxedValue* itemsPtr = nullptr; + auto result = holderFactory.CreateDirectArrayHolder(structType->GetMembersCount(), itemsPtr); + + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto memberType = structType->GetMemberType(index); + itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, memberType, holderFactory); + } + return result; + } + + case NMiniKQL::TType::EKind::Tuple: { + auto tupleType = static_cast(itemType); + + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + YQL_ENSURE(static_cast(typedArray->num_fields()) == tupleType->GetElementsCount(), "Unexpected count of fields"); + + NUdf::TUnboxedValue* itemsPtr = nullptr; + auto result = holderFactory.CreateDirectArrayHolder(tupleType->GetElementsCount(), itemsPtr); + + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto elementType = tupleType->GetElementType(index); + itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, elementType, holderFactory); + } + return result; + } + + case NMiniKQL::TType::EKind::Optional: { + auto optionalType = static_cast(itemType); + auto innerOptionalType = optionalType->GetItemType(); + + if (NeedWrapByExternalOptional(innerOptionalType)) { + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + + auto innerArray = array; + auto innerType = itemType; + + NUdf::TUnboxedValue value; + int depth = 0; + + while (innerArray->type_id() == arrow::Type::STRUCT) { + auto structArray = static_pointer_cast(innerArray); + YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); + + if (structArray->IsNull(row)) { + value = NUdf::TUnboxedValuePod(); + break; + } + + innerType = static_cast(innerType)->GetItemType(); + innerArray = structArray->field(0); + ++depth; + } + + auto wrap = NeedWrapByExternalOptional(innerType); + if (wrap || !innerArray->IsNull(row)) { + value = ExtractUnboxedValue(innerArray, row, innerType, holderFactory); + if (wrap) { + --depth; + } + } + + for (int i = 0; i < depth; ++i) { + value = value.MakeOptional(); + } + return value; + } + + return ExtractUnboxedValue(array, row, innerOptionalType, holderFactory).Release().MakeOptional(); + } + + case NMiniKQL::TType::EKind::List: { + auto listType = static_cast(itemType); + + YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + + auto arraySlice = typedArray->value_slice(row); + auto itemType = listType->GetItemType(); + const auto len = arraySlice->length(); + + NUdf::TUnboxedValue* items = nullptr; + auto list = holderFactory.CreateDirectArrayHolder(len, items); + for (ui64 i = 0; i < static_cast(len); ++i) { + *items++ = ExtractUnboxedValue(arraySlice, i, itemType, holderFactory); + } + return list; + } + + case NMiniKQL::TType::EKind::Dict: { + auto dictType = static_cast(itemType); + + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + auto dictBuilder = holderFactory.NewDict(dictType, NUdf::TDictFlags::EDictKind::Hashed); + + std::shared_ptr keyArray = nullptr; + std::shared_ptr payloadArray = nullptr; + ui64 dictLength = 0; + ui64 offset = 0; + + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto wrapArray = static_pointer_cast(array); + YQL_ENSURE(wrapArray->num_fields() == 2, "Unexpected count of fields"); + + auto dictSlice = wrapArray->field(0); + + if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { + YQL_ENSURE(dictSlice->type_id() == arrow::Type::LIST, "Unexpected array type"); + auto listArray = static_pointer_cast(dictSlice); + + auto arraySlice = listArray->value_slice(row); + YQL_ENSURE(arraySlice->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto structArray = static_pointer_cast(arraySlice); + YQL_ENSURE(structArray->num_fields() == 2, "Unexpected count of fields"); + + dictLength = arraySlice->length(); + keyArray = structArray->field(0); + payloadArray = structArray->field(1); + } else { + YQL_ENSURE(dictSlice->type_id() == arrow::Type::MAP, "Unexpected array type"); + auto mapArray = static_pointer_cast(dictSlice); + + dictLength = mapArray->value_length(row); + offset = mapArray->value_offset(row); + keyArray = mapArray->keys(); + payloadArray = mapArray->items(); + } + + for (ui64 i = offset; i < offset + static_cast(dictLength); ++i) { + auto key = ExtractUnboxedValue(keyArray, i, keyType, holderFactory); + auto payload = ExtractUnboxedValue(payloadArray, i, payloadType, holderFactory); + dictBuilder->Add(std::move(key), std::move(payload)); + } + return dictBuilder->Build(); + } + + case NMiniKQL::TType::EKind::Variant: { + // TODO Need to properly convert variants containing more than 127*127 + // types? + auto variantType = static_cast(itemType); + + YQL_ENSURE(array->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); + auto unionArray = static_pointer_cast(array); + + auto variantIndex = unionArray->child_id(row); + auto rowInChild = unionArray->value_offset(row); + std::shared_ptr valuesArray = unionArray->field(variantIndex); + + if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { + // Go one step deeper + YQL_ENSURE(valuesArray->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); + auto innerUnionArray = static_pointer_cast(valuesArray); + auto innerVariantIndex = innerUnionArray->child_id(rowInChild); + + rowInChild = innerUnionArray->value_offset(rowInChild); + valuesArray = innerUnionArray->field(innerVariantIndex); + variantIndex =variantIndex * arrow::UnionType::kMaxTypeCode + innerVariantIndex; + } + + NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); + if (innerType->IsStruct()) { + innerType =static_cast(innerType)->GetMemberType(variantIndex); + } else { + YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + innerType = static_cast(innerType)->GetElementType(variantIndex); + } + + NUdf::TUnboxedValue value = ExtractUnboxedValue(valuesArray, rowInChild, innerType, holderFactory); + return holderFactory.CreateVariantHolder(value.Release(), variantIndex); + } + default: { + YQL_ENSURE(false, "Unsupported type: " << itemType->GetKindAsStr()); + } + } + return NUdf::TUnboxedValuePod(); +} + +NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const std::shared_ptr& array, const NMiniKQL::TType* itemType, + const NMiniKQL::THolderFactory& holderFactory) +{ + NMiniKQL::TUnboxedValueVector values; + values.reserve(array->length()); + for (auto i = 0; i < array->length(); ++i) { + values.push_back(ExtractUnboxedValue(array, i, itemType, holderFactory)); + } + return values; +} + +} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h new file mode 100644 index 000000000000..3a2221468418 --- /dev/null +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h @@ -0,0 +1,54 @@ +#pragma once + +#include + +#include +#include + +namespace NKikimr::NKqp::NFormats { + +/** + * @brief Make arrow array builder for given type. + * The type is converted to arrow type by NKqp::NFormats::GetArrowType function. + * + * @param type type to make builder for + * @return unique pointer to arrow array builder + */ +std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type); + +/** + * @brief Make arrow array for given values and type. + * The type is converted to arrow type by NKqp::NFormats::GetArrowType function. + * + * @param values values to make array for + * @param itemType type of each element to parse it and to construct corresponding arrow type + * @return shared pointer to arrow array + */ +std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType); + +/** + * @brief Extract unboxed value from arrow array for given row and type. + * The type of the item and the arrow array type must be the same by NKqp::NFormats::GetArrowType function. + * + * @param array arrow array to extract value from + * @param row row to extract value from + * @param itemType type of each element to parse it and to construct corresponding arrow type + * @param holderFactory holder factory to use + * @return unboxed value + */ +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); + +/** + * @brief Extract unboxed values from arrow array for given type. + * The type of items and the arrow array type must be the same by NKqp::NFormats::GetArrowType function. + * + * @param array arrow array to extract values from + * @param itemType type of each element to parse it and to construct corresponding arrow type + * @param holderFactory holder factory to use + * @return vector of unboxed values + */ +NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const std::shared_ptr& array, + const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); + +} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/ya.make b/ydb/core/kqp/common/result_set_format/ut/ya.make index 34fff6a3c22f..0af51ab0a206 100644 --- a/ydb/core/kqp/common/result_set_format/ut/ya.make +++ b/ydb/core/kqp/common/result_set_format/ut/ya.make @@ -5,6 +5,7 @@ FORK_SUBTESTS() SIZE(MEDIUM) SRCS( + kqp_formats_ut_helpers.cpp kqp_formats_arrow_ut.cpp ) From 9276ca1f214c29e74badc5ae3f3f9a73b5e90300 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 15:18:19 +0300 Subject: [PATCH 03/25] Update comments, rm TTypeWrapper --- .../result_set_format/kqp_formats_arrow.cpp | 6 +- .../result_set_format/kqp_formats_arrow.h | 148 ++++++++++++------ .../ut/kqp_formats_ut_helpers.cpp | 3 +- 3 files changed, 104 insertions(+), 53 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index 4af78e195c2c..154ebc1f4615 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -62,8 +62,7 @@ std::shared_ptr CreateEmptyArrowImpl(NUdf::E std::shared_ptr GetArrowType(const NMiniKQL::TDataType* dataType) { std::shared_ptr result; bool success = SwitchMiniKQLDataTypeToArrowType(*dataType->GetDataSlot().Get(), - [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); + [&]() { result = CreateEmptyArrowImpl(*dataType->GetDataSlot().Get()); return true; }); @@ -612,8 +611,7 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons case NMiniKQL::TType::EKind::Data: { auto dataType = static_cast(type); auto slot = *dataType->GetDataSlot().Get(); - bool success = SwitchMiniKQLDataTypeToArrowType( slot, [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); + bool success = SwitchMiniKQLDataTypeToArrowType(slot, [&]() { AppendDataValue(builder, value, slot); return true; }); diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h index 8e41d1cf9142..92a454acf443 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h @@ -4,113 +4,167 @@ #include -namespace NKikimr::NKqp::NFormats { - -namespace { - -template -struct TTypeWrapper { - using T = TArrowType; -}; +/** + * @file kqp_formats_arrow.h + * @brief Utilities for converting MiniKQL types to Apache Arrow types and vice versa. + * + * This module provides a comprehensive mapping between YQL internal type system (MiniKQL) + * and Apache Arrow format. It handles conversion of both simple data types + * (integers, strings, etc.) and complex types (structs, lists, optionals, etc.). + */ -} // namespace +namespace NKikimr::NKqp::NFormats { /** - * @brief Function to switch MiniKQL DataType correctly and uniformly converting - * it to arrow type using callback - * - * @tparam TFunc Callback type - * @param typeId Type callback work with. - * @param callback Template function of signature (TTypeWrapper) -> bool - * @return Result of execution of callback or false if the type typeId is not - * supported. + * @brief Dispatches MiniKQL data type to corresponding Arrow type via compile-time callback. + * + * This template function provides a type-safe way to map MiniKQL primitive data types + * to their Arrow counterparts. The callback receives the Arrow type as a template parameter, + * allowing for compile-time type dispatch without runtime overhead. + * + * Type mapping overview: + * - Integer types: Int8/16/32/64, UInt8/16/32/64 + * - Floating point: Float, Double + * - Temporal types: Date, Datetime, Timestamp, Interval (and their extended variants) + * - String types: Utf8, Json, JsonDocument (serialized to string), DyNumber (serialized to string) -> arrow::StringType + * - Binary types: String, Yson -> arrow::BinaryType + * - Fixed-size binary: Decimal, Uuid -> arrow::FixedSizeBinaryType + * - Timezone-aware: TzDate, TzDatetime, TzTimestamp -> arrow::StructType + * + * @tparam TFunc Callable type accepting a single template parameter (Arrow type) + * @param typeId The MiniKQL data slot to convert + * @param callback A callable object with signature: template bool operator()() + * @return true if the type is supported and callback executed successfully, false otherwise */ template bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot typeId, TFunc&& callback) { switch (typeId) { case NUdf::EDataSlot::Int8: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Uint8: case NUdf::EDataSlot::Bool: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Int16: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Date: case NUdf::EDataSlot::Uint16: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Int32: case NUdf::EDataSlot::Date32: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Datetime: case NUdf::EDataSlot::Uint32: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Int64: case NUdf::EDataSlot::Interval: case NUdf::EDataSlot::Datetime64: case NUdf::EDataSlot::Timestamp64: case NUdf::EDataSlot::Interval64: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Uint64: case NUdf::EDataSlot::Timestamp: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Float: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Double: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Utf8: case NUdf::EDataSlot::Json: case NUdf::EDataSlot::DyNumber: case NUdf::EDataSlot::JsonDocument: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::String: case NUdf::EDataSlot::Yson: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::Decimal: case NUdf::EDataSlot::Uuid: - return callback(TTypeWrapper()); + return callback.template operator()(); + case NUdf::EDataSlot::TzDate: case NUdf::EDataSlot::TzDatetime: case NUdf::EDataSlot::TzTimestamp: case NUdf::EDataSlot::TzDate32: case NUdf::EDataSlot::TzDatetime64: case NUdf::EDataSlot::TzTimestamp64: - return callback(TTypeWrapper()); + return callback.template operator()(); } + return false; } /** - * @brief Check if the type needs to be wrapped by external optional. - * For example, some types does not have validity bitmap. + * @brief Determines if a type requires wrapping in an external Optional layer. + * + * Some MiniKQL types don't have a native validity bitmap in Arrow representation + * (e.g., Variant, Null, Void). These types need to be wrapped in an additional + * struct layer when used as optional values to properly represent NULL states. * - * @param type Yql type to check - * @return true if the type needs to be wrapped by external optional, false otherwise + * @param type The MiniKQL type to check + * @return true if the type needs external Optional wrapping, false otherwise + * + * @note Types that need wrapping: Void, Null, Variant, Optional, EmptyList, EmptyDict */ bool NeedWrapByExternalOptional(const NMiniKQL::TType* type); /** - * @brief Convert TType to the arrow::DataType object + * @brief Converts a MiniKQL type to its corresponding Arrow DataType. + * + * This function recursively converts complex MiniKQL types (Struct, Tuple, List, Dict, + * Variant, Optional) to their Arrow equivalents. The conversion preserves the structure + * and nullability information. * - * @param type Yql type to parse - * @return std::shared_ptr arrow type of the same structure as - * type + * Conversion rules: + * - Data types: mapped according to SwitchMiniKQLDataTypeToArrowType + * - Struct/Tuple: converted to arrow::StructType + * - List: converted to arrow::ListType + * - Dict: converted to arrow::MapType or List (if key is Optional) + * - Variant: converted to arrow::DenseUnionType + * - Optional: nested optionals are flattened and represented via struct wrapping + * + * @param type The MiniKQL type to convert + * @return Shared pointer to corresponding Arrow DataType, or arrow::NullType if unsupported */ std::shared_ptr GetArrowType(const NMiniKQL::TType* type); /** - * @brief Check if the type can be converted to arrow type. + * @brief Checks if a MiniKQL type can be represented in Arrow format. + * + * Not all MiniKQL types are compatible with Arrow. For example, Callable, Stream, + * and Flow types cannot be represented. This function recursively checks complex + * types (Struct, List, etc.) to ensure all nested types are compatible. * - * @param type Yql type to check - * @return true if the type is compatible with arrow, false otherwise + * @param type The MiniKQL type to validate + * @return true if the type can be converted to Arrow format, false otherwise + * + * @note Compatible types: Data, Struct, Tuple, List, Dict, Variant, Optional, Tagged + * @note Incompatible types: Type, Stream, Callable, Any, Resource, Flow, Block, Pg, Multi, Linear */ bool IsArrowCompatible(const NMiniKQL::TType* type); /** - * @brief Append UnboxedValue to arrow Array via arrow Builder. - * This function is used in TArrowBatchBuilder. + * @brief Appends a MiniKQL UnboxedValue to an Arrow ArrayBuilder. + * + * This function is the core serialization routine for converting MiniKQL values + * to Arrow format. It handles all supported MiniKQL types, including + * complex nested structures, and properly manages NULL values. + * + * The builder must be pre-configured with the correct Arrow type matching the + * provided MiniKQL type. Type mismatches will result in assertion failures. * - * @param value value to append - * @param builder arrow Builder with proper type used to append converted value array - * @param type Yql type of the element + * @param value The MiniKQL value to append (may be NULL/empty) + * @param builder The Arrow builder to append to (must match the type) + * @param type The MiniKQL type descriptor for the value */ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type); diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index 7d09b3e9469a..3484efa23a00 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -205,8 +205,7 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr NUdf::TUnboxedValue result; auto dataSlot = *dataType->GetDataSlot().Get(); bool success = SwitchMiniKQLDataTypeToArrowType(dataSlot, - [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); + [&]() { result = GetUnboxedValue(array, row, dataSlot); return true; }); From 0459c1648a5ce2cd33f1f7dfb3f7766b5342a4aa Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 15:32:13 +0300 Subject: [PATCH 04/25] renames and fixes --- .../result_set_format/kqp_formats_arrow.cpp | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index 154ebc1f4615..5ba864cc699a 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -14,19 +14,19 @@ namespace NKikimr::NKqp::NFormats { namespace { template -std::shared_ptr CreateEmptyArrowImpl(NUdf::EDataSlot slot) { +std::shared_ptr BuildArrowType(NUdf::EDataSlot slot) { Y_UNUSED(slot); return std::make_shared(); } template <> -std::shared_ptr CreateEmptyArrowImpl(NUdf::EDataSlot slot) { +std::shared_ptr BuildArrowType(NUdf::EDataSlot slot) { Y_UNUSED(slot); return arrow::fixed_size_binary(NScheme::FSB_SIZE); } template <> -std::shared_ptr CreateEmptyArrowImpl(NUdf::EDataSlot slot) { +std::shared_ptr BuildArrowType(NUdf::EDataSlot slot) { std::shared_ptr type; switch (slot) { case NUdf::EDataSlot::TzDate: @@ -63,7 +63,7 @@ std::shared_ptr GetArrowType(const NMiniKQL::TDataType* dataTyp std::shared_ptr result; bool success = SwitchMiniKQLDataTypeToArrowType(*dataType->GetDataSlot().Get(), [&]() { - result = CreateEmptyArrowImpl(*dataType->GetDataSlot().Get()); + result = BuildArrowType(*dataType->GetDataSlot().Get()); return true; }); if (success) { @@ -301,7 +301,6 @@ void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnb YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); } -// Only for timezone datetime types template <> void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { Y_UNUSED(dataSlot); @@ -373,25 +372,28 @@ void AppendDataValue(arrow::ArrayBuilder* builder, N if (!value.HasValue()) { status = typedBuilder->AppendNull(); - } else { - switch (dataSlot) { - case NUdf::EDataSlot::Uuid: { - auto data = value.AsStringRef(); - status = typedBuilder->Append(data.Data()); - break; - } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); + return; + } - case NUdf::EDataSlot::Decimal: { - auto intVal = value.GetInt128(); - status = typedBuilder->Append(reinterpret_cast(&intVal)); - break; - } + switch (dataSlot) { + case NUdf::EDataSlot::Uuid: { + auto data = value.AsStringRef(); + status = typedBuilder->Append(data.Data()); + break; + } - default: { - YQL_ENSURE(false, "Unexpected data slot"); - } + case NUdf::EDataSlot::Decimal: { + auto intVal = value.GetInt128(); + status = typedBuilder->Append(reinterpret_cast(&intVal)); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); } } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); } From 5d3322c573af83d9a5a8b98fd67b1f574394824c Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 15:45:34 +0300 Subject: [PATCH 05/25] Tuple issues --- ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index 5ba864cc699a..9964b34c4fb2 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -89,11 +89,11 @@ std::shared_ptr GetArrowType(const NMiniKQL::TTupleType* tupleT std::vector> fields; fields.reserve(tupleType->GetElementsCount()); for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementName = std::string("field" + ToString(index)); + auto elementName = "field" + std::to_string(index); auto elementType = tupleType->GetElementType(index); auto elementArrowType = NFormats::GetArrowType(elementType); - fields.push_back(std::make_shared(elementName, elementArrowType, elementType->IsOptional())); + fields.emplace_back(std::make_shared(elementName, elementArrowType, elementType->IsOptional())); } return arrow::struct_(fields); } From 2754cb8d63701190eb78fd03f1224d4645ba2d07 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 16:52:30 +0300 Subject: [PATCH 06/25] renames --- .../common/result_set_format/ut/kqp_formats_ut_helpers.cpp | 4 ++-- .../kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index 3484efa23a00..f1cccbf1fa3b 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -172,7 +172,7 @@ std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* typ return builder; } -std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType) { +std::shared_ptr MakeArrowArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType) { auto builder = MakeArrowBuilder(itemType); auto status = builder->Reserve(values.size()); YQL_ENSURE(status.ok(), "Failed to reserve space for array: " << status.ToString()); @@ -398,7 +398,7 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr return NUdf::TUnboxedValuePod(); } -NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const std::shared_ptr& array, const NMiniKQL::TType* itemType, +NMiniKQL::TUnboxedValueVector ExtractUnboxedVector(const std::shared_ptr& array, const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory) { NMiniKQL::TUnboxedValueVector values; diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h index 3a2221468418..279a421aab47 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h @@ -24,7 +24,7 @@ std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* typ * @param itemType type of each element to parse it and to construct corresponding arrow type * @return shared pointer to arrow array */ -std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType); +std::shared_ptr MakeArrowArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType); /** * @brief Extract unboxed value from arrow array for given row and type. @@ -48,7 +48,7 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr * @param holderFactory holder factory to use * @return vector of unboxed values */ -NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const std::shared_ptr& array, +NMiniKQL::TUnboxedValueVector ExtractUnboxedVector(const std::shared_ptr& array, const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); } // namespace NKikimr::NKqp::NFormats From 9bdcb607db25200939ab8240d35ff076fd1ce14a Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 16:52:37 +0300 Subject: [PATCH 07/25] namespaces --- .../ut/kqp_formats_arrow_ut.cpp | 162 +++++++++--------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 1cf4128a073a..e215be4539f2 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -73,7 +73,7 @@ NUdf::TUnboxedValue GetValueOfBasicType(TType* type, ui64 value) { case NUdf::EDataSlot::Double: return NUdf::TUnboxedValuePod(static_cast(value) / 12345); case NUdf::EDataSlot::Decimal: { - auto decimal = NYql::NDecimal::FromString(TStringBuilder() << value << ".123", DECIMAL_PRECISION, DECIMAL_SCALE); + auto decimal = NDecimal::FromString(TStringBuilder() << value << ".123", DECIMAL_PRECISION, DECIMAL_SCALE); return NUdf::TUnboxedValuePod(decimal); } case NUdf::EDataSlot::DyNumber: { @@ -245,7 +245,7 @@ struct TTestContext { } TUnboxedValueVector CreateTuples(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; + TUnboxedValueVector values; for (ui32 value = 0; value < quantity; ++value) { NUdf::TUnboxedValue* items; auto tupleValue = Vb.NewArray(3, items); @@ -264,7 +264,7 @@ struct TTestContext { } TUnboxedValueVector CreateDictUtf8ToInterval(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; + TUnboxedValueVector values; auto dictType = GetDictUtf8ToIntervalType(); for (ui32 value = 0; value < quantity; ++value) { auto dictBuilder = Vb.NewDict(dictType, 0); @@ -447,7 +447,7 @@ struct TTestContext { } TUnboxedValueVector CreateVariantOverTupleWithOptionals(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; + TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { auto typeIndex = value % 5; NUdf::TUnboxedValue item; @@ -477,7 +477,7 @@ struct TTestContext { } TUnboxedValueVector CreateOptionalVariantOverTupleWithOptionals(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; + TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { if (value % 2 == 0) { @@ -513,7 +513,7 @@ struct TTestContext { } TUnboxedValueVector CreateDoubleOptionalVariantOverTupleWithOptionals(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; + TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { auto typeIndex = value % 5; NUdf::TUnboxedValue item; @@ -560,7 +560,7 @@ struct TTestContext { } TUnboxedValueVector CreateDictOptionalToTuple(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; + TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { auto dictBuilder = Vb.NewDict(GetDictOptionalToTupleType(), 0); for (ui64 i = 0; i < value * value; ++i) { @@ -778,7 +778,7 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { values.emplace_back(GetValueOfBasicType(type, i)); } - auto array = MakeArray(values, type); + auto array = MakeArrowArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -839,7 +839,7 @@ void TestFixedSizeBinaryDataTypeConversion() { values.emplace_back(GetValueOfBasicType(type, i)); } - auto array = MakeArray(values, type); + auto array = MakeArrowArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -852,10 +852,10 @@ void TestFixedSizeBinaryDataTypeConversion() { for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { auto view = typedArray->GetView(i); if constexpr (IsDecimalType) { - NYql::NDecimal::TInt128 actual; + NDecimal::TInt128 actual; std::memcpy(&actual, view.data(), view.size()); - NYql::NDecimal::TInt128 expected = values[i].GetInt128(); + NDecimal::TInt128 expected = values[i].GetInt128(); UNIT_ASSERT(actual == expected); } else { auto expected = values[i].AsStringRef(); @@ -878,7 +878,7 @@ void TestSingularTypeConversion() { values.emplace_back(); } - auto array = MakeArray(values, type); + auto array = MakeArrowArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT(array->length() == static_cast(TEST_ARRAY_SIZE)); @@ -1055,10 +1055,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { TTestContext context; auto structType = context.GetStructType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(structType)); + UNIT_ASSERT(IsArrowCompatible(structType)); auto values = context.CreateStructs(100); - auto array = NFormats::MakeArray(values, structType); + auto array = MakeArrowArray(values, structType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -1096,10 +1096,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { TTestContext context; auto tupleType = context.GetTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(tupleType)); + UNIT_ASSERT(IsArrowCompatible(tupleType)); auto values = context.CreateTuples(100); - auto array = NFormats::MakeArray(values, tupleType); + auto array = MakeArrowArray(values, tupleType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(array->length() == static_cast(values.size())); @@ -1136,10 +1136,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { TTestContext context; auto listType = context.GetListOfJsonsType(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); + Y_ABORT_UNLESS(IsArrowCompatible(listType)); auto values = context.CreateListOfJsons(100); - auto array = NFormats::MakeArray(values, listType); + auto array = MakeArrowArray(values, listType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::LIST); @@ -1169,10 +1169,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { TTestContext context; auto listType = context.GetOptionalListOfOptional(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); + Y_ABORT_UNLESS(IsArrowCompatible(listType)); auto values = context.CreateOptionalListOfOptional(100); - auto array = NFormats::MakeArray(values, listType); + auto array = MakeArrowArray(values, listType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::LIST); @@ -1211,10 +1211,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { // TTestContext context; // auto variantType = context.GetVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + // UNIT_ASSERT(IsArrowCompatible(variantType)); // auto values = context.CreateVariantOverStruct(100); - // auto array = NFormats::MakeArray(values, variantType); + // auto array = MakeArrowArray(values, variantType); // UNIT_ASSERT(array->ValidateFull().ok()); // UNIT_ASSERT(static_cast(array->length()) == values.size()); // UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); @@ -1261,10 +1261,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { // TTestContext context; // auto variantType = context.GetOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + // UNIT_ASSERT(IsArrowCompatible(variantType)); // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = NFormats::MakeArray(values, variantType); + // auto array = MakeArrowArray(values, variantType); // UNIT_ASSERT(array->ValidateFull().ok()); // UNIT_ASSERT(static_cast(array->length()) == values.size()); // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1324,10 +1324,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { // TTestContext context; // auto variantType = context.GetDoubleOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + // UNIT_ASSERT(IsArrowCompatible(variantType)); // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = NFormats::MakeArray(values, variantType); + // auto array = MakeArrowArray(values, variantType); // UNIT_ASSERT(array->ValidateFull().ok()); // UNIT_ASSERT(static_cast(array->length()) == values.size()); // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1396,10 +1396,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { TTestContext context; auto variantType = context.GetVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + UNIT_ASSERT(IsArrowCompatible(variantType)); auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = NFormats::MakeArray(values, variantType); + auto array = MakeArrowArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); @@ -1454,10 +1454,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { TTestContext context; auto variantType = context.GetOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + UNIT_ASSERT(IsArrowCompatible(variantType)); auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::MakeArray(values, variantType); + auto array = MakeArrowArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1524,10 +1524,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { TTestContext context; auto variantType = context.GetDoubleOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + UNIT_ASSERT(IsArrowCompatible(variantType)); auto values = context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::MakeArray(values, variantType); + auto array = MakeArrowArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(static_cast(array->length()) == values.size()); UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1603,10 +1603,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { TTestContext context; auto dictType = context.GetDictUtf8ToIntervalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); + UNIT_ASSERT(IsArrowCompatible(dictType)); auto values = context.CreateDictUtf8ToInterval(100); - auto array = NFormats::MakeArray(values, dictType); + auto array = MakeArrowArray(values, dictType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); @@ -1648,10 +1648,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { TTestContext context; auto dictType = context.GetDictOptionalToTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); + UNIT_ASSERT(IsArrowCompatible(dictType)); auto values = context.CreateDictOptionalToTuple(100); - auto array = NFormats::MakeArray(values, dictType); + auto array = MakeArrowArray(values, dictType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::STRUCT); @@ -1703,10 +1703,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { TTestContext context; auto doubleOptionalType = context.GetOptionalOfOptionalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalType)); + UNIT_ASSERT(IsArrowCompatible(doubleOptionalType)); auto values = context.CreateOptionalOfOptional(100); - auto array = NFormats::MakeArray(values, doubleOptionalType); + auto array = MakeArrowArray(values, doubleOptionalType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); @@ -1757,10 +1757,10 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { ui32 numberOfTypes = 500; auto variantType = context.GetLargeVariantType(numberOfTypes); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + UNIT_ASSERT(IsArrowCompatible(variantType)); auto values = context.CreateLargeVariant(1000); - auto array = NFormats::MakeArray(values, variantType); + auto array = MakeArrowArray(values, variantType); UNIT_ASSERT(array->ValidateFull().ok()); UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::DENSE_UNION); @@ -1782,8 +1782,8 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ auto structType = context.GetStructType(); auto values = context.CreateStructs(100); - auto array = NFormats::MakeArray(values, structType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, structType, context.HolderFactory); + auto array = MakeArrowArray(values, structType); + auto restoredValues = ExtractUnboxedVector(array, structType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], structType); @@ -1794,11 +1794,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto tupleType = context.GetTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(tupleType)); + UNIT_ASSERT(IsArrowCompatible(tupleType)); auto values = context.CreateTuples(100); - auto array = NFormats::MakeArray(values, tupleType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, tupleType, context.HolderFactory); + auto array = MakeArrowArray(values, tupleType); + auto restoredValues = ExtractUnboxedVector(array, tupleType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], tupleType); @@ -1809,11 +1809,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto dictType = context.GetDictUtf8ToIntervalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); + UNIT_ASSERT(IsArrowCompatible(dictType)); auto values = context.CreateDictUtf8ToInterval(100); - auto array = NFormats::MakeArray(values, dictType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, dictType, context.HolderFactory); + auto array = MakeArrowArray(values, dictType); + auto restoredValues = ExtractUnboxedVector(array, dictType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); @@ -1824,11 +1824,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto listType = context.GetListOfJsonsType(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); + Y_ABORT_UNLESS(IsArrowCompatible(listType)); auto values = context.CreateListOfJsons(100); - auto array = NFormats::MakeArray(values, listType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, listType, context.HolderFactory); + auto array = MakeArrowArray(values, listType); + auto restoredValues = ExtractUnboxedVector(array, listType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); @@ -1839,11 +1839,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto listType = context.GetOptionalListOfOptional(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); + Y_ABORT_UNLESS(IsArrowCompatible(listType)); auto values = context.CreateOptionalListOfOptional(100); - auto array = NFormats::MakeArray(values, listType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, listType, context.HolderFactory); + auto array = MakeArrowArray(values, listType); + auto restoredValues = ExtractUnboxedVector(array, listType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); @@ -1854,11 +1854,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ // TTestContext context; // auto variantType = context.GetVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + // UNIT_ASSERT(IsArrowCompatible(variantType)); // auto values = context.CreateVariantOverStruct(100); - // auto array = NFormats::MakeArray(values, variantType); - // auto restoredValues = NFormats::ExtractUnboxedValues(array, variantType, context.HolderFactory); + // auto array = MakeArrowArray(values, variantType); + // auto restoredValues = ExtractUnboxedVector(array, variantType, context.HolderFactory); // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); // for (ui64 index = 0; index < values.size(); ++index) { // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); @@ -1869,11 +1869,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ // TTestContext context; // auto optionalVariantType = context.GetOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(optionalVariantType)); + // UNIT_ASSERT(IsArrowCompatible(optionalVariantType)); // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = NFormats::MakeArray(values, optionalVariantType); - // auto restoredValues = NFormats::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); + // auto array = MakeArrowArray(values, optionalVariantType); + // auto restoredValues = ExtractUnboxedVector(array, optionalVariantType, context.HolderFactory); // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); // for (ui64 index = 0; index < values.size(); ++index) { // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); @@ -1884,11 +1884,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ // TTestContext context; // auto doubleOptionalVariantType = context.GetDoubleOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalVariantType)); + // UNIT_ASSERT(IsArrowCompatible(doubleOptionalVariantType)); // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = NFormats::MakeArray(values, doubleOptionalVariantType); - // auto restoredValues = NFormats::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); + // auto array = MakeArrowArray(values, doubleOptionalVariantType); + // auto restoredValues = ExtractUnboxedVector(array, doubleOptionalVariantType, context.HolderFactory); // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); // for (ui64 index = 0; index < values.size(); ++index) { // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); @@ -1899,11 +1899,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto variantType = context.GetVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + UNIT_ASSERT(IsArrowCompatible(variantType)); auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = NFormats::MakeArray(values, variantType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, variantType, context.HolderFactory); + auto array = MakeArrowArray(values, variantType); + auto restoredValues = ExtractUnboxedVector(array, variantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); @@ -1914,11 +1914,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto optionalVariantType = context.GetOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(optionalVariantType)); + UNIT_ASSERT(IsArrowCompatible(optionalVariantType)); auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::MakeArray(values, optionalVariantType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); + auto array = MakeArrowArray(values, optionalVariantType); + auto restoredValues = ExtractUnboxedVector(array, optionalVariantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); @@ -1929,11 +1929,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto doubleOptionalVariantType = context.GetDoubleOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalVariantType)); + UNIT_ASSERT(IsArrowCompatible(doubleOptionalVariantType)); auto values = context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::MakeArray(values, doubleOptionalVariantType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); + auto array = MakeArrowArray(values, doubleOptionalVariantType); + auto restoredValues = ExtractUnboxedVector(array, doubleOptionalVariantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); @@ -1944,11 +1944,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto dictType = context.GetDictOptionalToTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); + UNIT_ASSERT(IsArrowCompatible(dictType)); auto values = context.CreateDictOptionalToTuple(100); - auto array = NFormats::MakeArray(values, dictType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, dictType, context.HolderFactory); + auto array = MakeArrowArray(values, dictType); + auto restoredValues = ExtractUnboxedVector(array, dictType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); @@ -1959,11 +1959,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto doubleOptionalType = context.GetOptionalOfOptionalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalType)); + UNIT_ASSERT(IsArrowCompatible(doubleOptionalType)); auto values = context.CreateOptionalOfOptional(100); - auto array = NFormats::MakeArray(values, doubleOptionalType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, doubleOptionalType, context.HolderFactory); + auto array = MakeArrowArray(values, doubleOptionalType); + auto restoredValues = ExtractUnboxedVector(array, doubleOptionalType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalType); @@ -1974,11 +1974,11 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ TTestContext context; auto variantType = context.GetLargeVariantType(500); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); + UNIT_ASSERT(IsArrowCompatible(variantType)); auto values = context.CreateLargeVariant(1000); - auto array = NFormats::MakeArray(values, variantType); - auto restoredValues = NFormats::ExtractUnboxedValues(array, variantType, context.HolderFactory); + auto array = MakeArrowArray(values, variantType); + auto restoredValues = ExtractUnboxedVector(array, variantType, context.HolderFactory); UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); for (ui64 index = 0; index < values.size(); ++index) { AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); From 547248c07cb9605638437bd5d57cb4d05ef62d1b Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 17:35:26 +0300 Subject: [PATCH 08/25] Nested type list test --- .../ut/kqp_formats_arrow_ut.cpp | 118 ++++++++++-------- 1 file changed, 64 insertions(+), 54 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index e215be4539f2..19841ece98a2 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -280,21 +281,46 @@ struct TTestContext { return values; } - TType* GetListOfJsonsType() { - TType* itemType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); - return TListType::Create(itemType, TypeEnv); + TType* GetList(TType* itemType, bool optional = false) { + if (optional) { + itemType = TOptionalType::Create(itemType, TypeEnv); + } + auto listType = TListType::Create(itemType, TypeEnv); + if (optional) { + return TOptionalType::Create(listType, TypeEnv); + } + return listType; } - TUnboxedValueVector CreateListOfJsons(ui32 quantity) { + TUnboxedValueVector CreateList(ui32 quantity, TType* itemType, bool optional = false) { TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + if (optional && value % 2 == 0) { + values.emplace_back(NUdf::TUnboxedValuePod()); + continue; + } + TUnboxedValueVector items; items.reserve(value); for (ui64 i = 0; i < value; ++i) { - std::string json = TStringBuilder() << "{'item':" << i << "}"; - items.push_back(MakeString(NUdf::TStringRef(json.data(), json.size()))); + if (optional) { + if (i % 2 == 0) { + items.push_back(NUdf::TUnboxedValuePod()); + } else { + items.push_back(GetValueOfBasicType(itemType, i).MakeOptional()); + } + } else { + items.push_back(GetValueOfBasicType(itemType, i)); + } } + auto listValue = Vb.NewList(items.data(), value); + if (optional) { + listValue = std::move(listValue).MakeOptional(); + } + values.emplace_back(std::move(listValue)); } return values; @@ -1048,6 +1074,38 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { Y_UNIT_TEST(DataType_EmptyDict) { TestSingularTypeConversion(); } + + // Nested types + Y_UNIT_TEST_TWIN(NestedType_List, IsOptional) { + TTestContext context; + + for (auto itemType: context.BasicTypes) { + auto listType = context.GetList(itemType, IsOptional); + auto values = context.CreateList(100, itemType, IsOptional); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT(array->ValidateFull().ok()); + UNIT_ASSERT(array->length() == static_cast(values.size())); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + + if (IsOptional) { + auto listArray = static_pointer_cast(array); + for (size_t i = 0; i < values.size(); ++i) { + if (i % 2 == 0) { + UNIT_ASSERT(listArray->IsNull(i)); + continue; + } + + UNIT_ASSERT(!listArray->IsNull(i)); + auto slice = listArray->value_slice(i); + + for (size_t j = 0; j < static_cast(slice->length()); ++j) { + UNIT_ASSERT(j % 2 == 0 ? slice->IsNull(j) : !slice->IsNull(j)); + } + } + } + } + } } Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { @@ -1132,39 +1190,6 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { } } - Y_UNIT_TEST(ListOfJsons) { - TTestContext context; - - auto listType = context.GetListOfJsonsType(); - Y_ABORT_UNLESS(IsArrowCompatible(listType)); - - auto values = context.CreateListOfJsons(100); - auto array = MakeArrowArray(values, listType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::LIST); - auto listArray = static_pointer_cast(array); - - UNIT_ASSERT(listArray->num_fields() == 1); - UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRING); - auto jsonArray = static_pointer_cast(listArray->values()); - auto index = 0; - auto innerIndex = 0; - for (const auto& value: values) { - UNIT_ASSERT(value.GetListLength() == static_cast(listArray->value_length(index))); - const auto iter = value.GetListIterator(); - for (NUdf::TUnboxedValue item; iter.Next(item);) { - auto view = jsonArray->GetView(innerIndex); - std::string itemArrow(view.data(), view.size()); - auto stringRef = item.AsStringRef(); - std::string itemList(stringRef.Data(), stringRef.Size()); - UNIT_ASSERT(itemList == itemArrow); - ++innerIndex; - } - ++index; - } - } - Y_UNIT_TEST(OptionalListOfOptional) { TTestContext context; @@ -1820,21 +1845,6 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ } } - Y_UNIT_TEST(ListOfJsons) { - TTestContext context; - - auto listType = context.GetListOfJsonsType(); - Y_ABORT_UNLESS(IsArrowCompatible(listType)); - - auto values = context.CreateListOfJsons(100); - auto array = MakeArrowArray(values, listType); - auto restoredValues = ExtractUnboxedVector(array, listType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); - } - } - Y_UNIT_TEST(OptionalListOfOptional) { TTestContext context; From 22d812d9a915924bdf5d3ea354672dc20c25f8a5 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 18:19:23 +0300 Subject: [PATCH 09/25] Assert UVs by func --- .../ut/kqp_formats_arrow_ut.cpp | 253 +++++++++--------- 1 file changed, 126 insertions(+), 127 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 19841ece98a2..cce21e96648b 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -671,116 +671,126 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& case TType::EKind::Data: { auto dataType = static_cast(type); auto dataSlot = *dataType->GetDataSlot().Get(); - // Json-like type are not comparable so just skip them - if (dataSlot != NUdf::EDataSlot::Json && dataSlot != NUdf::EDataSlot::Yson && dataSlot != NUdf::EDataSlot::JsonDocument) { - UNIT_ASSERT(NUdf::EquateValues(dataSlot, left, right)); - } - break; - } - case TType::EKind::Optional: { - UNIT_ASSERT_EQUAL(left.HasValue(), right.HasValue()); - if (left.HasValue()) { - auto innerType = static_cast(type)->GetItemType(); - NUdf::TUnboxedValue leftInner = left.GetOptionalValue(); - NUdf::TUnboxedValue rightInner = right.GetOptionalValue(); - AssertUnboxedValuesAreEqual(leftInner, rightInner, innerType); - } - break; - } - - case TType::EKind::List: { - auto listType = static_cast(type); - auto itemType = listType->GetItemType(); - auto leftPtr = left.GetElements(); - auto rightPtr = right.GetElements(); - UNIT_ASSERT_EQUAL(leftPtr != nullptr, rightPtr != nullptr); - if (leftPtr != nullptr) { - auto leftLen = left.GetListLength(); - auto rightLen = right.GetListLength(); - UNIT_ASSERT_EQUAL(leftLen, rightLen); - while (leftLen > 0) { - NUdf::TUnboxedValue leftItem = *leftPtr++; - NUdf::TUnboxedValue rightItem = *rightPtr++; - AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); - --leftLen; - } - } else { - const auto leftIter = left.GetListIterator(); - const auto rightIter = right.GetListIterator(); - NUdf::TUnboxedValue leftItem; - NUdf::TUnboxedValue rightItem; - bool leftHasValue = leftIter.Next(leftItem); - bool rightHasValue = rightIter.Next(leftItem); - while (leftHasValue && rightHasValue) { - AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); - leftHasValue = leftIter.Next(leftItem); - rightHasValue = rightIter.Next(leftItem); + switch (dataSlot) { + case NUdf::EDataSlot::JsonDocument: + case NUdf::EDataSlot::Json: + case NUdf::EDataSlot::Yson: { + UNIT_ASSERT_VALUES_EQUAL(std::string(left.AsStringRef()), std::string(right.AsStringRef())); + break; } - UNIT_ASSERT_EQUAL(leftHasValue, rightHasValue); - } - break; - } - case TType::EKind::Struct: { - auto structType = static_cast(type); - UNIT_ASSERT_EQUAL(left.GetListLength(), structType->GetMembersCount()); - UNIT_ASSERT_EQUAL(right.GetListLength(), structType->GetMembersCount()); - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto memberType = structType->GetMemberType(index); - NUdf::TUnboxedValue leftMember = left.GetElement(index); - NUdf::TUnboxedValue rightMember = right.GetElement(index); - AssertUnboxedValuesAreEqual(leftMember, rightMember, memberType); - } - break; - } - - case TType::EKind::Tuple: { - auto tupleType = static_cast(type); - UNIT_ASSERT_EQUAL(left.GetListLength(), tupleType->GetElementsCount()); - UNIT_ASSERT_EQUAL(right.GetListLength(), tupleType->GetElementsCount()); - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementType = tupleType->GetElementType(index); - NUdf::TUnboxedValue leftMember = left.GetElement(index); - NUdf::TUnboxedValue rightMember = right.GetElement(index); - AssertUnboxedValuesAreEqual(leftMember, rightMember, elementType); - } - break; - } - - case TType::EKind::Dict: { - auto dictType = static_cast(type); - auto payloadType = dictType->GetPayloadType(); - - UNIT_ASSERT_EQUAL(left.GetDictLength(), right.GetDictLength()); - const auto leftIter = left.GetDictIterator(); - for (NUdf::TUnboxedValue key, leftPayload; leftIter.NextPair(key, leftPayload);) { - UNIT_ASSERT(right.Contains(key)); - NUdf::TUnboxedValue rightPayload = right.Lookup(key); - AssertUnboxedValuesAreEqual(leftPayload, rightPayload, payloadType); + default: { + UNIT_ASSERT(NUdf::EquateValues(dataSlot, left, right)); + } } break; } - case TType::EKind::Variant: { - auto variantType = static_cast(type); - UNIT_ASSERT_EQUAL(left.GetVariantIndex(), right.GetVariantIndex()); - ui32 variantIndex = left.GetVariantIndex(); - TType* innerType = variantType->GetUnderlyingType(); - if (innerType->IsStruct()) { - innerType = static_cast(innerType)->GetMemberType(variantIndex); - } else { - Y_VERIFY_S(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - innerType = static_cast(innerType)->GetElementType(variantIndex); - } - NUdf::TUnboxedValue leftValue = left.GetVariantItem(); - NUdf::TUnboxedValue rightValue = right.GetVariantItem(); - AssertUnboxedValuesAreEqual(leftValue, rightValue, innerType); - break; + // case TType::EKind::Optional: { + // UNIT_ASSERT_EQUAL(left.HasValue(), right.HasValue()); + // if (left.HasValue()) { + // auto innerType = static_cast(type)->GetItemType(); + // NUdf::TUnboxedValue leftInner = left.GetOptionalValue(); + // NUdf::TUnboxedValue rightInner = right.GetOptionalValue(); + // AssertUnboxedValuesAreEqual(leftInner, rightInner, innerType); + // } + // break; + // } + + // case TType::EKind::List: { + // auto listType = static_cast(type); + // auto itemType = listType->GetItemType(); + // auto leftPtr = left.GetElements(); + // auto rightPtr = right.GetElements(); + // UNIT_ASSERT_EQUAL(leftPtr != nullptr, rightPtr != nullptr); + // if (leftPtr != nullptr) { + // auto leftLen = left.GetListLength(); + // auto rightLen = right.GetListLength(); + // UNIT_ASSERT_EQUAL(leftLen, rightLen); + // while (leftLen > 0) { + // NUdf::TUnboxedValue leftItem = *leftPtr++; + // NUdf::TUnboxedValue rightItem = *rightPtr++; + // AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); + // --leftLen; + // } + // } else { + // const auto leftIter = left.GetListIterator(); + // const auto rightIter = right.GetListIterator(); + // NUdf::TUnboxedValue leftItem; + // NUdf::TUnboxedValue rightItem; + // bool leftHasValue = leftIter.Next(leftItem); + // bool rightHasValue = rightIter.Next(leftItem); + // while (leftHasValue && rightHasValue) { + // AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); + // leftHasValue = leftIter.Next(leftItem); + // rightHasValue = rightIter.Next(leftItem); + // } + // UNIT_ASSERT_EQUAL(leftHasValue, rightHasValue); + // } + // break; + // } + + // case TType::EKind::Struct: { + // auto structType = static_cast(type); + // UNIT_ASSERT_EQUAL(left.GetListLength(), structType->GetMembersCount()); + // UNIT_ASSERT_EQUAL(right.GetListLength(), structType->GetMembersCount()); + // for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + // auto memberType = structType->GetMemberType(index); + // NUdf::TUnboxedValue leftMember = left.GetElement(index); + // NUdf::TUnboxedValue rightMember = right.GetElement(index); + // AssertUnboxedValuesAreEqual(leftMember, rightMember, memberType); + // } + // break; + // } + + // case TType::EKind::Tuple: { + // auto tupleType = static_cast(type); + // UNIT_ASSERT_EQUAL(left.GetListLength(), tupleType->GetElementsCount()); + // UNIT_ASSERT_EQUAL(right.GetListLength(), tupleType->GetElementsCount()); + // for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + // auto elementType = tupleType->GetElementType(index); + // NUdf::TUnboxedValue leftMember = left.GetElement(index); + // NUdf::TUnboxedValue rightMember = right.GetElement(index); + // AssertUnboxedValuesAreEqual(leftMember, rightMember, elementType); + // } + // break; + // } + + // case TType::EKind::Dict: { + // auto dictType = static_cast(type); + // auto payloadType = dictType->GetPayloadType(); + + // UNIT_ASSERT_EQUAL(left.GetDictLength(), right.GetDictLength()); + // const auto leftIter = left.GetDictIterator(); + // for (NUdf::TUnboxedValue key, leftPayload; leftIter.NextPair(key, leftPayload);) { + // UNIT_ASSERT(right.Contains(key)); + // NUdf::TUnboxedValue rightPayload = right.Lookup(key); + // AssertUnboxedValuesAreEqual(leftPayload, rightPayload, payloadType); + // } + // break; + // } + + // case TType::EKind::Variant: { + // auto variantType = static_cast(type); + // UNIT_ASSERT_EQUAL(left.GetVariantIndex(), right.GetVariantIndex()); + // ui32 variantIndex = left.GetVariantIndex(); + // TType* innerType = variantType->GetUnderlyingType(); + // if (innerType->IsStruct()) { + // innerType = static_cast(innerType)->GetMemberType(variantIndex); + // } else { + // UNIT_ASSERT_C(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + // innerType = static_cast(innerType)->GetElementType(variantIndex); + // } + // NUdf::TUnboxedValue leftValue = left.GetVariantItem(); + // NUdf::TUnboxedValue rightValue = right.GetVariantItem(); + // AssertUnboxedValuesAreEqual(leftValue, rightValue, innerType); + // break; + // } + + default: { + UNIT_ASSERT_C(false, TStringBuilder() << "Unsupported type: " << type->GetKindAsStr()); } - - default: - THROW yexception() << "Unsupported type: " << type->GetKindAsStr(); } } @@ -826,22 +836,15 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { } for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - if constexpr (IsStringType) { - if constexpr (std::is_same_v) { - auto val = NBinaryJson::SerializeToJson(values[i].AsStringRef()); - UNIT_ASSERT(static_cast(typedArray->Value(i)) == val); - } else { - auto value = ExtractUnboxedValue(array, i, type, context.HolderFactory); - AssertUnboxedValuesAreEqual(value, values[i], type); - } - } else { - UNIT_ASSERT(static_cast(typedArray->Value(i)) == values[i].Get()); - } + auto left = ExtractUnboxedValue(array, i, type, context.HolderFactory); + auto right = values[i]; - if constexpr (IsTimezoneType) { - auto view = timezoneArray->Value(i); - UNIT_ASSERT(values[i].GetTimezoneId() == GetTimezoneId(NUdf::TStringRef(view.data(), view.size()))); + if constexpr (std::is_same_v) { + left = MakeString(NBinaryJson::SerializeToJson(left.AsStringRef())); + right = MakeString(NBinaryJson::SerializeToJson(right.AsStringRef())); } + + AssertUnboxedValuesAreEqual(left, right, type); } } @@ -876,17 +879,8 @@ void TestFixedSizeBinaryDataTypeConversion() { UNIT_ASSERT(typedArray->byte_width() == NScheme::FSB_SIZE); for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - auto view = typedArray->GetView(i); - if constexpr (IsDecimalType) { - NDecimal::TInt128 actual; - std::memcpy(&actual, view.data(), view.size()); - - NDecimal::TInt128 expected = values[i].GetInt128(); - UNIT_ASSERT(actual == expected); - } else { - auto expected = values[i].AsStringRef(); - UNIT_ASSERT_STRINGS_EQUAL(std::string(view.data(), view.size()), std::string(expected.Data(), expected.Size())); - } + auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], type); } } @@ -915,6 +909,11 @@ void TestSingularTypeConversion() { auto structArray = static_pointer_cast(array); UNIT_ASSERT(structArray->num_fields() == 0); } + + for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], type); + } } } // namespace From afaf700a58009c823ed777ce6286e6657c7cddcb Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 18:31:51 +0300 Subject: [PATCH 10/25] Assert list optional values --- .../ut/kqp_formats_arrow_ut.cpp | 120 ++++++++---------- 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index cce21e96648b..a5faab5c7930 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -674,6 +674,8 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& switch (dataSlot) { case NUdf::EDataSlot::JsonDocument: + left = MakeString(NKikimr::NBinaryJson::SerializeToJson(left.AsStringRef())); + right = MakeString(NKikimr::NBinaryJson::SerializeToJson(right.AsStringRef())); case NUdf::EDataSlot::Json: case NUdf::EDataSlot::Yson: { UNIT_ASSERT_VALUES_EQUAL(std::string(left.AsStringRef()), std::string(right.AsStringRef())); @@ -687,49 +689,54 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& break; } - // case TType::EKind::Optional: { - // UNIT_ASSERT_EQUAL(left.HasValue(), right.HasValue()); - // if (left.HasValue()) { - // auto innerType = static_cast(type)->GetItemType(); - // NUdf::TUnboxedValue leftInner = left.GetOptionalValue(); - // NUdf::TUnboxedValue rightInner = right.GetOptionalValue(); - // AssertUnboxedValuesAreEqual(leftInner, rightInner, innerType); - // } - // break; - // } + case TType::EKind::Optional: { + UNIT_ASSERT_VALUES_EQUAL(left.HasValue(), right.HasValue()); + if (left.HasValue()) { + auto innerType = static_cast(type)->GetItemType(); + NUdf::TUnboxedValue leftInner = left.GetOptionalValue(); + NUdf::TUnboxedValue rightInner = right.GetOptionalValue(); + AssertUnboxedValuesAreEqual(leftInner, rightInner, innerType); + } + break; + } - // case TType::EKind::List: { - // auto listType = static_cast(type); - // auto itemType = listType->GetItemType(); - // auto leftPtr = left.GetElements(); - // auto rightPtr = right.GetElements(); - // UNIT_ASSERT_EQUAL(leftPtr != nullptr, rightPtr != nullptr); - // if (leftPtr != nullptr) { - // auto leftLen = left.GetListLength(); - // auto rightLen = right.GetListLength(); - // UNIT_ASSERT_EQUAL(leftLen, rightLen); - // while (leftLen > 0) { - // NUdf::TUnboxedValue leftItem = *leftPtr++; - // NUdf::TUnboxedValue rightItem = *rightPtr++; - // AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); - // --leftLen; - // } - // } else { - // const auto leftIter = left.GetListIterator(); - // const auto rightIter = right.GetListIterator(); - // NUdf::TUnboxedValue leftItem; - // NUdf::TUnboxedValue rightItem; - // bool leftHasValue = leftIter.Next(leftItem); - // bool rightHasValue = rightIter.Next(leftItem); - // while (leftHasValue && rightHasValue) { - // AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); - // leftHasValue = leftIter.Next(leftItem); - // rightHasValue = rightIter.Next(leftItem); - // } - // UNIT_ASSERT_EQUAL(leftHasValue, rightHasValue); - // } - // break; - // } + case TType::EKind::List: { + auto listType = static_cast(type); + auto itemType = listType->GetItemType(); + + auto leftPtr = left.GetElements(); + auto rightPtr = right.GetElements(); + UNIT_ASSERT_VALUES_EQUAL(leftPtr != nullptr, rightPtr != nullptr); + + if (leftPtr != nullptr) { + auto leftLen = left.GetListLength(); + auto rightLen = right.GetListLength(); + UNIT_ASSERT_VALUES_EQUAL(leftLen, rightLen); + + while (leftLen > 0) { + NUdf::TUnboxedValue leftItem = *leftPtr++; + NUdf::TUnboxedValue rightItem = *rightPtr++; + AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); + --leftLen; + } + } else { + const auto leftIter = left.GetListIterator(); + const auto rightIter = right.GetListIterator(); + + NUdf::TUnboxedValue leftItem; + NUdf::TUnboxedValue rightItem; + bool leftHasValue = leftIter.Next(leftItem); + bool rightHasValue = rightIter.Next(leftItem); + + while (leftHasValue && rightHasValue) { + AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); + leftHasValue = leftIter.Next(leftItem); + rightHasValue = rightIter.Next(leftItem); + } + UNIT_ASSERT_VALUES_EQUAL(leftHasValue, rightHasValue); + } + break; + } // case TType::EKind::Struct: { // auto structType = static_cast(type); @@ -836,15 +843,8 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { } for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - auto left = ExtractUnboxedValue(array, i, type, context.HolderFactory); - auto right = values[i]; - - if constexpr (std::is_same_v) { - left = MakeString(NBinaryJson::SerializeToJson(left.AsStringRef())); - right = MakeString(NBinaryJson::SerializeToJson(right.AsStringRef())); - } - - AssertUnboxedValuesAreEqual(left, right, type); + auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], type); } } @@ -1087,21 +1087,9 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { UNIT_ASSERT(array->length() == static_cast(values.size())); UNIT_ASSERT(array->type_id() == arrow::Type::LIST); - if (IsOptional) { - auto listArray = static_pointer_cast(array); - for (size_t i = 0; i < values.size(); ++i) { - if (i % 2 == 0) { - UNIT_ASSERT(listArray->IsNull(i)); - continue; - } - - UNIT_ASSERT(!listArray->IsNull(i)); - auto slice = listArray->value_slice(i); - - for (size_t j = 0; j < static_cast(slice->length()); ++j) { - UNIT_ASSERT(j % 2 == 0 ? slice->IsNull(j) : !slice->IsNull(j)); - } - } + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); } } } From 075741c74cb7760947cb2444ed1e462e1181a5a3 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 23:24:43 +0300 Subject: [PATCH 11/25] Simplify, tuple, rm dicts --- .../ut/kqp_formats_arrow_ut.cpp | 253 +++++------------- 1 file changed, 61 insertions(+), 192 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index a5faab5c7930..85c2c9262cd7 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -258,69 +258,21 @@ struct TTestContext { return values; } - TType* GetDictUtf8ToIntervalType() { - TType* keyType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); - TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); - return TDictType::Create(keyType, payloadType, TypeEnv); - } - - TUnboxedValueVector CreateDictUtf8ToInterval(ui32 quantity) { - TUnboxedValueVector values; - auto dictType = GetDictUtf8ToIntervalType(); - for (ui32 value = 0; value < quantity; ++value) { - auto dictBuilder = Vb.NewDict(dictType, 0); - for (ui32 i = 0; i < value * value; ++i) { - std::string string = TStringBuilder() << "This is a long string #" << i; - NUdf::TUnboxedValue key = MakeString(NUdf::TStringRef(string.data(), string.size())); - NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(value * i)); - dictBuilder->Add(std::move(key), std::move(payload)); - } - auto dictValue = dictBuilder->Build(); - values.emplace_back(std::move(dictValue)); - } - return values; + TType* GetListType() { + auto itemType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TListType::Create(itemType, TypeEnv); } - TType* GetList(TType* itemType, bool optional = false) { - if (optional) { - itemType = TOptionalType::Create(itemType, TypeEnv); - } - auto listType = TListType::Create(itemType, TypeEnv); - if (optional) { - return TOptionalType::Create(listType, TypeEnv); - } - return listType; - } - - TUnboxedValueVector CreateList(ui32 quantity, TType* itemType, bool optional = false) { + TUnboxedValueVector CreateLists(ui32 quantity) { TUnboxedValueVector values; values.reserve(quantity); - for (ui64 value = 0; value < quantity; ++value) { - if (optional && value % 2 == 0) { - values.emplace_back(NUdf::TUnboxedValuePod()); - continue; - } - TUnboxedValueVector items; items.reserve(value); for (ui64 i = 0; i < value; ++i) { - if (optional) { - if (i % 2 == 0) { - items.push_back(NUdf::TUnboxedValuePod()); - } else { - items.push_back(GetValueOfBasicType(itemType, i).MakeOptional()); - } - } else { - items.push_back(GetValueOfBasicType(itemType, i)); - } + items.push_back(NUdf::TUnboxedValuePod(static_cast(-i))); } - auto listValue = Vb.NewList(items.data(), value); - if (optional) { - listValue = std::move(listValue).MakeOptional(); - } - values.emplace_back(std::move(listValue)); } return values; @@ -751,18 +703,20 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& // break; // } - // case TType::EKind::Tuple: { - // auto tupleType = static_cast(type); - // UNIT_ASSERT_EQUAL(left.GetListLength(), tupleType->GetElementsCount()); - // UNIT_ASSERT_EQUAL(right.GetListLength(), tupleType->GetElementsCount()); - // for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - // auto elementType = tupleType->GetElementType(index); - // NUdf::TUnboxedValue leftMember = left.GetElement(index); - // NUdf::TUnboxedValue rightMember = right.GetElement(index); - // AssertUnboxedValuesAreEqual(leftMember, rightMember, elementType); - // } - // break; - // } + case TType::EKind::Tuple: { + auto tupleType = static_cast(type); + + UNIT_ASSERT_EQUAL(left.GetListLength(), tupleType->GetElementsCount()); + UNIT_ASSERT_EQUAL(right.GetListLength(), tupleType->GetElementsCount()); + + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto elementType = tupleType->GetElementType(index); + NUdf::TUnboxedValue leftMember = left.GetElement(index); + NUdf::TUnboxedValue rightMember = right.GetElement(index); + AssertUnboxedValuesAreEqual(leftMember, rightMember, elementType); + } + break; + } // case TType::EKind::Dict: { // auto dictType = static_cast(type); @@ -1075,22 +1029,52 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } // Nested types - Y_UNIT_TEST_TWIN(NestedType_List, IsOptional) { + Y_UNIT_TEST(NestedType_List) { TTestContext context; - for (auto itemType: context.BasicTypes) { - auto listType = context.GetList(itemType, IsOptional); - auto values = context.CreateList(100, itemType, IsOptional); + auto listType = context.GetListType(); + auto values = context.CreateLists(100); - auto array = MakeArrowArray(values, listType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(array->length() == static_cast(values.size())); - UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT(array->ValidateFull().ok()); + UNIT_ASSERT(array->length() == static_cast(values.size())); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); - for (size_t i = 0; i < values.size(); ++i) { - auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); - AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); - } + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->num_fields() == 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + + Y_UNIT_TEST(NestedType_Tuple) { + TTestContext context; + + auto tupleType = context.GetTupleType(); + auto values = context.CreateTuples(100); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT(array->ValidateFull().ok()); + UNIT_ASSERT(array->length() == static_cast(values.size())); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT(structArray->num_fields() == 3); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); + + UNIT_ASSERT(static_cast(structArray->field(0)->length()) == values.size()); + UNIT_ASSERT(static_cast(structArray->field(1)->length()) == values.size()); + UNIT_ASSERT(static_cast(structArray->field(2)->length()) == values.size()); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); } } } @@ -1137,46 +1121,6 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { } } - Y_UNIT_TEST(Tuple) { - TTestContext context; - - auto tupleType = context.GetTupleType(); - UNIT_ASSERT(IsArrowCompatible(tupleType)); - - auto values = context.CreateTuples(100); - auto array = MakeArrowArray(values, tupleType); - UNIT_ASSERT(array->ValidateFull().ok()); - - UNIT_ASSERT(array->length() == static_cast(values.size())); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 3); - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); - UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(static_cast(structArray->field(0)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(1)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(2)->length()) == values.size()); - auto boolArray = static_pointer_cast(structArray->field(0)); - auto int8Array = static_pointer_cast(structArray->field(1)); - auto uint8Array = static_pointer_cast(structArray->field(2)); - auto index = 0; - for (const auto& value: values) { - auto boolValue = value.GetElement(0).Get(); - auto boolArrow = boolArray->Value(index); - UNIT_ASSERT(boolValue == boolArrow); - - auto intValue = value.GetElement(1).Get(); - auto intArrow = int8Array->Value(index); - UNIT_ASSERT(intValue == intArrow); - - auto uIntValue = value.GetElement(2).Get(); - auto uIntArrow = uint8Array->Value(index); - UNIT_ASSERT(uIntValue == uIntArrow); - ++index; - } - } - Y_UNIT_TEST(OptionalListOfOptional) { TTestContext context; @@ -1611,51 +1555,6 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { } Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { - Y_UNIT_TEST(DictUtf8ToInterval) { - TTestContext context; - - auto dictType = context.GetDictUtf8ToIntervalType(); - UNIT_ASSERT(IsArrowCompatible(dictType)); - - auto values = context.CreateDictUtf8ToInterval(100); - auto array = MakeArrowArray(values, dictType); - UNIT_ASSERT(array->ValidateFull().ok()); - - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto wrapArray = static_pointer_cast(array); - UNIT_ASSERT_VALUES_EQUAL(wrapArray->num_fields(), 2); - UNIT_ASSERT_VALUES_EQUAL(static_cast(wrapArray->length()), values.size()); - - UNIT_ASSERT(wrapArray->field(0)->type_id() == arrow::Type::MAP); - auto mapArray = static_pointer_cast(wrapArray->field(0)); - UNIT_ASSERT_VALUES_EQUAL(static_cast(mapArray->length()), values.size()); - - UNIT_ASSERT(wrapArray->field(1)->type_id() == arrow::Type::UINT64); - auto customArray = static_pointer_cast(wrapArray->field(1)); - UNIT_ASSERT_VALUES_EQUAL(static_cast(customArray->length()), values.size()); - - UNIT_ASSERT_VALUES_EQUAL(mapArray->num_fields(), 1); - - UNIT_ASSERT(mapArray->keys()->type_id() == arrow::Type::STRING); - auto utf8Array = static_pointer_cast(mapArray->keys()); - - UNIT_ASSERT(mapArray->items()->type_id() == arrow::Type::INT64); - auto intervalArray = static_pointer_cast(mapArray->items()); - - ui64 index = 0; - for (const auto& value: values) { - UNIT_ASSERT_VALUES_EQUAL(value.GetDictLength(), static_cast(mapArray->value_length(index))); - for (auto subindex = mapArray->value_offset(index); subindex < mapArray->value_offset(index + 1); ++subindex) { - auto keyArrow = utf8Array->GetView(subindex); - NUdf::TUnboxedValue key = MakeString(NUdf::TStringRef(keyArrow.data(), keyArrow.size())); - UNIT_ASSERT(value.Contains(key)); - NUdf::TUnboxedValue payloadValue = value.Lookup(key); - UNIT_ASSERT_VALUES_EQUAL(intervalArray->Value(subindex), payloadValue.Get()); - } - ++index; - } - } - Y_UNIT_TEST(DictOptionalToTuple) { TTestContext context; @@ -1802,36 +1701,6 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ } } - Y_UNIT_TEST(Tuple) { - TTestContext context; - - auto tupleType = context.GetTupleType(); - UNIT_ASSERT(IsArrowCompatible(tupleType)); - - auto values = context.CreateTuples(100); - auto array = MakeArrowArray(values, tupleType); - auto restoredValues = ExtractUnboxedVector(array, tupleType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], tupleType); - } - } - - Y_UNIT_TEST(DictUtf8ToInterval) { - TTestContext context; - - auto dictType = context.GetDictUtf8ToIntervalType(); - UNIT_ASSERT(IsArrowCompatible(dictType)); - - auto values = context.CreateDictUtf8ToInterval(100); - auto array = MakeArrowArray(values, dictType); - auto restoredValues = ExtractUnboxedVector(array, dictType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); - } - } - Y_UNIT_TEST(OptionalListOfOptional) { TTestContext context; From 05807729145e91f6d95b959e8d30ad96a0dacc82 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 23:35:16 +0300 Subject: [PATCH 12/25] Struct tests --- .../ut/kqp_formats_arrow_ut.cpp | 76 ++++++++++++++----- 1 file changed, 57 insertions(+), 19 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 85c2c9262cd7..9328635bef8b 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -214,23 +214,29 @@ struct TTestContext { } TType* GetStructType() { - TStructMember members[3] = { - {"s", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"x", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"y", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} + std::vector members = { + {"ABC", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"DEF", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"GHI", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"JKL", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"MNO", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, }; - return TStructType::Create(3, members, TypeEnv); + return TStructType::Create(5, members.data(), TypeEnv); } TUnboxedValueVector CreateStructs(ui32 quantity) { TUnboxedValueVector values; for (ui32 value = 0; value < quantity; ++value) { NUdf::TUnboxedValue* items; - auto structValue = Vb.NewArray(3, items); + auto structValue = Vb.NewArray(5, items); + std::string string = TStringBuilder() << value; items[0] = MakeString(NUdf::TStringRef(string.data(), string.size())); items[1] = NUdf::TUnboxedValuePod(static_cast(-value)); - items[2] = NUdf::TUnboxedValuePod((ui64) (value * value)); + items[2] = NUdf::TUnboxedValuePod((ui64) (value)); + items[3] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[4] = NUdf::TUnboxedValuePod(MakeString(NUdf::TStringRef(string.data(), string.size()))); + values.emplace_back(std::move(structValue)); } return values; @@ -690,18 +696,18 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& break; } - // case TType::EKind::Struct: { - // auto structType = static_cast(type); - // UNIT_ASSERT_EQUAL(left.GetListLength(), structType->GetMembersCount()); - // UNIT_ASSERT_EQUAL(right.GetListLength(), structType->GetMembersCount()); - // for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - // auto memberType = structType->GetMemberType(index); - // NUdf::TUnboxedValue leftMember = left.GetElement(index); - // NUdf::TUnboxedValue rightMember = right.GetElement(index); - // AssertUnboxedValuesAreEqual(leftMember, rightMember, memberType); - // } - // break; - // } + case TType::EKind::Struct: { + auto structType = static_cast(type); + UNIT_ASSERT_EQUAL(left.GetListLength(), structType->GetMembersCount()); + UNIT_ASSERT_EQUAL(right.GetListLength(), structType->GetMembersCount()); + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto memberType = structType->GetMemberType(index); + NUdf::TUnboxedValue leftMember = left.GetElement(index); + NUdf::TUnboxedValue rightMember = right.GetElement(index); + AssertUnboxedValuesAreEqual(leftMember, rightMember, memberType); + } + break; + } case TType::EKind::Tuple: { auto tupleType = static_cast(type); @@ -1077,6 +1083,38 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); } } + + Y_UNIT_TEST(NestedType_Struct) { + TTestContext context; + + auto structType = context.GetStructType(); + auto values = context.CreateStructs(100); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT(array->ValidateFull().ok()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + + UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC")->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF")->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI")->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL")->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO")->type_id() == arrow::Type::STRING); + + UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("ABC")->length(), values.size()); + UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("DEF")->length(), values.size()); + UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("GHI")->length(), values.size()); + UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("JKL")->length(), values.size()); + UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("MNO")->length(), values.size()); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } } Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { From 83086642a88b345a14796effd0f54a9059328057 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 23:49:28 +0300 Subject: [PATCH 13/25] Blazing asserts, remove old code, some fixes --- .../ut/kqp_formats_arrow_ut.cpp | 126 ++++++------------ 1 file changed, 40 insertions(+), 86 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 9328635bef8b..f779d639b31c 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -20,7 +20,8 @@ using namespace NKikimr::NMiniKQL; using namespace NYql; -inline static constexpr size_t TEST_ARRAY_SIZE = 1 << 16; +inline static constexpr size_t TEST_ARRAY_DATATYPE_SIZE = 1 << 16; +inline static constexpr size_t TEST_ARRAY_NESTED_SIZE = 1 << 8; inline static constexpr ui8 DECIMAL_PRECISION = 35; inline static constexpr ui8 DECIMAL_SCALE = 10; @@ -775,15 +776,15 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { UNIT_ASSERT(IsArrowCompatible(type)); TUnboxedValueVector values; - values.reserve(TEST_ARRAY_SIZE); + values.reserve(TEST_ARRAY_DATATYPE_SIZE); - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { values.emplace_back(GetValueOfBasicType(type, i)); } auto array = MakeArrowArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); - UNIT_ASSERT(array->length() == static_cast(values.size())); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); std::shared_ptr typedArray; std::shared_ptr timezoneArray; @@ -791,7 +792,8 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { if constexpr (IsTimezoneType) { UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 2); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrowTypeId); UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRING); @@ -802,7 +804,7 @@ void TestDataTypeConversion(arrow::Type::type arrowTypeId) { typedArray = static_pointer_cast(array); } - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); AssertUnboxedValuesAreEqual(arrowValue, values[i], type); } @@ -822,23 +824,23 @@ void TestFixedSizeBinaryDataTypeConversion() { UNIT_ASSERT(IsArrowCompatible(type)); TUnboxedValueVector values; - values.reserve(TEST_ARRAY_SIZE); + values.reserve(TEST_ARRAY_DATATYPE_SIZE); - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { values.emplace_back(GetValueOfBasicType(type, i)); } auto array = MakeArrowArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); - UNIT_ASSERT(array->length() == static_cast(values.size())); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); std::shared_ptr typedArray; UNIT_ASSERT(array->type_id() == arrow::Type::FIXED_SIZE_BINARY); typedArray = static_pointer_cast(array); - UNIT_ASSERT(typedArray->byte_width() == NScheme::FSB_SIZE); + UNIT_ASSERT_VALUES_EQUAL(typedArray->byte_width(), NScheme::FSB_SIZE); - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); AssertUnboxedValuesAreEqual(arrowValue, values[i], type); } @@ -852,25 +854,25 @@ void TestSingularTypeConversion() { UNIT_ASSERT(IsArrowCompatible(type)); TUnboxedValueVector values; - values.reserve(TEST_ARRAY_SIZE); + values.reserve(TEST_ARRAY_DATATYPE_SIZE); - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { values.emplace_back(); } auto array = MakeArrowArray(values, type); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); - UNIT_ASSERT(array->length() == static_cast(TEST_ARRAY_SIZE)); + UNIT_ASSERT_VALUES_EQUAL(array->length(), TEST_ARRAY_DATATYPE_SIZE); if (SingularKind == TType::EKind::Null) { UNIT_ASSERT(array->type_id() == arrow::Type::NA); } else { UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 0); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 0); } - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); AssertUnboxedValuesAreEqual(arrowValue, values[i], type); } @@ -1039,15 +1041,17 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { TTestContext context; auto listType = context.GetListType(); - auto values = context.CreateLists(100); + auto values = context.CreateLists(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); auto array = MakeArrowArray(values, listType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(array->length() == static_cast(values.size())); - UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); auto listArray = static_pointer_cast(array); - UNIT_ASSERT(listArray->num_fields() == 1); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); for (size_t i = 0; i < values.size(); ++i) { @@ -1060,23 +1064,25 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { TTestContext context; auto tupleType = context.GetTupleType(); - auto values = context.CreateTuples(100); + auto values = context.CreateTuples(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); auto array = MakeArrowArray(values, tupleType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(array->length() == static_cast(values.size())); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 3); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(static_cast(structArray->field(0)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(1)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(2)->length()) == values.size()); + UNIT_ASSERT_VALUES_EQUAL(static_cast(structArray->field(0)->length()), values.size()); + UNIT_ASSERT_VALUES_EQUAL(static_cast(structArray->field(1)->length()), values.size()); + UNIT_ASSERT_VALUES_EQUAL(static_cast(structArray->field(2)->length()), values.size()); for (size_t i = 0; i < values.size(); ++i) { auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); @@ -1088,13 +1094,15 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { TTestContext context; auto structType = context.GetStructType(); - auto values = context.CreateStructs(100); + auto values = context.CreateStructs(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); auto array = MakeArrowArray(values, structType); - UNIT_ASSERT(array->ValidateFull().ok()); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); auto structArray = static_pointer_cast(array); UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); @@ -1118,47 +1126,6 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { - Y_UNIT_TEST(Struct) { - TTestContext context; - - auto structType = context.GetStructType(); - UNIT_ASSERT(IsArrowCompatible(structType)); - - auto values = context.CreateStructs(100); - auto array = MakeArrowArray(values, structType); - - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(array->length() == static_cast(values.size())); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 3); - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); - UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); - UNIT_ASSERT(static_cast(structArray->field(0)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(1)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(2)->length()) == values.size()); - auto binaryArray = static_pointer_cast(structArray->field(0)); - auto int32Array = static_pointer_cast(structArray->field(1)); - auto uint64Array = static_pointer_cast(structArray->field(2)); - auto index = 0; - for (const auto& value: values) { - auto stringValue = value.GetElement(0); - auto stringRef = stringValue.AsStringRef(); - auto stringView = binaryArray->GetView(index); - UNIT_ASSERT_EQUAL(std::string(stringRef.Data(), stringRef.Size()), std::string(stringView)); - - auto intValue = value.GetElement(1).Get(); - auto intArrow = int32Array->Value(index); - UNIT_ASSERT_EQUAL(intValue, intArrow); - - auto uIntValue = value.GetElement(2).Get(); - auto uIntArrow = uint64Array->Value(index); - UNIT_ASSERT_EQUAL(uIntValue, uIntArrow); - ++index; - } - } - Y_UNIT_TEST(OptionalListOfOptional) { TTestContext context; @@ -1726,19 +1693,6 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { } Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ - Y_UNIT_TEST(Struct) { - TTestContext context; - - auto structType = context.GetStructType(); - auto values = context.CreateStructs(100); - auto array = MakeArrowArray(values, structType); - auto restoredValues = ExtractUnboxedVector(array, structType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], structType); - } - } - Y_UNIT_TEST(OptionalListOfOptional) { TTestContext context; From 540138967951c8d303f8224c21b4e2b90488c91f Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Wed, 5 Nov 2025 23:53:16 +0300 Subject: [PATCH 14/25] MORE CHECKS FOR STRUCT --- .../ut/kqp_formats_arrow_ut.cpp | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index f779d639b31c..a1f036906171 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -1106,17 +1106,21 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { auto structArray = static_pointer_cast(array); UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); - UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC")->type_id() == arrow::Type::BINARY); - UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF")->type_id() == arrow::Type::INT32); - UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI")->type_id() == arrow::Type::UINT64); - UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL")->type_id() == arrow::Type::INT64); - UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO")->type_id() == arrow::Type::STRING); - - UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("ABC")->length(), values.size()); - UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("DEF")->length(), values.size()); - UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("GHI")->length(), values.size()); - UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("JKL")->length(), values.size()); - UNIT_ASSERT_VALUES_EQUAL(structArray->GetFieldByName("MNO")->length(), values.size()); + UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI") == structArray->field(2)); + UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL") == structArray->field(3)); + UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO") == structArray->field(4)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + for (int i = 0; i < structArray->num_fields(); ++i) { + UNIT_ASSERT_VALUES_EQUAL(structArray->field(i)->length(), values.size()); + } for (size_t i = 0; i < values.size(); ++i) { auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); From a5de4c91ccfb5a5488e2423900820eba6d1f2237 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Thu, 6 Nov 2025 12:43:48 +0300 Subject: [PATCH 15/25] Support dict --- .../result_set_format/kqp_formats_arrow.cpp | 67 ++++------------- .../result_set_format/kqp_formats_arrow.h | 2 +- .../ut/kqp_formats_arrow_ut.cpp | 72 +++++++++++++++---- .../ut/kqp_formats_ut_helpers.cpp | 41 +++-------- 4 files changed, 85 insertions(+), 97 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index 9964b34c4fb2..4a5bb3c4dac7 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -109,22 +109,11 @@ std::shared_ptr GetArrowType(const NMiniKQL::TDictType* dictTyp auto keyType = dictType->GetKeyType(); auto payloadType = dictType->GetPayloadType(); - auto keyArrowType = NFormats::GetArrowType(keyType); - auto payloadArrowType = NFormats::GetArrowType(payloadType); - - auto custom = std::make_shared("custom", arrow::uint64(), false); - - if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { - std::vector> items; - items.emplace_back(std::make_shared("key", keyArrowType, true)); - items.emplace_back(std::make_shared("payload", payloadArrowType, payloadType->IsOptional())); - - auto fieldMap = std::make_shared("map", arrow::list(arrow::struct_(items)), false); - return arrow::struct_({fieldMap, custom}); - } - - auto fieldMap = std::make_shared("map", arrow::map(keyArrowType, payloadArrowType), false); - return arrow::struct_({fieldMap, custom}); + auto structType = arrow::struct_({ + std::make_shared("key", NFormats::GetArrowType(keyType), keyType->IsOptional()), + std::make_shared("payload", NFormats::GetArrowType(payloadType), payloadType->IsOptional()) + }); + return arrow::list(structType); } std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* variantType) { @@ -735,49 +724,23 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons arrow::ArrayBuilder* itemBuilder = nullptr; arrow::StructBuilder* structBuilder = nullptr; - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - arrow::StructBuilder* wrapBuilder = reinterpret_cast(builder); - YQL_ENSURE(wrapBuilder->num_fields() == 2, "Unexpected number of fields"); + YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); + arrow::ListBuilder* listBuilder = reinterpret_cast(builder); - auto status = wrapBuilder->Append(); + auto status = listBuilder->Append(); YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { - YQL_ENSURE(wrapBuilder->field_builder(0)->type()->id() == arrow::Type::LIST, "Unexpected builder type"); - auto listBuilder = reinterpret_cast(wrapBuilder->field_builder(0)); - - auto status = listBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - YQL_ENSURE(listBuilder->value_builder()->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - structBuilder = reinterpret_cast( - listBuilder->value_builder()); - YQL_ENSURE(structBuilder->num_fields() == 2, "Unexpected number of fields"); - - keyBuilder = structBuilder->field_builder(0); - itemBuilder = structBuilder->field_builder(1); - } else { - YQL_ENSURE(wrapBuilder->field_builder(0)->type()->id() == arrow::Type::MAP, "Unexpected builder type"); - auto mapBuilder = reinterpret_cast(wrapBuilder->field_builder(0)); - - auto status = mapBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - keyBuilder = mapBuilder->key_builder(); - itemBuilder = mapBuilder->item_builder(); - } + YQL_ENSURE(listBuilder->value_builder()->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + structBuilder = reinterpret_cast(listBuilder->value_builder()); + YQL_ENSURE(structBuilder->num_fields() == 2, "Unexpected number of fields"); - arrow::UInt64Builder* customBuilder = reinterpret_cast(wrapBuilder->field_builder(1)); - status = customBuilder->Append(0); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); + keyBuilder = structBuilder->field_builder(0); + itemBuilder = structBuilder->field_builder(1); - // We do not sort dictionary before appending it to builder. const auto iter = value.GetDictIterator(); for (NUdf::TUnboxedValue key, payload; iter.NextPair(key, payload);) { - if (structBuilder != nullptr) { - status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - } + status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); AppendElement(key, keyBuilder, keyType); AppendElement(payload, itemBuilder, payloadType); diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h index 92a454acf443..5178614afb25 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h @@ -128,7 +128,7 @@ bool NeedWrapByExternalOptional(const NMiniKQL::TType* type); * - Data types: mapped according to SwitchMiniKQLDataTypeToArrowType * - Struct/Tuple: converted to arrow::StructType * - List: converted to arrow::ListType - * - Dict: converted to arrow::MapType or List (if key is Optional) + * - Dict: converted to arrow::ListType of arrow::StructType * - Variant: converted to arrow::DenseUnionType * - Optional: nested optionals are flattened and represented via struct wrapping * diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index a1f036906171..1909a9f5c746 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -285,6 +285,27 @@ struct TTestContext { return values; } + TType* GetDictType() { + TType* keyType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDicts(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue key = NUdf::TUnboxedValuePod(static_cast(i)); + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + TType* GetOptionalListOfOptional() { TType* itemType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); return TOptionalType::Create(TListType::Create(itemType, TypeEnv), TypeEnv); @@ -725,19 +746,18 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& break; } - // case TType::EKind::Dict: { - // auto dictType = static_cast(type); - // auto payloadType = dictType->GetPayloadType(); + case TType::EKind::Dict: { + auto dictType = static_cast(type); + UNIT_ASSERT_VALUES_EQUAL(left.GetDictLength(), right.GetDictLength()); - // UNIT_ASSERT_EQUAL(left.GetDictLength(), right.GetDictLength()); - // const auto leftIter = left.GetDictIterator(); - // for (NUdf::TUnboxedValue key, leftPayload; leftIter.NextPair(key, leftPayload);) { - // UNIT_ASSERT(right.Contains(key)); - // NUdf::TUnboxedValue rightPayload = right.Lookup(key); - // AssertUnboxedValuesAreEqual(leftPayload, rightPayload, payloadType); - // } - // break; - // } + const auto leftIter = left.GetDictIterator(); + for (NUdf::TUnboxedValue key, leftPayload; leftIter.NextPair(key, leftPayload);) { + UNIT_ASSERT(right.Contains(key)); + NUdf::TUnboxedValue rightPayload = right.Lookup(key); + AssertUnboxedValuesAreEqual(leftPayload, rightPayload, dictType->GetPayloadType()); + } + break; + } // case TType::EKind::Variant: { // auto variantType = static_cast(type); @@ -1127,6 +1147,34 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); } } + + Y_UNIT_TEST(NestedType_Dict) { + TTestContext context; + + auto dictType = context.GetDictType(); + auto values = context.CreateDicts(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DOUBLE); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } } Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index f1cccbf1fa3b..ef3cebbba9ce 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -314,42 +314,19 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr auto keyType = dictType->GetKeyType(); auto payloadType = dictType->GetPayloadType(); - auto dictBuilder = holderFactory.NewDict(dictType, NUdf::TDictFlags::EDictKind::Hashed); + auto dictBuilder = holderFactory.NewDict(dictType, 0); - std::shared_ptr keyArray = nullptr; - std::shared_ptr payloadArray = nullptr; - ui64 dictLength = 0; - ui64 offset = 0; - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto wrapArray = static_pointer_cast(array); - YQL_ENSURE(wrapArray->num_fields() == 2, "Unexpected count of fields"); - - auto dictSlice = wrapArray->field(0); - - if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { - YQL_ENSURE(dictSlice->type_id() == arrow::Type::LIST, "Unexpected array type"); - auto listArray = static_pointer_cast(dictSlice); - - auto arraySlice = listArray->value_slice(row); - YQL_ENSURE(arraySlice->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto structArray = static_pointer_cast(arraySlice); - YQL_ENSURE(structArray->num_fields() == 2, "Unexpected count of fields"); + YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); + auto listArray = static_pointer_cast(array); + YQL_ENSURE(listArray->value_type()->id() == arrow::Type::STRUCT, "Unexpected array type"); - dictLength = arraySlice->length(); - keyArray = structArray->field(0); - payloadArray = structArray->field(1); - } else { - YQL_ENSURE(dictSlice->type_id() == arrow::Type::MAP, "Unexpected array type"); - auto mapArray = static_pointer_cast(dictSlice); + auto structArray = static_pointer_cast(listArray->value_slice(row)); + YQL_ENSURE(static_cast(structArray->num_fields()) == 2, "Unexpected count of fields"); - dictLength = mapArray->value_length(row); - offset = mapArray->value_offset(row); - keyArray = mapArray->keys(); - payloadArray = mapArray->items(); - } + std::shared_ptr keyArray = structArray->field(0); + std::shared_ptr payloadArray = structArray->field(1); - for (ui64 i = offset; i < offset + static_cast(dictLength); ++i) { + for (ui64 i = 0; i < static_cast(structArray->length()); ++i) { auto key = ExtractUnboxedValue(keyArray, i, keyType, holderFactory); auto payload = ExtractUnboxedValue(payloadArray, i, payloadType, holderFactory); dictBuilder->Add(std::move(key), std::move(payload)); From d02a7f585b0d2a86780c772a45f1b8ee9cbf460d Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Thu, 6 Nov 2025 13:59:05 +0300 Subject: [PATCH 16/25] Support optional --- .../result_set_format/kqp_formats_arrow.cpp | 14 ++- .../ut/kqp_formats_arrow_ut.cpp | 115 ++++++++++++++++++ .../ut/kqp_formats_ut_helpers.cpp | 15 +-- 3 files changed, 132 insertions(+), 12 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index 4a5bb3c4dac7..f7e1f15760ff 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -182,17 +182,18 @@ std::shared_ptr GetArrowType(const NMiniKQL::TOptionalType* opt ++depth; } + // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer + // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) + // Thus, the depth == 2 for Optional> type if (NeedWrapByExternalOptional(currentType)) { ++depth; } std::shared_ptr innerArrowType = NFormats::GetArrowType(currentType); - - for (ui32 i = 1; i < depth; ++i) { - auto field = std::make_shared("opt", innerArrowType, false); - innerArrowType = std::make_shared(std::vector>{field}); + while (depth > 1) { + innerArrowType = arrow::struct_({std::make_shared("opt", innerArrowType, true)}); + --depth; } - return innerArrowType; } @@ -619,6 +620,9 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons ++depth; } + // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer + // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) + // Thus, the depth == 2 for Optional> type if (NeedWrapByExternalOptional(innerType)) { ++depth; } diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 1909a9f5c746..b5a4cad1103d 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -306,6 +306,56 @@ struct TTestContext { return values; } + TType* GetDataOptionalType() { + return TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + } + + TUnboxedValueVector CreateDataOptionals(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetSingularOptionalType() { + return TOptionalType::Create(GetTypeOfSingular(TypeEnv), TypeEnv); + } + + TUnboxedValueVector CreateSingularOptionals(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod().MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetDoubleOptionalType() { + return TOptionalType::Create(GetDataOptionalType(), TypeEnv); + } + + TUnboxedValueVector CreateDoubleOptionals(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 3 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional().MakeOptional()); + } else if (value % 3 == 1) { + values.push_back(NUdf::TUnboxedValuePod().MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + TType* GetOptionalListOfOptional() { TType* itemType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); return TOptionalType::Create(TListType::Create(itemType, TypeEnv), TypeEnv); @@ -1175,6 +1225,71 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); } } + + Y_UNIT_TEST(NestedType_Optional_Data) { + TTestContext context; + + auto optionalType = context.GetDataOptionalType(); + auto values = context.CreateDataOptionals(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_Singular) { + TTestContext context; + + auto optionalType = context.GetSingularOptionalType(); + auto values = context.CreateSingularOptionals(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::NA); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_Double) { + TTestContext context; + + auto optionalType = context.GetDoubleOptionalType(); + auto values = context.CreateDoubleOptionals(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } } Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index ef3cebbba9ce..3f61d5993a47 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -256,8 +256,6 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr auto innerArray = array; auto innerType = itemType; - - NUdf::TUnboxedValue value; int depth = 0; while (innerArray->type_id() == arrow::Type::STRUCT) { @@ -265,8 +263,11 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); if (structArray->IsNull(row)) { - value = NUdf::TUnboxedValuePod(); - break; + NUdf::TUnboxedValue value; + for (int i = 0; i < depth; ++i) { + value = value.MakeOptional(); + } + return value; } innerType = static_cast(innerType)->GetItemType(); @@ -274,10 +275,10 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr ++depth; } - auto wrap = NeedWrapByExternalOptional(innerType); - if (wrap || !innerArray->IsNull(row)) { + NUdf::TUnboxedValue value; + if (NeedWrapByExternalOptional(innerType) || !innerArray->IsNull(row)) { value = ExtractUnboxedValue(innerArray, row, innerType, holderFactory); - if (wrap) { + if (NeedWrapByExternalOptional(innerType)) { --depth; } } From b33832006662901f6dbccbcb70c850c42ace9c41 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 7 Nov 2025 00:12:26 +0300 Subject: [PATCH 17/25] Support Tagged, fix comments --- .../result_set_format/kqp_formats_arrow.cpp | 12 +++--- .../result_set_format/kqp_formats_arrow.h | 6 +-- .../ut/kqp_formats_arrow_ut.cpp | 37 +++++++++++++++++++ .../ut/kqp_formats_ut_helpers.cpp | 23 ++++++++++-- .../ut/kqp_formats_ut_helpers.h | 8 ++++ 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index f7e1f15760ff..f89adca4e86e 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -174,11 +175,11 @@ std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* vari } std::shared_ptr GetArrowType(const NMiniKQL::TOptionalType* optionalType) { - auto currentType = optionalType->GetItemType(); + auto currentType = SkipTaggedType(optionalType->GetItemType()); ui32 depth = 1; while (currentType->IsOptional()) { - currentType = static_cast(currentType)->GetItemType(); + currentType = SkipTaggedType(static_cast(currentType)->GetItemType()); ++depth; } @@ -612,11 +613,11 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons } case NMiniKQL::TType::EKind::Optional: { - auto innerType = static_cast(type)->GetItemType(); + auto innerType = SkipTaggedType(static_cast(type)->GetItemType()); ui32 depth = 1; while (innerType->IsOptional()) { - innerType = static_cast(innerType) ->GetItemType(); + innerType = SkipTaggedType(static_cast(innerType) ->GetItemType()); ++depth; } @@ -797,7 +798,8 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons } case NMiniKQL::TType::EKind::Tagged: { - // TODO: Support Tagged type + auto taggedType = static_cast(type); + AppendElement(value, builder, taggedType->GetBaseType()); break; } diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h index 5178614afb25..6bfd66e9bcbc 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h @@ -107,7 +107,7 @@ bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot typeId, TFunc&& callback) * @brief Determines if a type requires wrapping in an external Optional layer. * * Some MiniKQL types don't have a native validity bitmap in Arrow representation - * (e.g., Variant, Null, Void). These types need to be wrapped in an additional + * (e.g., Variant, Null). These types need to be wrapped in an additional * struct layer when used as optional values to properly represent NULL states. * * @param type The MiniKQL type to check @@ -121,7 +121,7 @@ bool NeedWrapByExternalOptional(const NMiniKQL::TType* type); * @brief Converts a MiniKQL type to its corresponding Arrow DataType. * * This function recursively converts complex MiniKQL types (Struct, Tuple, List, Dict, - * Variant, Optional) to their Arrow equivalents. The conversion preserves the structure + * Variant, Optional, Tagged) to their Arrow equivalents. The conversion preserves the structure * and nullability information. * * Conversion rules: @@ -131,6 +131,7 @@ bool NeedWrapByExternalOptional(const NMiniKQL::TType* type); * - Dict: converted to arrow::ListType of arrow::StructType * - Variant: converted to arrow::DenseUnionType * - Optional: nested optionals are flattened and represented via struct wrapping + * - Tagged: converted to inner type * * @param type The MiniKQL type to convert * @return Shared pointer to corresponding Arrow DataType, or arrow::NullType if unsupported @@ -147,7 +148,6 @@ std::shared_ptr GetArrowType(const NMiniKQL::TType* type); * @param type The MiniKQL type to validate * @return true if the type can be converted to Arrow format, false otherwise * - * @note Compatible types: Data, Struct, Tuple, List, Dict, Variant, Optional, Tagged * @note Incompatible types: Type, Stream, Callable, Any, Resource, Flow, Block, Pg, Multi, Linear */ bool IsArrowCompatible(const NMiniKQL::TType* type); diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index b5a4cad1103d..e604b9c7f013 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -356,6 +356,18 @@ struct TTestContext { return values; } + TType* GetTaggedType() { + return TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggeds(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value))); + } + return values; + } + TType* GetOptionalListOfOptional() { TType* itemType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); return TOptionalType::Create(TListType::Create(itemType, TypeEnv), TypeEnv); @@ -826,6 +838,12 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& // break; // } + case TType::EKind::Tagged: { + auto taggedType = static_cast(type); + AssertUnboxedValuesAreEqual(left, right, taggedType->GetBaseType()); + break; + } + default: { UNIT_ASSERT_C(false, TStringBuilder() << "Unsupported type: " << type->GetKindAsStr()); } @@ -1290,6 +1308,25 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); } } + + Y_UNIT_TEST(NestedType_Tagged) { + TTestContext context; + + auto taggedType = context.GetTaggedType(); + auto values = context.CreateTaggeds(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } } Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index 3f61d5993a47..3f1910917b41 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -249,7 +250,7 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr case NMiniKQL::TType::EKind::Optional: { auto optionalType = static_cast(itemType); - auto innerOptionalType = optionalType->GetItemType(); + auto innerOptionalType = SkipTaggedType(optionalType->GetItemType()); if (NeedWrapByExternalOptional(innerOptionalType)) { YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); @@ -270,7 +271,7 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr return value; } - innerType = static_cast(innerType)->GetItemType(); + innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); innerArray = structArray->field(0); ++depth; } @@ -369,7 +370,23 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr NUdf::TUnboxedValue value = ExtractUnboxedValue(valuesArray, rowInChild, innerType, holderFactory); return holderFactory.CreateVariantHolder(value.Release(), variantIndex); } - default: { + + case NMiniKQL::TType::EKind::Tagged: { + auto taggedType = static_cast(itemType); + return ExtractUnboxedValue(array, row, taggedType->GetBaseType(), holderFactory); + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { YQL_ENSURE(false, "Unsupported type: " << itemType->GetKindAsStr()); } } diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h index 279a421aab47..028183709c9b 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h @@ -5,6 +5,14 @@ #include #include +/** + * @file kqp_formats_ut_helpers.h + * @brief Utilities for testing KQP formats. + * + * This module provides utilities for testing KQP formats. + * It includes functions for making arrow arrays and extracting unboxed values from arrow arrays. + */ + namespace NKikimr::NKqp::NFormats { /** From 9af660dc6585eff1ea01173d7f13ae2ece93d502 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 7 Nov 2025 12:47:53 +0300 Subject: [PATCH 18/25] decompositions for variant type --- .../result_set_format/kqp_formats_arrow.cpp | 424 +++++++++--------- .../result_set_format/kqp_formats_arrow.h | 4 + .../ut/kqp_formats_ut_helpers.cpp | 348 +++++++------- 3 files changed, 405 insertions(+), 371 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index f89adca4e86e..0dedc68b12e4 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -388,16 +388,192 @@ void AppendDataValue(arrow::ArrayBuilder* builder, N YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); } +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TDataType* dataType) { + auto slot = *dataType->GetDataSlot().Get(); + bool success = SwitchMiniKQLDataTypeToArrowType(slot, [&]() { + AppendDataValue(builder, value, slot); + return true; + }); + YQL_ENSURE(success, "Failed to append data value to arrow builder"); +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TOptionalType* optionalType) { + auto innerType = SkipTaggedType(optionalType->GetItemType()); + ui32 depth = 1; + + while (innerType->IsOptional()) { + innerType = SkipTaggedType(static_cast(innerType) ->GetItemType()); + ++depth; + } + + // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer + // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) + // Thus, the depth == 2 for Optional> type + if (NeedWrapByExternalOptional(innerType)) { + ++depth; + } + + auto innerBuilder = builder; + auto innerValue = value; + + for (ui32 i = 1; i < depth; ++i) { + YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(innerBuilder); + YQL_ENSURE(structBuilder->num_fields() == 1, "Unexpected number of fields"); + + if (!innerValue) { + auto status = innerBuilder->AppendNull(); + YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); + return; + } + + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append optional value: " << status.ToString()); + + innerValue = innerValue.GetOptionalValue(); + innerBuilder = structBuilder->field_builder(0); + } + + if (innerValue) { + NFormats::AppendElement(innerValue.GetOptionalValue(), innerBuilder, innerType); + } else { + auto status = innerBuilder->AppendNull(); + YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TListType* listType) { + auto itemType = listType->GetItemType(); + + YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); + auto listBuilder = reinterpret_cast(builder); + + auto status = listBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append list value: " << status.ToString()); + + auto innerBuilder = listBuilder->value_builder(); + if (auto item = value.GetElements()) { + auto length = value.GetListLength(); + while (length > 0) { + NFormats::AppendElement(*item++, innerBuilder, itemType); + --length; + } + } else { + const auto iter = value.GetListIterator(); + for (NUdf::TUnboxedValue item; iter.Next(item);) { + NFormats::AppendElement(item, innerBuilder, itemType); + } + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TStructType* structType) { + YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(builder); + + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append struct value: " << status.ToString()); + + YQL_ENSURE(static_cast(structBuilder->num_fields()) == structType->GetMembersCount(), "Unexpected number of fields"); + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto innerBuilder = structBuilder->field_builder(index); + auto memberType = structType->GetMemberType(index); + NFormats::AppendElement(value.GetElement(index), innerBuilder, memberType); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TTupleType* tupleType) { + YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(builder); + + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append tuple value: " << status.ToString()); + + YQL_ENSURE(static_cast(structBuilder->num_fields()) == tupleType->GetElementsCount(), "Unexpected number of fields"); + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto innerBuilder = structBuilder->field_builder(index); + auto elementType = tupleType->GetElementType(index); + NFormats::AppendElement(value.GetElement(index), innerBuilder, elementType); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TDictType* dictType) { + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + + YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); + auto listBuilder = reinterpret_cast(builder); + + auto status = listBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); + + YQL_ENSURE(listBuilder->value_builder()->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(listBuilder->value_builder()); + YQL_ENSURE(structBuilder->num_fields() == 2, "Unexpected number of fields"); + + auto keyBuilder = structBuilder->field_builder(0); + auto itemBuilder = structBuilder->field_builder(1); + + const auto iter = value.GetDictIterator(); + for (NUdf::TUnboxedValue key, payload; iter.NextPair(key, payload);) { + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); + + NFormats::AppendElement(key, keyBuilder, keyType); + NFormats::AppendElement(payload, itemBuilder, payloadType); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TVariantType* variantType) { + YQL_ENSURE(builder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); + auto unionBuilder = reinterpret_cast(builder); + + ui32 variantIndex = value.GetVariantIndex(); + NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); + + if (innerType->IsStruct()) { + innerType = static_cast(innerType)->GetMemberType(variantIndex); + } else { + YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + innerType = static_cast(innerType)->GetElementType(variantIndex); + } + + if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { + ui32 numberOfGroups = (variantType->GetAlternativesCount() - 1) / arrow::UnionType::kMaxTypeCode + 1; + YQL_ENSURE(static_cast(unionBuilder->num_children()) == numberOfGroups, "Unexpected variant number of groups"); + + ui32 groupIndex = variantIndex / arrow::UnionType::kMaxTypeCode; + auto status = unionBuilder->Append(groupIndex); + YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); + + auto innerBuilder = unionBuilder->child_builder(groupIndex); + YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); + auto innerUnionBuilder = reinterpret_cast(innerBuilder.get()); + + ui32 innerVariantIndex = variantIndex % arrow::UnionType::kMaxTypeCode; + status = innerUnionBuilder->Append(innerVariantIndex); + YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); + + auto doubleInnerBuilder = innerUnionBuilder->child_builder(innerVariantIndex); + NFormats::AppendElement(value.GetVariantItem(), doubleInnerBuilder.get(), innerType); + } else { + auto status = unionBuilder->Append(variantIndex); + YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); + + auto innerBuilder = unionBuilder->child_builder(variantIndex); + NFormats::AppendElement(value.GetVariantItem(), innerBuilder.get(), innerType); + } +} + } // namespace bool NeedWrapByExternalOptional(const NMiniKQL::TType* type) { switch (type->GetKind()) { - case NMiniKQL::TType::EKind::Void: case NMiniKQL::TType::EKind::Null: - case NMiniKQL::TType::EKind::Variant: - case NMiniKQL::TType::EKind::Optional: + case NMiniKQL::TType::EKind::Void: case NMiniKQL::TType::EKind::EmptyList: - case NMiniKQL::TType::EKind::EmptyDict: { + case NMiniKQL::TType::EKind::EmptyDict: + case NMiniKQL::TType::EKind::Optional: + case NMiniKQL::TType::EKind::Variant: { return true; } @@ -440,43 +616,35 @@ std::shared_ptr GetArrowType(const NMiniKQL::TType* type) { } case NMiniKQL::TType::EKind::Data: { - auto dataType = static_cast(type); - return GetArrowType(dataType); + return GetArrowType(static_cast(type)); } - case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(type); - return GetArrowType(structType); + case NMiniKQL::TType::EKind::Optional: { + return GetArrowType(static_cast(type)); } - case NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(type); - return GetArrowType(tupleType); + case NMiniKQL::TType::EKind::Struct: { + return GetArrowType(static_cast(type)); } - case NMiniKQL::TType::EKind::Optional: { - auto optionalType = static_cast(type); - return GetArrowType(optionalType); + case NMiniKQL::TType::EKind::Tuple: { + return GetArrowType(static_cast(type)); } case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(type); - return GetArrowType(listType); + return GetArrowType(static_cast(type)); } case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(type); - return GetArrowType(dictType); + return GetArrowType(static_cast(type)); } case NMiniKQL::TType::EKind::Variant: { - auto variantType = static_cast(type); - return GetArrowType(variantType); + return GetArrowType(static_cast(type)); } case NMiniKQL::TType::EKind::Tagged: { - auto taggedType = static_cast(type); - return GetArrowType(taggedType->GetBaseType()); + return GetArrowType(static_cast(type)->GetBaseType()); } case NMiniKQL::TType::EKind::Type: @@ -498,14 +666,19 @@ std::shared_ptr GetArrowType(const NMiniKQL::TType* type) { bool IsArrowCompatible(const NKikimr::NMiniKQL::TType* type) { switch (type->GetKind()) { - case NMiniKQL::TType::EKind::Void: case NMiniKQL::TType::EKind::Null: + case NMiniKQL::TType::EKind::Void: case NMiniKQL::TType::EKind::EmptyList: case NMiniKQL::TType::EKind::EmptyDict: case NMiniKQL::TType::EKind::Data: { return true; } + case NMiniKQL::TType::EKind::Optional: { + auto optionalType = static_cast(type); + return IsArrowCompatible(optionalType->GetItemType()); + } + case NMiniKQL::TType::EKind::Struct: { auto structType = static_cast(type); bool isCompatible = true; @@ -526,17 +699,19 @@ bool IsArrowCompatible(const NKikimr::NMiniKQL::TType* type) { return isCompatible; } - case NMiniKQL::TType::EKind::Optional: { - auto optionalType = static_cast(type); - return IsArrowCompatible(optionalType->GetItemType()); - } - case NMiniKQL::TType::EKind::List: { auto listType = static_cast(type); auto itemType = listType->GetItemType(); return IsArrowCompatible(itemType); } + case NMiniKQL::TType::EKind::Dict: { + auto dictType = static_cast(type); + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + return IsArrowCompatible(keyType) && IsArrowCompatible(payloadType); + } + case NMiniKQL::TType::EKind::Variant: { auto variantType = static_cast(type); ui32 maxTypesCount = (arrow::UnionType::kMaxTypeCode + 1) * (arrow::UnionType::kMaxTypeCode + 1); @@ -553,13 +728,6 @@ bool IsArrowCompatible(const NKikimr::NMiniKQL::TType* type) { return false; } - case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(type); - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - return IsArrowCompatible(keyType) && IsArrowCompatible(payloadType); - } - case NMiniKQL::TType::EKind::Tagged: { auto taggedType = static_cast(type); return IsArrowCompatible(taggedType->GetBaseType()); @@ -602,204 +770,42 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons } case NMiniKQL::TType::EKind::Data: { - auto dataType = static_cast(type); - auto slot = *dataType->GetDataSlot().Get(); - bool success = SwitchMiniKQLDataTypeToArrowType(slot, [&]() { - AppendDataValue(builder, value, slot); - return true; - }); - YQL_ENSURE(success, "Failed to append data value to arrow builder"); + AppendElement(value, builder, static_cast(type)); break; } case NMiniKQL::TType::EKind::Optional: { - auto innerType = SkipTaggedType(static_cast(type)->GetItemType()); - ui32 depth = 1; - - while (innerType->IsOptional()) { - innerType = SkipTaggedType(static_cast(innerType) ->GetItemType()); - ++depth; - } - - // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer - // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) - // Thus, the depth == 2 for Optional> type - if (NeedWrapByExternalOptional(innerType)) { - ++depth; - } - - auto innerBuilder = builder; - auto innerValue = value; - - for (ui32 i = 1; i < depth; ++i) { - YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto structBuilder = reinterpret_cast(innerBuilder); - YQL_ENSURE(structBuilder->num_fields() == 1, "Unexpected number of fields"); - - if (!innerValue) { - auto status = innerBuilder->AppendNull(); - YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); - return; - } - - auto status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append optional value: " << status.ToString()); - - innerValue = innerValue.GetOptionalValue(); - innerBuilder = structBuilder->field_builder(0); - } - - if (innerValue) { - AppendElement(innerValue.GetOptionalValue(), innerBuilder, innerType); - } else { - auto status = innerBuilder->AppendNull(); - YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); - } - break; - } - - case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(type); - auto itemType = listType->GetItemType(); - - YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); - auto listBuilder = reinterpret_cast(builder); - - auto status = listBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append list value: " << status.ToString()); - - auto innerBuilder = listBuilder->value_builder(); - if (auto item = value.GetElements()) { - auto length = value.GetListLength(); - while (length > 0) { - AppendElement(*item++, innerBuilder, itemType); - --length; - } - } else { - const auto iter = value.GetListIterator(); - for (NUdf::TUnboxedValue item; iter.Next(item);) { - AppendElement(item, innerBuilder, itemType); - } - } + AppendElement(value, builder, static_cast(type)); break; } case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(type); - - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto structBuilder = reinterpret_cast(builder); - - auto status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append struct value: " << status.ToString()); - - YQL_ENSURE(static_cast(structBuilder->num_fields()) == structType->GetMembersCount(), "Unexpected number of fields"); - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto innerBuilder = structBuilder->field_builder(index); - auto memberType = structType->GetMemberType(index); - AppendElement(value.GetElement(index), innerBuilder, memberType); - } + AppendElement(value, builder, static_cast(type)); break; } case NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(type); - - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto structBuilder = reinterpret_cast(builder); - - auto status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append tuple value: " << status.ToString()); + AppendElement(value, builder, static_cast(type)); + break; + } - YQL_ENSURE(static_cast(structBuilder->num_fields()) == tupleType->GetElementsCount(), "Unexpected number of fields"); - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto innerBuilder = structBuilder->field_builder(index); - auto elementType = tupleType->GetElementType(index); - AppendElement(value.GetElement(index), innerBuilder, elementType); - } + case NMiniKQL::TType::EKind::List: { + AppendElement(value, builder, static_cast(type)); break; } case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(type); - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - - arrow::ArrayBuilder* keyBuilder = nullptr; - arrow::ArrayBuilder* itemBuilder = nullptr; - arrow::StructBuilder* structBuilder = nullptr; - - YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); - arrow::ListBuilder* listBuilder = reinterpret_cast(builder); - - auto status = listBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - YQL_ENSURE(listBuilder->value_builder()->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - structBuilder = reinterpret_cast(listBuilder->value_builder()); - YQL_ENSURE(structBuilder->num_fields() == 2, "Unexpected number of fields"); - - keyBuilder = structBuilder->field_builder(0); - itemBuilder = structBuilder->field_builder(1); - - const auto iter = value.GetDictIterator(); - for (NUdf::TUnboxedValue key, payload; iter.NextPair(key, payload);) { - status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - AppendElement(key, keyBuilder, keyType); - AppendElement(payload, itemBuilder, payloadType); - } + AppendElement(value, builder, static_cast(type)); break; } case NMiniKQL::TType::EKind::Variant: { - auto variantType = static_cast(type); - - YQL_ENSURE(builder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); - auto unionBuilder = reinterpret_cast(builder); - - ui32 variantIndex = value.GetVariantIndex(); - NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - - if (innerType->IsStruct()) { - innerType = static_cast(innerType)->GetMemberType(variantIndex); - } else { - YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - innerType = static_cast(innerType)->GetElementType(variantIndex); - } - - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - ui32 numberOfGroups = (variantType->GetAlternativesCount() - 1) / arrow::UnionType::kMaxTypeCode + 1; - YQL_ENSURE(static_cast(unionBuilder->num_children()) == numberOfGroups, "Unexpected variant number of groups"); - - ui32 groupIndex = variantIndex / arrow::UnionType::kMaxTypeCode; - auto status = unionBuilder->Append(groupIndex); - YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); - - auto innerBuilder = unionBuilder->child_builder(groupIndex); - YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); - auto innerUnionBuilder = reinterpret_cast(innerBuilder.get()); - - ui32 innerVariantIndex = variantIndex % arrow::UnionType::kMaxTypeCode; - status = innerUnionBuilder->Append(innerVariantIndex); - YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); - - auto doubleInnerBuilder = innerUnionBuilder->child_builder(innerVariantIndex); - AppendElement(value.GetVariantItem(), doubleInnerBuilder.get(), innerType); - } else { - auto status = unionBuilder->Append(variantIndex); - YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); - - auto innerBuilder = unionBuilder->child_builder(variantIndex); - AppendElement(value.GetVariantItem(), innerBuilder.get(), innerType); - } + AppendElement(value, builder, static_cast(type)); break; } case NMiniKQL::TType::EKind::Tagged: { - auto taggedType = static_cast(type); - AppendElement(value, builder, taggedType->GetBaseType()); + AppendElement(value, builder, static_cast(type)->GetBaseType()); break; } diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h index 6bfd66e9bcbc..a7d9beb44ae9 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h @@ -15,6 +15,10 @@ namespace NKikimr::NKqp::NFormats { +constexpr size_t MAX_VARIANT_FLATTEN_SIZE = static_cast(arrow::UnionType::kMaxTypeCode) + 1; +constexpr size_t MAX_VARIANT_NESTED_SIZE = MAX_VARIANT_FLATTEN_SIZE * MAX_VARIANT_FLATTEN_SIZE; +constexpr size_t MAX_VARIANT_DEPTH = 2; + /** * @brief Dispatches MiniKQL data type to corresponding Arrow type via compile-time callback. * diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index 3f1910917b41..154550491f86 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -13,7 +13,7 @@ namespace NKikimr::NKqp::NFormats { namespace { template -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { Y_UNUSED(dataSlot); using TArrayType = typename arrow::TypeTraits::ArrayType; auto array = std::static_pointer_cast(column); @@ -21,21 +21,21 @@ NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 r } template <> // For darwin build -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { Y_UNUSED(dataSlot); auto array = std::static_pointer_cast(column); return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); } template <> // For darwin build -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { Y_UNUSED(dataSlot); auto array = std::static_pointer_cast(column); return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); } template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { auto array = std::static_pointer_cast(column); YQL_ENSURE(array->num_fields() == 2, "StructArray of some TzDate type should have 2 fields"); @@ -94,7 +94,7 @@ NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { Y_UNUSED(dataSlot); auto array = std::static_pointer_cast(column); auto data = array->GetView(row); @@ -102,7 +102,7 @@ NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { auto array = std::static_pointer_cast(column); auto data = array->GetView(row); @@ -141,7 +141,7 @@ NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { auto array = std::static_pointer_cast(column); auto data = array->GetView(row); @@ -163,6 +163,173 @@ NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr< return NUdf::TUnboxedValuePod(); } +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TDataType* dataType) +{ + NUdf::TUnboxedValue result; + auto dataSlot = *dataType->GetDataSlot().Get(); + bool success = SwitchMiniKQLDataTypeToArrowType(dataSlot, + [&]() { + result = ExtractDataValue(array, row, dataSlot); + return true; + }); + YQL_ENSURE(success, "Failed to extract unboxed value from arrow array"); + return result; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TOptionalType* optionalType, const NMiniKQL::THolderFactory& holderFactory) +{ + auto innerOptionalType = SkipTaggedType(optionalType->GetItemType()); + if (NeedWrapByExternalOptional(innerOptionalType)) { + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + + auto innerArray = array; + auto innerType = static_cast(optionalType); + int depth = 0; + + while (innerArray->type_id() == arrow::Type::STRUCT) { + auto structArray = static_pointer_cast(innerArray); + YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); + + if (structArray->IsNull(row)) { + NUdf::TUnboxedValue value; + for (int i = 0; i < depth; ++i) { + value = value.MakeOptional(); + } + return value; + } + + innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); + innerArray = structArray->field(0); + ++depth; + } + + NUdf::TUnboxedValue value; + if (NeedWrapByExternalOptional(innerType) || !innerArray->IsNull(row)) { + value = NFormats::ExtractUnboxedValue(innerArray, row, innerType, holderFactory); + if (NeedWrapByExternalOptional(innerType)) { + --depth; + } + } + + for (int i = 0; i < depth; ++i) { + value = value.MakeOptional(); + } + return value; + } + return NFormats::ExtractUnboxedValue(array, row, innerOptionalType, holderFactory).Release().MakeOptional(); +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TStructType* structType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + YQL_ENSURE(static_cast(typedArray->num_fields()) == structType->GetMembersCount(), "Unexpected count of fields"); + + NUdf::TUnboxedValue* itemsPtr = nullptr; + auto result = holderFactory.CreateDirectArrayHolder(structType->GetMembersCount(), itemsPtr); + + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto memberType = structType->GetMemberType(index); + itemsPtr[index] = NFormats::ExtractUnboxedValue(typedArray->field(index), row, memberType, holderFactory); + } + return result; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TTupleType* tupleType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + YQL_ENSURE(static_cast(typedArray->num_fields()) == tupleType->GetElementsCount(), "Unexpected count of fields"); + + NUdf::TUnboxedValue* itemsPtr = nullptr; + auto result = holderFactory.CreateDirectArrayHolder(tupleType->GetElementsCount(), itemsPtr); + + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto elementType = tupleType->GetElementType(index); + itemsPtr[index] = NFormats::ExtractUnboxedValue(typedArray->field(index), row, elementType, holderFactory); + } + return result; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TListType* listType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + + auto arraySlice = typedArray->value_slice(row); + auto itemType = listType->GetItemType(); + const auto len = arraySlice->length(); + + NUdf::TUnboxedValue* items = nullptr; + auto list = holderFactory.CreateDirectArrayHolder(len, items); + for (ui64 i = 0; i < static_cast(len); ++i) { + *items++ = NFormats::ExtractUnboxedValue(arraySlice, i, itemType, holderFactory); + } + return list; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TDictType* dictType, const NMiniKQL::THolderFactory& holderFactory) +{ + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + auto dictBuilder = holderFactory.NewDict(dictType, 0); + + YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); + auto listArray = static_pointer_cast(array); + YQL_ENSURE(listArray->value_type()->id() == arrow::Type::STRUCT, "Unexpected array type"); + + auto structArray = static_pointer_cast(listArray->value_slice(row)); + YQL_ENSURE(static_cast(structArray->num_fields()) == 2, "Unexpected count of fields"); + + std::shared_ptr keyArray = structArray->field(0); + std::shared_ptr payloadArray = structArray->field(1); + + for (ui64 i = 0; i < static_cast(structArray->length()); ++i) { + auto key = NFormats::ExtractUnboxedValue(keyArray, i, keyType, holderFactory); + auto payload = NFormats::ExtractUnboxedValue(payloadArray, i, payloadType, holderFactory); + dictBuilder->Add(std::move(key), std::move(payload)); + } + return dictBuilder->Build(); +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TVariantType* variantType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); + auto unionArray = static_pointer_cast(array); + + auto variantIndex = unionArray->child_id(row); + auto rowInChild = unionArray->value_offset(row); + auto valuesArray = unionArray->field(variantIndex); + + if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { + YQL_ENSURE(valuesArray->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); + auto innerUnionArray = static_pointer_cast(valuesArray); + auto innerVariantIndex = innerUnionArray->child_id(rowInChild); + + rowInChild = innerUnionArray->value_offset(rowInChild); + valuesArray = innerUnionArray->field(innerVariantIndex); + variantIndex =variantIndex * arrow::UnionType::kMaxTypeCode + innerVariantIndex; + } + + NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); + if (innerType->IsStruct()) { + innerType =static_cast(innerType)->GetMemberType(variantIndex); + } else { + YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + innerType = static_cast(innerType)->GetElementType(variantIndex); + } + + auto value = NFormats::ExtractUnboxedValue(valuesArray, rowInChild, innerType, holderFactory); + return holderFactory.CreateVariantHolder(value.Release(), variantIndex); +} + } // namespace std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type) { @@ -194,186 +361,43 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr } switch (itemType->GetKind()) { - case NMiniKQL::TType::EKind::Void: case NMiniKQL::TType::EKind::Null: + case NMiniKQL::TType::EKind::Void: case NMiniKQL::TType::EKind::EmptyList: case NMiniKQL::TType::EKind::EmptyDict: { break; } case NMiniKQL::TType::EKind::Data: { - auto dataType = static_cast(itemType); - NUdf::TUnboxedValue result; - auto dataSlot = *dataType->GetDataSlot().Get(); - bool success = SwitchMiniKQLDataTypeToArrowType(dataSlot, - [&]() { - result = GetUnboxedValue(array, row, dataSlot); - return true; - }); - YQL_ENSURE(success, "Failed to extract unboxed value from arrow array"); - return result; + return ExtractUnboxedValue(array, row, static_cast(itemType)); } - case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - YQL_ENSURE(static_cast(typedArray->num_fields()) == structType->GetMembersCount(), "Unexpected count of fields"); - - NUdf::TUnboxedValue* itemsPtr = nullptr; - auto result = holderFactory.CreateDirectArrayHolder(structType->GetMembersCount(), itemsPtr); - - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto memberType = structType->GetMemberType(index); - itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, memberType, holderFactory); - } - return result; + case NMiniKQL::TType::EKind::Optional: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); } - case NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - YQL_ENSURE(static_cast(typedArray->num_fields()) == tupleType->GetElementsCount(), "Unexpected count of fields"); - - NUdf::TUnboxedValue* itemsPtr = nullptr; - auto result = holderFactory.CreateDirectArrayHolder(tupleType->GetElementsCount(), itemsPtr); - - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementType = tupleType->GetElementType(index); - itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, elementType, holderFactory); - } - return result; + case NMiniKQL::TType::EKind::Struct: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); } - case NMiniKQL::TType::EKind::Optional: { - auto optionalType = static_cast(itemType); - auto innerOptionalType = SkipTaggedType(optionalType->GetItemType()); - - if (NeedWrapByExternalOptional(innerOptionalType)) { - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - - auto innerArray = array; - auto innerType = itemType; - int depth = 0; - - while (innerArray->type_id() == arrow::Type::STRUCT) { - auto structArray = static_pointer_cast(innerArray); - YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); - - if (structArray->IsNull(row)) { - NUdf::TUnboxedValue value; - for (int i = 0; i < depth; ++i) { - value = value.MakeOptional(); - } - return value; - } - - innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); - innerArray = structArray->field(0); - ++depth; - } - - NUdf::TUnboxedValue value; - if (NeedWrapByExternalOptional(innerType) || !innerArray->IsNull(row)) { - value = ExtractUnboxedValue(innerArray, row, innerType, holderFactory); - if (NeedWrapByExternalOptional(innerType)) { - --depth; - } - } - - for (int i = 0; i < depth; ++i) { - value = value.MakeOptional(); - } - return value; - } - - return ExtractUnboxedValue(array, row, innerOptionalType, holderFactory).Release().MakeOptional(); + case NMiniKQL::TType::EKind::Tuple: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); } case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - - auto arraySlice = typedArray->value_slice(row); - auto itemType = listType->GetItemType(); - const auto len = arraySlice->length(); - - NUdf::TUnboxedValue* items = nullptr; - auto list = holderFactory.CreateDirectArrayHolder(len, items); - for (ui64 i = 0; i < static_cast(len); ++i) { - *items++ = ExtractUnboxedValue(arraySlice, i, itemType, holderFactory); - } - return list; + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); } case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(itemType); - - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - auto dictBuilder = holderFactory.NewDict(dictType, 0); - - YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); - auto listArray = static_pointer_cast(array); - YQL_ENSURE(listArray->value_type()->id() == arrow::Type::STRUCT, "Unexpected array type"); - - auto structArray = static_pointer_cast(listArray->value_slice(row)); - YQL_ENSURE(static_cast(structArray->num_fields()) == 2, "Unexpected count of fields"); - - std::shared_ptr keyArray = structArray->field(0); - std::shared_ptr payloadArray = structArray->field(1); - - for (ui64 i = 0; i < static_cast(structArray->length()); ++i) { - auto key = ExtractUnboxedValue(keyArray, i, keyType, holderFactory); - auto payload = ExtractUnboxedValue(payloadArray, i, payloadType, holderFactory); - dictBuilder->Add(std::move(key), std::move(payload)); - } - return dictBuilder->Build(); + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); } case NMiniKQL::TType::EKind::Variant: { - // TODO Need to properly convert variants containing more than 127*127 - // types? - auto variantType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); - auto unionArray = static_pointer_cast(array); - - auto variantIndex = unionArray->child_id(row); - auto rowInChild = unionArray->value_offset(row); - std::shared_ptr valuesArray = unionArray->field(variantIndex); - - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - // Go one step deeper - YQL_ENSURE(valuesArray->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); - auto innerUnionArray = static_pointer_cast(valuesArray); - auto innerVariantIndex = innerUnionArray->child_id(rowInChild); - - rowInChild = innerUnionArray->value_offset(rowInChild); - valuesArray = innerUnionArray->field(innerVariantIndex); - variantIndex =variantIndex * arrow::UnionType::kMaxTypeCode + innerVariantIndex; - } - - NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - if (innerType->IsStruct()) { - innerType =static_cast(innerType)->GetMemberType(variantIndex); - } else { - YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - innerType = static_cast(innerType)->GetElementType(variantIndex); - } - - NUdf::TUnboxedValue value = ExtractUnboxedValue(valuesArray, rowInChild, innerType, holderFactory); - return holderFactory.CreateVariantHolder(value.Release(), variantIndex); + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); } case NMiniKQL::TType::EKind::Tagged: { - auto taggedType = static_cast(itemType); - return ExtractUnboxedValue(array, row, taggedType->GetBaseType(), holderFactory); + return ExtractUnboxedValue(array, row, static_cast(itemType)->GetBaseType(), holderFactory); } case NMiniKQL::TType::EKind::Type: From a1145137a9c33cf85570c0ec6f91ae2e20bb8527 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 7 Nov 2025 19:01:34 +0300 Subject: [PATCH 19/25] Support Variant --- .../result_set_format/kqp_formats_arrow.cpp | 80 +- .../ut/kqp_formats_arrow_ut.cpp | 899 ++++-------------- .../ut/kqp_formats_ut_helpers.cpp | 8 +- 3 files changed, 218 insertions(+), 769 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp index 0dedc68b12e4..551306b2222f 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -53,7 +53,7 @@ std::shared_ptr BuildArrowType(NUdf::EDataSl return std::make_shared(); } - std::vector> fields{ + arrow::FieldVector fields{ std::make_shared("datetime", type, false), std::make_shared("timezone", arrow::utf8(), false), }; @@ -74,7 +74,7 @@ std::shared_ptr GetArrowType(const NMiniKQL::TDataType* dataTyp } std::shared_ptr GetArrowType(const NMiniKQL::TStructType* structType) { - std::vector> fields; + arrow::FieldVector fields; fields.reserve(structType->GetMembersCount()); for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { auto memberType = structType->GetMemberType(index); @@ -87,7 +87,7 @@ std::shared_ptr GetArrowType(const NMiniKQL::TStructType* struc } std::shared_ptr GetArrowType(const NMiniKQL::TTupleType* tupleType) { - std::vector> fields; + arrow::FieldVector fields; fields.reserve(tupleType->GetElementsCount()); for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { auto elementName = "field" + std::to_string(index); @@ -119,7 +119,6 @@ std::shared_ptr GetArrowType(const NMiniKQL::TDictType* dictTyp std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* variantType) { NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - arrow::FieldVector types; NMiniKQL::TStructType* structType = nullptr; NMiniKQL::TTupleType* tupleType = nullptr; @@ -130,48 +129,50 @@ std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* vari tupleType = static_cast(innerType); } - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - ui32 numberOfGroups = (variantType->GetAlternativesCount() - 1) / arrow::UnionType::kMaxTypeCode + 1; - types.reserve(numberOfGroups); + YQL_ENSURE(variantType->GetAlternativesCount() <= MAX_VARIANT_NESTED_SIZE, "Variant type has more than " << MAX_VARIANT_NESTED_SIZE << " alternatives"); - for (ui32 groupIndex = 0; groupIndex < numberOfGroups; ++groupIndex) { - ui32 beginIndex = groupIndex * arrow::UnionType::kMaxTypeCode; - ui32 endIndex = std::min((groupIndex + 1) * arrow::UnionType::kMaxTypeCode, variantType->GetAlternativesCount()); + arrow::FieldVector fields; + if (variantType->GetAlternativesCount() > MAX_VARIANT_FLATTEN_SIZE) { + ui32 numberOfGroups = ((variantType->GetAlternativesCount() - 1) / MAX_VARIANT_FLATTEN_SIZE) + 1; + fields.reserve(numberOfGroups); - arrow::FieldVector groupTypes; - groupTypes.reserve(endIndex - beginIndex); + for (ui32 group = 0; group < numberOfGroups; ++group) { + ui32 beginIndex = group * MAX_VARIANT_FLATTEN_SIZE; + ui32 endIndex = std::min((group + 1) * MAX_VARIANT_FLATTEN_SIZE, variantType->GetAlternativesCount()); - for (ui32 index = beginIndex; index < endIndex; ++index) { - auto itemName = (structType == nullptr) - ? std::string("field" + ToString(index)) - : std::string(structType->GetMemberName(index)); - auto itemType = (structType == nullptr) - ? tupleType->GetElementType(index) - : structType->GetMemberType(index); + arrow::FieldVector groupFields; + groupFields.reserve(endIndex - beginIndex); + + for (ui32 i = beginIndex; i < endIndex; ++i) { + auto itemName = (structType == nullptr) ? std::string("field" + ToString(i)) : std::string(structType->GetMemberName(i)); + auto itemType = (structType == nullptr) ? tupleType->GetElementType(i) : structType->GetMemberType(i); auto itemArrowType = NFormats::GetArrowType(itemType); - groupTypes.emplace_back(std::make_shared( itemName, itemArrowType, itemType->IsOptional())); + groupFields.emplace_back(std::make_shared( itemName, itemArrowType, itemType->IsOptional())); } - auto fieldName = std::string("field" + ToString(groupIndex)); - types.emplace_back(std::make_shared(fieldName, arrow::dense_union(groupTypes), false)); + std::vector typeCodes(groupFields.size()); + std::iota(typeCodes.begin(), typeCodes.end(), 0); + + auto fieldName = std::string("field" + ToString(group)); + fields.emplace_back(std::make_shared(fieldName, arrow::dense_union(groupFields, typeCodes), false)); } - return arrow::dense_union(types); + return arrow::dense_union(fields); } - types.reserve(variantType->GetAlternativesCount()); + fields.reserve(variantType->GetAlternativesCount()); for (ui32 index = 0; index < variantType->GetAlternativesCount(); ++index) { - auto itemName = (structType == nullptr) - ? std::string("field" + ToString(index)) - : std::string(structType->GetMemberName(index)); + auto itemName = (structType == nullptr) ? std::string("field" + ToString(index)) : std::string(structType->GetMemberName(index)); auto itemType = (structType == nullptr) ? tupleType->GetElementType(index) : structType->GetMemberType(index); auto itemArrowType = NFormats::GetArrowType(itemType); - types.emplace_back(std::make_shared(itemName, itemArrowType, itemType->IsOptional())); + fields.emplace_back(std::make_shared(itemName, itemArrowType, itemType->IsOptional())); } - return arrow::dense_union(types); + std::vector typeCodes(fields.size()); + std::iota(typeCodes.begin(), typeCodes.end(), 0); + return arrow::dense_union(fields, typeCodes); } std::shared_ptr GetArrowType(const NMiniKQL::TOptionalType* optionalType) { @@ -537,11 +538,13 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons innerType = static_cast(innerType)->GetElementType(variantIndex); } - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - ui32 numberOfGroups = (variantType->GetAlternativesCount() - 1) / arrow::UnionType::kMaxTypeCode + 1; + YQL_ENSURE(variantType->GetAlternativesCount() <= MAX_VARIANT_NESTED_SIZE, "Variant type has more than " << MAX_VARIANT_NESTED_SIZE << " alternatives"); + + if (variantType->GetAlternativesCount() > MAX_VARIANT_FLATTEN_SIZE) { + ui32 numberOfGroups = ((variantType->GetAlternativesCount() - 1) / MAX_VARIANT_FLATTEN_SIZE) + 1; YQL_ENSURE(static_cast(unionBuilder->num_children()) == numberOfGroups, "Unexpected variant number of groups"); - ui32 groupIndex = variantIndex / arrow::UnionType::kMaxTypeCode; + ui32 groupIndex = variantIndex / MAX_VARIANT_FLATTEN_SIZE; auto status = unionBuilder->Append(groupIndex); YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); @@ -549,7 +552,7 @@ void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, cons YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); auto innerUnionBuilder = reinterpret_cast(innerBuilder.get()); - ui32 innerVariantIndex = variantIndex % arrow::UnionType::kMaxTypeCode; + ui32 innerVariantIndex = variantIndex % MAX_VARIANT_FLATTEN_SIZE; status = innerUnionBuilder->Append(innerVariantIndex); YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); @@ -604,6 +607,7 @@ bool NeedWrapByExternalOptional(const NMiniKQL::TType* type) { } std::shared_ptr GetArrowType(const NMiniKQL::TType* type) { + YQL_ENSURE(IsArrowCompatible(type)); switch (type->GetKind()) { case NMiniKQL::TType::EKind::Null: { return arrow::null(); @@ -714,18 +718,12 @@ bool IsArrowCompatible(const NKikimr::NMiniKQL::TType* type) { case NMiniKQL::TType::EKind::Variant: { auto variantType = static_cast(type); - ui32 maxTypesCount = (arrow::UnionType::kMaxTypeCode + 1) * (arrow::UnionType::kMaxTypeCode + 1); - if (variantType->GetAlternativesCount() > maxTypesCount) { + if (variantType->GetAlternativesCount() > MAX_VARIANT_NESTED_SIZE) { return false; } NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - if (innerType->IsStruct() || innerType->IsTuple()) { - return IsArrowCompatible(innerType); - } - - YQL_ENSURE(false, "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - return false; + return (innerType->IsStruct() || innerType->IsTuple()) && IsArrowCompatible(innerType); } case NMiniKQL::TType::EKind::Tagged: { diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index e604b9c7f013..9f6f33dffd0e 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -24,17 +24,19 @@ inline static constexpr size_t TEST_ARRAY_DATATYPE_SIZE = 1 << 16; inline static constexpr size_t TEST_ARRAY_NESTED_SIZE = 1 << 8; inline static constexpr ui8 DECIMAL_PRECISION = 35; inline static constexpr ui8 DECIMAL_SCALE = 10; +inline static constexpr ui32 VARIANT_NESTED_SIZE = 260; +inline static constexpr ui32 VARIANT_OVER_LIMIT_SIZE = NKikimr::NKqp::NFormats::MAX_VARIANT_NESTED_SIZE + 1; static_assert(DECIMAL_PRECISION >= DECIMAL_SCALE, "Decimal precision must be greater than or equal to scale"); +static_assert(VARIANT_NESTED_SIZE <= NKikimr::NKqp::NFormats::MAX_VARIANT_NESTED_SIZE, "VARIANT_NESTED_SIZE must be less than or equal to MAX_VARIANT_NESTED_SIZE"); +static_assert(VARIANT_OVER_LIMIT_SIZE > NKikimr::NKqp::NFormats::MAX_VARIANT_NESTED_SIZE, "VARIANT_OVER_LIMIT_SIZE must be greater than MAX_VARIANT_NESTED_SIZE"); namespace { ui16 GetTimezoneIdSkipEmpty(ui16 index) { - auto size = NTi::GetTimezones().size(); - while (NTi::GetTimezones()[index % size].empty()) { - index = (index + 1) % size; - } - return GetTimezoneId(NTi::GetTimezones()[index % size]); + const auto& timezones = NTi::GetTimezones(); + auto name = timezones[index % timezones.size()]; + return GetTimezoneId(name.empty() ? "Europe/Moscow" : name); } std::string SerializeToBinaryJson(const TStringBuf json) { @@ -167,7 +169,6 @@ struct TTestContext { TMemoryUsageInfo MemInfo; THolderFactory HolderFactory; TDefaultValueBuilder Vb; - ui16 VariantSize = 0; TVector BasicTypes = { TDataType::Create(NUdf::TDataType::Id, TypeEnv), @@ -396,128 +397,50 @@ struct TTestContext { TType* GetVariantOverStructType() { TStructMember members[4] = { - {"0_yson", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"1_json-document", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"2_uuid", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"3_float", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} + {"0_i32", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"1_string", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"2_float", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"3_bool", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} }; auto structType = TStructType::Create(4, members, TypeEnv); return TVariantType::Create(structType, TypeEnv); } - TUnboxedValueVector CreateVariantOverStruct(ui32 quantity) { + TUnboxedValueVector CreateVariantsOverStruct(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { auto typeIndex = value % 4; NUdf::TUnboxedValue item; if (typeIndex == 0) { - std::string data = TStringBuilder() << "{value=" << value << "}"; - item = MakeString(NUdf::TStringRef(data.data(), data.size())); + item = NUdf::TUnboxedValuePod(static_cast(value)); } else if (typeIndex == 1) { - std::string data = TStringBuilder() << "{\"value\":" << value << "}"; - item = MakeString(SerializeToBinaryJson(data)); + item = MakeString(TStringBuilder() << "value=" << value); } else if (typeIndex == 2) { - std::string sample = "7856341212905634789012345678901"; - std::string data = TStringBuilder() << HexDecode(sample + static_cast('0' + (value % 10))); - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 3) { item = NUdf::TUnboxedValuePod(static_cast(value) / 4); - } - auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); - values.push_back(std::move(wrapped)); - } - return values; - } - - TType* GetOptionalVariantOverStructType() { - return TOptionalType::Create(GetVariantOverStructType(), TypeEnv); - } - - TUnboxedValueVector CreateOptionalVariantOverStruct(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 4; - NUdf::TUnboxedValue item; - - if (value % 2 == 0) { - values.push_back(NUdf::TUnboxedValuePod()); - continue; - } - - if (typeIndex == 0) { - std::string data = TStringBuilder() << "{value=" << value << "}"; - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 1) { - std::string data = TStringBuilder() << "{\"value\":" << value << "}"; - item = MakeString(SerializeToBinaryJson(data)); - } else if (typeIndex == 2) { - std::string sample = "7856341212905634789012345678901"; - std::string data = TStringBuilder() << HexDecode(sample + static_cast('0' + (value % 10))); - item = MakeString(NUdf::TStringRef(data.data(), data.size())); } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(value) / 4); + item = NUdf::TUnboxedValuePod(value % 2 == 0); } - auto wrapped = Vb.NewVariant(typeIndex, std::move(item)).MakeOptional(); + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); values.push_back(std::move(wrapped)); } return values; } - TType* GetDoubleOptionalVariantOverStructType() { - return TOptionalType::Create(GetOptionalVariantOverStructType(), TypeEnv); - } - - TUnboxedValueVector CreateDoubleOptionalVariantOverStruct(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 4; - NUdf::TUnboxedValue item; - - if (value % 3 == 0) { - if (typeIndex == 0) { - std::string data = TStringBuilder() << "{value=" << value << "}"; - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 1) { - std::string data = TStringBuilder() << "{\"value\":" << value << "}"; - item = MakeString(SerializeToBinaryJson(data)); - } else if (typeIndex == 2) { - std::string sample = "7856341212905634789012345678901"; - std::string data = TStringBuilder() << HexDecode(sample + static_cast('0' + (value % 10))); - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(value) / 4); - } - - item = Vb.NewVariant(typeIndex, std::move(item)).MakeOptional(); - } else { - item = NUdf::TUnboxedValuePod(); - } - - if (value % 3 != 2) { - item = item.MakeOptional(); - } - - values.push_back(std::move(item)); - } - return values; - } - - TType* GetVariantOverTupleWithOptionalsType() { - TType* members[5] = { + TType* GetVariantOverTupleType() { + TType* members[4] = { TDataType::Create(NUdf::TDataType::Id, TypeEnv), TDataType::Create(NUdf::TDataType::Id, TypeEnv), TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv) + TDataType::Create(NUdf::TDataType::Id, TypeEnv) }; - auto tupleType = TTupleType::Create(5, members, TypeEnv); + auto tupleType = TTupleType::Create(4, members, TypeEnv); return TVariantType::Create(tupleType, TypeEnv); } - TUnboxedValueVector CreateVariantOverTupleWithOptionals(ui32 quantity) { + TUnboxedValueVector CreateVariantsOverTuple(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 5; + auto typeIndex = value % 4; NUdf::TUnboxedValue item; if (typeIndex == 0) { item = NUdf::TUnboxedValuePod(value % 3 == 0); @@ -527,12 +450,6 @@ struct TTestContext { item = NUdf::TUnboxedValuePod(static_cast(value)); } else if (typeIndex == 3) { item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 4) { - NUdf::TUnboxedValue innerItem; - innerItem = value % 2 == 0 - ? NUdf::TUnboxedValuePod(static_cast(value)) - : NUdf::TUnboxedValuePod(); - item = innerItem.MakeOptional(); } auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); values.emplace_back(std::move(wrapped)); @@ -540,79 +457,48 @@ struct TTestContext { return values; } - TType* GetOptionalVariantOverTupleWithOptionalsType() { - return TOptionalType::Create(GetVariantOverTupleWithOptionalsType(), TypeEnv); + TType* GetVariantNestedType() { + TVector members(VARIANT_NESTED_SIZE, nullptr); + for (ui32 i = 0; i < VARIANT_NESTED_SIZE; ++i) { + if (i % 3 == 0) { + members[i] = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + } else if (i % 3 == 1) { + members[i] = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + } else { + members[i] = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + } + } + auto tupleType = TTupleType::Create(VARIANT_NESTED_SIZE, members.data(), TypeEnv); + return TVariantType::Create(tupleType, TypeEnv); } - TUnboxedValueVector CreateOptionalVariantOverTupleWithOptionals(ui32 quantity) { + TUnboxedValueVector CreateVariantsNested(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { - - if (value % 2 == 0) { - values.push_back(NUdf::TUnboxedValuePod()); - continue; - } - - auto typeIndex = value % 5; - NUdf::TUnboxedValue item; - if (typeIndex == 0) { - item = NUdf::TUnboxedValuePod(value % 3 == 0); - } else if (typeIndex == 1) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 2) { - item = NUdf::TUnboxedValuePod(static_cast(value)); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 4) { - NUdf::TUnboxedValue innerItem; - innerItem = value % 2 == 0 - ? NUdf::TUnboxedValuePod(static_cast(value)) - : NUdf::TUnboxedValuePod(); - item = innerItem.MakeOptional(); - } - auto wrapped = Vb.NewVariant(typeIndex, std::move(item)).MakeOptional(); + auto typeIndex = value % VARIANT_NESTED_SIZE; + NUdf::TUnboxedValue item = NUdf::TUnboxedValuePod(static_cast(value)); + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); values.emplace_back(std::move(wrapped)); } return values; } - TType* GetDoubleOptionalVariantOverTupleWithOptionalsType() { - return TOptionalType::Create(GetOptionalVariantOverTupleWithOptionalsType(), TypeEnv); + TType* GetVariantOverLimitType() { + TVector members(VARIANT_OVER_LIMIT_SIZE, nullptr); + for (ui32 i = 0; i < VARIANT_OVER_LIMIT_SIZE; ++i) { + members[i] = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + } + auto tupleType = TTupleType::Create(VARIANT_OVER_LIMIT_SIZE, members.data(), TypeEnv); + return TVariantType::Create(tupleType, TypeEnv); } - TUnboxedValueVector CreateDoubleOptionalVariantOverTupleWithOptionals(ui32 quantity) { + TUnboxedValueVector CreateVariantsOverLimit(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 5; - NUdf::TUnboxedValue item; - - if (value % 3 == 0) { - if (typeIndex == 0) { - item = NUdf::TUnboxedValuePod(value % 3 == 0); - } else if (typeIndex == 1) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 2) { - item = NUdf::TUnboxedValuePod(static_cast(value)); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 4) { - NUdf::TUnboxedValue innerItem; - innerItem = value % 2 == 0 - ? NUdf::TUnboxedValuePod(static_cast(value)) - : NUdf::TUnboxedValuePod(); - item = innerItem.MakeOptional(); - } - - item = Vb.NewVariant(typeIndex, std::move(item)); - } else { - item = NUdf::TUnboxedValuePod(); - } - - if (value % 3 != 2) { - item = item.MakeOptional(); - } - - values.emplace_back(std::move(item)); + auto typeIndex = value % VARIANT_OVER_LIMIT_SIZE; + NUdf::TUnboxedValue item = NUdf::TUnboxedValuePod(static_cast(value)); + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + values.emplace_back(std::move(wrapped)); } return values; } @@ -671,32 +557,6 @@ struct TTestContext { } return values; } - - TType* GetLargeVariantType(const ui16 variantSize) { - VariantSize = variantSize; - TVector tupleTypes; - tupleTypes.reserve(variantSize); - for (ui64 index = 0; index < variantSize; ++index) { - tupleTypes.push_back(TTupleType::Create(BasicTypes.size(), BasicTypes.data(), TypeEnv)); - } - auto tupleOfTuplesType = TTupleType::Create(variantSize, tupleTypes.data(), TypeEnv); - return TVariantType::Create(tupleOfTuplesType, TypeEnv); - } - - TUnboxedValueVector CreateLargeVariant(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 index = 0; index < quantity; ++index) { - NUdf::TUnboxedValue item; - auto typeIndex = index % VariantSize; - TUnboxedValueVector tupleItems; - for (ui64 i = 0; i < BasicTypes.size(); ++i) { - tupleItems.push_back(GetValueOfBasicType(BasicTypes[i], i + typeIndex)); - } - auto wrapped = Vb.NewVariant(typeIndex, HolderFactory.VectorAsArray(tupleItems)); - values.emplace_back(std::move(wrapped)); - } - return values; - } }; void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& right, TType* type) { @@ -821,22 +681,22 @@ void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& break; } - // case TType::EKind::Variant: { - // auto variantType = static_cast(type); - // UNIT_ASSERT_EQUAL(left.GetVariantIndex(), right.GetVariantIndex()); - // ui32 variantIndex = left.GetVariantIndex(); - // TType* innerType = variantType->GetUnderlyingType(); - // if (innerType->IsStruct()) { - // innerType = static_cast(innerType)->GetMemberType(variantIndex); - // } else { - // UNIT_ASSERT_C(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - // innerType = static_cast(innerType)->GetElementType(variantIndex); - // } - // NUdf::TUnboxedValue leftValue = left.GetVariantItem(); - // NUdf::TUnboxedValue rightValue = right.GetVariantItem(); - // AssertUnboxedValuesAreEqual(leftValue, rightValue, innerType); - // break; - // } + case TType::EKind::Variant: { + auto variantType = static_cast(type); + UNIT_ASSERT_EQUAL(left.GetVariantIndex(), right.GetVariantIndex()); + ui32 variantIndex = left.GetVariantIndex(); + TType* innerType = variantType->GetUnderlyingType(); + if (innerType->IsStruct()) { + innerType = static_cast(innerType)->GetMemberType(variantIndex); + } else { + UNIT_ASSERT_C(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + innerType = static_cast(innerType)->GetElementType(variantIndex); + } + NUdf::TUnboxedValue leftValue = left.GetVariantItem(); + NUdf::TUnboxedValue rightValue = right.GetVariantItem(); + AssertUnboxedValuesAreEqual(leftValue, rightValue, innerType); + break; + } case TType::EKind::Tagged: { auto taggedType = static_cast(type); @@ -1309,6 +1169,113 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } + Y_UNIT_TEST(NestedType_Variant_Struct) { + TTestContext context; + + auto variantType = context.GetVariantOverStructType(); + auto values = context.CreateVariantsOverStruct(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FLOAT); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Variant_Tuple) { + TTestContext context; + + auto variantType = context.GetVariantOverTupleType(); + auto values = context.CreateVariantsOverTuple(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Variant_Nested) { + TTestContext context; + + auto variantType = context.GetVariantNestedType(); + auto values = context.CreateVariantsNested(TEST_ARRAY_NESTED_SIZE * 3); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), ((VARIANT_NESTED_SIZE - 1) / MAX_VARIANT_FLATTEN_SIZE) + 1); + + for (ui32 i = 0; i < static_cast(unionArray->num_fields()); ++i) { + UNIT_ASSERT(unionArray->field(i)->type_id() == arrow::Type::DENSE_UNION); + auto innerUnionArray = static_pointer_cast(unionArray->field(i)); + + auto remainingSize = static_cast(variantType)->GetAlternativesCount() - i * MAX_VARIANT_FLATTEN_SIZE; + UNIT_ASSERT_VALUES_EQUAL(innerUnionArray->num_fields(), std::min(MAX_VARIANT_FLATTEN_SIZE, remainingSize)); + + for (ui32 j = 0; j < static_cast(innerUnionArray->num_fields()); ++j) { + auto idx = j + i * MAX_VARIANT_FLATTEN_SIZE; + if (idx % 3 == 0) { + UNIT_ASSERT(innerUnionArray->field(j)->type_id() == arrow::Type::INT32); + } else if (idx % 3 == 1) { + UNIT_ASSERT(innerUnionArray->field(j)->type_id() == arrow::Type::INT64); + } else { + UNIT_ASSERT(innerUnionArray->field(j)->type_id() == arrow::Type::UINT32); + } + } + } + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Variant_OverLimit) { + TTestContext context; + + auto variantType = context.GetVariantOverLimitType(); + auto values = context.CreateVariantsOverLimit(TEST_ARRAY_NESTED_SIZE * 3); + + UNIT_ASSERT(!IsArrowCompatible(variantType)); + + try { + Y_UNUSED(MakeArrowArray(values, variantType)); + UNIT_FAIL("Expected exception"); + } catch (...) {} + } + Y_UNIT_TEST(NestedType_Tagged) { TTestContext context; @@ -1371,396 +1338,6 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { ++index; } } - - // Y_UNIT_TEST(VariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetVariantOverStructType(); - // UNIT_ASSERT(IsArrowCompatible(variantType)); - - // auto values = context.CreateVariantOverStruct(100); - // auto array = MakeArrowArray(values, variantType); - // UNIT_ASSERT(array->ValidateFull().ok()); - // UNIT_ASSERT(static_cast(array->length()) == values.size()); - // UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); - // auto unionArray = static_pointer_cast(array); - - // UNIT_ASSERT(unionArray->num_fields() == 4); - // UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::BINARY); - // UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::STRING); - // UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FIXED_SIZE_BINARY); - // UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::FLOAT); - - // auto ysonArray = static_pointer_cast(unionArray->field(0)); - // auto jsonDocArray = static_pointer_cast(unionArray->field(1)); - // auto uuidArray = static_pointer_cast(unionArray->field(2)); - // auto floatArray = static_pointer_cast(unionArray->field(3)); - - // for (ui64 index = 0; index < values.size(); ++index) { - // auto value = values[index]; - // UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - // auto fieldIndex = unionArray->value_offset(index); - // if (value.GetVariantIndex() == 3) { - // auto valueArrow = floatArray->Value(fieldIndex); - // auto valueInner = value.GetVariantItem().Get(); - // UNIT_ASSERT(valueArrow == valueInner); - // } else { - // arrow::util::string_view viewArrow; - // if (value.GetVariantIndex() == 0) { - // viewArrow = ysonArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 1) { - // viewArrow = jsonDocArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 2) { - // viewArrow = uuidArray->GetView(fieldIndex); - // } - // std::string valueArrow(viewArrow.data(), viewArrow.size()); - // auto innerItem = value.GetVariantItem(); - // auto refInner = innerItem.AsStringRef(); - // std::string valueInner(refInner.Data(), refInner.Size()); - // UNIT_ASSERT(valueArrow == valueInner); - // } - // } - // } - - // Y_UNIT_TEST(OptionalVariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetOptionalVariantOverStructType(); - // UNIT_ASSERT(IsArrowCompatible(variantType)); - - // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = MakeArrowArray(values, variantType); - // UNIT_ASSERT(array->ValidateFull().ok()); - // UNIT_ASSERT(static_cast(array->length()) == values.size()); - // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - // auto structArray = static_pointer_cast(array); - // UNIT_ASSERT(structArray->num_fields() == 1); - // UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - // auto unionArray = static_pointer_cast(structArray->field(0)); - - // UNIT_ASSERT(unionArray->num_fields() == 4); - // UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::BINARY); - // UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::STRING); - // UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FIXED_SIZE_BINARY); - // UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::FLOAT); - - // auto ysonArray = static_pointer_cast(unionArray->field(0)); - // auto jsonDocArray = static_pointer_cast(unionArray->field(1)); - // auto uuidArray = static_pointer_cast(unionArray->field(2)); - // auto floatArray = static_pointer_cast(unionArray->field(3)); - - // for (ui64 index = 0; index < values.size(); ++index) { - // auto value = values[index]; - // if (!value.HasValue()) { - // // NULL - // UNIT_ASSERT(structArray->IsNull(index)); - // continue; - // } - - // UNIT_ASSERT(!structArray->IsNull(index)); - - // UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - // auto fieldIndex = unionArray->value_offset(index); - // if (value.GetVariantIndex() == 3) { - // auto valueArrow = floatArray->Value(fieldIndex); - // auto valueInner = value.GetVariantItem().Get(); - // UNIT_ASSERT(valueArrow == valueInner); - // } else { - // arrow::util::string_view viewArrow; - // if (value.GetVariantIndex() == 0) { - // viewArrow = ysonArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 1) { - // viewArrow = jsonDocArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 2) { - // viewArrow = uuidArray->GetView(fieldIndex); - // } - // std::string valueArrow(viewArrow.data(), viewArrow.size()); - // auto innerItem = value.GetVariantItem(); - // auto refInner = innerItem.AsStringRef(); - // std::string valueInner(refInner.Data(), refInner.Size()); - // UNIT_ASSERT(valueArrow == valueInner); - // } - // } - // } - - // Y_UNIT_TEST(DoubleOptionalVariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetDoubleOptionalVariantOverStructType(); - // UNIT_ASSERT(IsArrowCompatible(variantType)); - - // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = MakeArrowArray(values, variantType); - // UNIT_ASSERT(array->ValidateFull().ok()); - // UNIT_ASSERT(static_cast(array->length()) == values.size()); - // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - // auto firstStructArray = static_pointer_cast(array); - // UNIT_ASSERT(firstStructArray->num_fields() == 1); - // UNIT_ASSERT(firstStructArray->field(0)->type_id() == arrow::Type::STRUCT); - - // auto secondStructArray = static_pointer_cast(firstStructArray->field(0)); - // UNIT_ASSERT(secondStructArray->num_fields() == 1); - // UNIT_ASSERT(secondStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - // auto unionArray = static_pointer_cast(secondStructArray->field(0)); - - // UNIT_ASSERT(unionArray->num_fields() == 4); - // UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::BINARY); - // UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::STRING); - // UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FIXED_SIZE_BINARY); - // UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::FLOAT); - - // auto ysonArray = static_pointer_cast(unionArray->field(0)); - // auto jsonDocArray = static_pointer_cast(unionArray->field(1)); - // auto uuidArray = static_pointer_cast(unionArray->field(2)); - // auto floatArray = static_pointer_cast(unionArray->field(3)); - - // for (ui64 index = 0; index < values.size(); ++index) { - // auto value = values[index]; - // if (!value.HasValue()) { - // if (value) { - // // Optional(NULL) - // UNIT_ASSERT(secondStructArray->IsNull(index)); - // } else { - // // NULL - // UNIT_ASSERT(firstStructArray->IsNull(index)); - // } - // continue; - // } - - // UNIT_ASSERT(!firstStructArray->IsNull(index) && !secondStructArray->IsNull(index)); - - // UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - // auto fieldIndex = unionArray->value_offset(index); - // if (value.GetVariantIndex() == 3) { - // auto valueArrow = floatArray->Value(fieldIndex); - // auto valueInner = value.GetVariantItem().Get(); - // UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - // } else { - // arrow::util::string_view viewArrow; - // if (value.GetVariantIndex() == 0) { - // viewArrow = ysonArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 1) { - // viewArrow = jsonDocArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 2) { - // viewArrow = uuidArray->GetView(fieldIndex); - // } - // std::string valueArrow(viewArrow.data(), viewArrow.size()); - // auto innerItem = value.GetVariantItem(); - // auto refInner = innerItem.AsStringRef(); - // std::string valueInner(refInner.Data(), refInner.Size()); - // UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - // } - // } - // } - - Y_UNIT_TEST(VariantOverTupleWithOptionals) { - TTestContext context; - - auto variantType = context.GetVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(IsArrowCompatible(variantType)); - - auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = MakeArrowArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); - auto unionArray = static_pointer_cast(array); - - UNIT_ASSERT(unionArray->num_fields() == 5); - UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); - UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); - UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::UINT32); - auto boolArray = static_pointer_cast(unionArray->field(0)); - auto i16Array = static_pointer_cast(unionArray->field(1)); - auto ui16Array = static_pointer_cast(unionArray->field(2)); - auto i32Array = static_pointer_cast(unionArray->field(3)); - auto ui32Array = static_pointer_cast(unionArray->field(4)); - for (ui64 index = 0; index < values.size(); ++index) { - auto value = values[index]; - UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - auto fieldIndex = unionArray->value_offset(index); - if (value.GetVariantIndex() == 0) { - bool valueArrow = boolArray->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 1) { - auto valueArrow = i16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 2) { - auto valueArrow = ui16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 3) { - auto valueArrow = i32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 4) { - if (!value.GetVariantItem().HasValue()) { - UNIT_ASSERT(ui32Array->IsNull(fieldIndex)); - } else { - auto valueArrow = ui32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } - } - } - } - - Y_UNIT_TEST(OptionalVariantOverTupleWithOptionals) { - // DenseUnionArray does not support NULL values, so we wrap it in a StructArray - - TTestContext context; - - auto variantType = context.GetOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(IsArrowCompatible(variantType)); - - auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = MakeArrowArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 1); - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - auto unionArray = static_pointer_cast(structArray->field(0)); - UNIT_ASSERT(unionArray->num_fields() == 5); - UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); - UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); - UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::UINT32); - auto boolArray = static_pointer_cast(unionArray->field(0)); - auto i16Array = static_pointer_cast(unionArray->field(1)); - auto ui16Array = static_pointer_cast(unionArray->field(2)); - auto i32Array = static_pointer_cast(unionArray->field(3)); - auto ui32Array = static_pointer_cast(unionArray->field(4)); - for (ui64 index = 0; index < values.size(); ++index) { - auto value = values[index]; - if (!value) { - // NULL - UNIT_ASSERT(structArray->IsNull(index)); - continue; - } - - UNIT_ASSERT(!structArray->IsNull(index)); - - UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - auto fieldIndex = unionArray->value_offset(index); - if (value.GetVariantIndex() == 0) { - bool valueArrow = boolArray->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 1) { - auto valueArrow = i16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 2) { - auto valueArrow = ui16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 3) { - auto valueArrow = i32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 4) { - if (!value.GetVariantItem().HasValue()) { - UNIT_ASSERT(ui32Array->IsNull(fieldIndex)); - } else { - auto valueArrow = ui32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } - } - } - } - - Y_UNIT_TEST(DoubleOptionalVariantOverTupleWithOptionals) { - // DenseUnionArray does not support NULL values, so we wrap it in a StructArray - - TTestContext context; - - auto variantType = context.GetDoubleOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(IsArrowCompatible(variantType)); - - auto values = context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = MakeArrowArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - auto firstStructArray = static_pointer_cast(array); - UNIT_ASSERT(firstStructArray->num_fields() == 1); - UNIT_ASSERT(firstStructArray->field(0)->type_id() == arrow::Type::STRUCT); - - auto secondStructArray = static_pointer_cast(firstStructArray->field(0)); - UNIT_ASSERT(secondStructArray->num_fields() == 1); - UNIT_ASSERT(secondStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - auto unionArray = static_pointer_cast(secondStructArray->field(0)); - UNIT_ASSERT(unionArray->num_fields() == 5); - UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); - UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); - UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::UINT32); - auto boolArray = static_pointer_cast(unionArray->field(0)); - auto i16Array = static_pointer_cast(unionArray->field(1)); - auto ui16Array = static_pointer_cast(unionArray->field(2)); - auto i32Array = static_pointer_cast(unionArray->field(3)); - auto ui32Array = static_pointer_cast(unionArray->field(4)); - for (ui64 index = 0; index < values.size(); ++index) { - auto value = values[index]; - if (!value.HasValue()) { - if (value && !value.GetOptionalValue()) { - // Optional(NULL) - UNIT_ASSERT(secondStructArray->IsNull(index)); - } else if (!value) { - // NULL - UNIT_ASSERT(firstStructArray->IsNull(index)); - } - continue; - } - - UNIT_ASSERT(!firstStructArray->IsNull(index) && !secondStructArray->IsNull(index)); - - UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - auto fieldIndex = unionArray->value_offset(index); - if (value.GetVariantIndex() == 0) { - bool valueArrow = boolArray->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 1) { - auto valueArrow = i16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 2) { - auto valueArrow = ui16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 3) { - auto valueArrow = i32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 4) { - if (!value.GetVariantItem().HasValue()) { - UNIT_ASSERT(ui32Array->IsNull(fieldIndex)); - } else { - auto valueArrow = ui32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } - } - } - } } Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { @@ -1871,29 +1448,6 @@ Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { ++index; } } - - Y_UNIT_TEST(LargeVariant) { - TTestContext context; - - ui32 numberOfTypes = 500; - auto variantType = context.GetLargeVariantType(numberOfTypes); - UNIT_ASSERT(IsArrowCompatible(variantType)); - - auto values = context.CreateLargeVariant(1000); - auto array = MakeArrowArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); - UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::DENSE_UNION); - auto unionArray = static_pointer_cast(array); - ui32 numberOfGroups = (numberOfTypes - 1) / arrow::UnionType::kMaxTypeCode + 1; - UNIT_ASSERT_EQUAL(numberOfGroups, static_cast(unionArray->num_fields())); - ui32 typesInArrow = 0; - for (auto i = 0 ; i < unionArray->num_fields(); ++i) { - UNIT_ASSERT_EQUAL(unionArray->field(i)->type_id(), arrow::Type::DENSE_UNION); - typesInArrow += unionArray->field(i)->num_fields(); - } - UNIT_ASSERT_EQUAL(numberOfTypes, typesInArrow); - } } Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ @@ -1912,96 +1466,6 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ } } - // Y_UNIT_TEST(VariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetVariantOverStructType(); - // UNIT_ASSERT(IsArrowCompatible(variantType)); - - // auto values = context.CreateVariantOverStruct(100); - // auto array = MakeArrowArray(values, variantType); - // auto restoredValues = ExtractUnboxedVector(array, variantType, context.HolderFactory); - // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - // for (ui64 index = 0; index < values.size(); ++index) { - // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); - // } - // } - - // Y_UNIT_TEST(OptionalVariantOverStruct) { - // TTestContext context; - - // auto optionalVariantType = context.GetOptionalVariantOverStructType(); - // UNIT_ASSERT(IsArrowCompatible(optionalVariantType)); - - // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = MakeArrowArray(values, optionalVariantType); - // auto restoredValues = ExtractUnboxedVector(array, optionalVariantType, context.HolderFactory); - // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - // for (ui64 index = 0; index < values.size(); ++index) { - // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); - // } - // } - - // Y_UNIT_TEST(DoubleOptionalVariantOverStruct) { - // TTestContext context; - - // auto doubleOptionalVariantType = context.GetDoubleOptionalVariantOverStructType(); - // UNIT_ASSERT(IsArrowCompatible(doubleOptionalVariantType)); - - // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = MakeArrowArray(values, doubleOptionalVariantType); - // auto restoredValues = ExtractUnboxedVector(array, doubleOptionalVariantType, context.HolderFactory); - // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - // for (ui64 index = 0; index < values.size(); ++index) { - // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); - // } - // } - - Y_UNIT_TEST(VariantOverTupleWithOptionals) { - TTestContext context; - - auto variantType = context.GetVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(IsArrowCompatible(variantType)); - - auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = MakeArrowArray(values, variantType); - auto restoredValues = ExtractUnboxedVector(array, variantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); - } - } - - Y_UNIT_TEST(OptionalVariantOverTupleWithOptionals) { - TTestContext context; - - auto optionalVariantType = context.GetOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(IsArrowCompatible(optionalVariantType)); - - auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = MakeArrowArray(values, optionalVariantType); - auto restoredValues = ExtractUnboxedVector(array, optionalVariantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); - } - } - - Y_UNIT_TEST(DoubleOptionalVariantOverTupleWithOptionals) { - TTestContext context; - - auto doubleOptionalVariantType = context.GetDoubleOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(IsArrowCompatible(doubleOptionalVariantType)); - - auto values = context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = MakeArrowArray(values, doubleOptionalVariantType); - auto restoredValues = ExtractUnboxedVector(array, doubleOptionalVariantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); - } - } - Y_UNIT_TEST(DictOptionalToTuple) { TTestContext context; @@ -2031,21 +1495,6 @@ Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalType); } } - - Y_UNIT_TEST(LargeVariant) { - TTestContext context; - - auto variantType = context.GetLargeVariantType(500); - UNIT_ASSERT(IsArrowCompatible(variantType)); - - auto values = context.CreateLargeVariant(1000); - auto array = MakeArrowArray(values, variantType); - auto restoredValues = ExtractUnboxedVector(array, variantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); - } - } } } // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index 154550491f86..788d387d706d 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -308,19 +308,21 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr auto rowInChild = unionArray->value_offset(row); auto valuesArray = unionArray->field(variantIndex); - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { + YQL_ENSURE(variantType->GetAlternativesCount() <= MAX_VARIANT_NESTED_SIZE, "Variant type has more than " << MAX_VARIANT_NESTED_SIZE << " alternatives"); + + if (variantType->GetAlternativesCount() > MAX_VARIANT_FLATTEN_SIZE) { YQL_ENSURE(valuesArray->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); auto innerUnionArray = static_pointer_cast(valuesArray); auto innerVariantIndex = innerUnionArray->child_id(rowInChild); rowInChild = innerUnionArray->value_offset(rowInChild); valuesArray = innerUnionArray->field(innerVariantIndex); - variantIndex =variantIndex * arrow::UnionType::kMaxTypeCode + innerVariantIndex; + variantIndex = variantIndex * MAX_VARIANT_FLATTEN_SIZE + innerVariantIndex; } NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); if (innerType->IsStruct()) { - innerType =static_cast(innerType)->GetMemberType(variantIndex); + innerType = static_cast(innerType)->GetMemberType(variantIndex); } else { YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); innerType = static_cast(innerType)->GetElementType(variantIndex); From d6d639219ebf65f7b700a7486d97dad0ba4d4fb6 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 7 Nov 2025 19:47:48 +0300 Subject: [PATCH 20/25] Add nested optionals tests, remove old tests --- .../ut/kqp_formats_arrow_ut.cpp | 522 ++++++++---------- 1 file changed, 235 insertions(+), 287 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 9f6f33dffd0e..38f9466a34bf 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -339,11 +339,83 @@ struct TTestContext { return values; } - TType* GetDoubleOptionalType() { + TType* GetOptionalStructType() { + return TOptionalType::Create(GetStructType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalStructs(ui32 quantity) { + TUnboxedValueVector values = CreateStructs(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalTupleType() { + return TOptionalType::Create(GetTupleType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalTuples(ui32 quantity) { + TUnboxedValueVector values = CreateTuples(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalListType() { + return TOptionalType::Create(GetListType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalLists(ui32 quantity) { + TUnboxedValueVector values = CreateLists(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalDictType() { + return TOptionalType::Create(GetDictType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalDicts(ui32 quantity) { + TUnboxedValueVector values = CreateDicts(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalVariantType() { + return TOptionalType::Create(GetVariantOverStructType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalVariants(ui32 quantity) { + TUnboxedValueVector values = CreateVariantsOverStruct(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalTaggedType() { + return TOptionalType::Create(GetTaggedType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalTaggeds(ui32 quantity) { + TUnboxedValueVector values = CreateTaggeds(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalOptionalType() { return TOptionalType::Create(GetDataOptionalType(), TypeEnv); } - TUnboxedValueVector CreateDoubleOptionals(ui32 quantity) { + TUnboxedValueVector CreateOptionalOptionals(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { if (value % 3 == 0) { @@ -369,32 +441,6 @@ struct TTestContext { return values; } - TType* GetOptionalListOfOptional() { - TType* itemType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); - return TOptionalType::Create(TListType::Create(itemType, TypeEnv), TypeEnv); - } - - TUnboxedValueVector CreateOptionalListOfOptional(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - if (value % 2 == 0) { - values.emplace_back(NUdf::TUnboxedValuePod()); - continue; - } - - TUnboxedValueVector items; - items.reserve(value); - for (ui64 i = 0; i < value; ++i) { - NUdf::TUnboxedValue item = ((value + i) % 2 == 0) ? NUdf::TUnboxedValuePod() : NUdf::TUnboxedValuePod(i); - items.push_back(std::move(item).MakeOptional()); - } - - auto listValue = Vb.NewList(items.data(), value); - values.emplace_back(std::move(listValue).MakeOptional()); - } - return values; - } - TType* GetVariantOverStructType() { TStructMember members[4] = { {"0_i32", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, @@ -502,61 +548,6 @@ struct TTestContext { } return values; } - - TType* GetDictOptionalToTupleType() { - TType* keyType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); - TType* members[2] = { - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - }; - TType* payloadType = TTupleType::Create(2, members, TypeEnv); - return TDictType::Create(keyType, payloadType, TypeEnv); - } - - TUnboxedValueVector CreateDictOptionalToTuple(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto dictBuilder = Vb.NewDict(GetDictOptionalToTupleType(), 0); - for (ui64 i = 0; i < value * value; ++i) { - NUdf::TUnboxedValue key; - if (i == 0) { - key = NUdf::TUnboxedValuePod(); - } else { - key = NUdf::TUnboxedValuePod(value / 4).MakeOptional(); - } - NUdf::TUnboxedValue* items; - auto payload = Vb.NewArray(2, items); - items[0] = NUdf::TUnboxedValuePod(static_cast(-value)); - items[1] = NUdf::TUnboxedValuePod(static_cast(value)); - dictBuilder->Add(std::move(key), std::move(payload)); - } - auto dictValue = dictBuilder->Build(); - values.emplace_back(std::move(dictValue)); - } - return values; - } - - TType* GetOptionalOfOptionalType() { - return TOptionalType::Create( - TOptionalType::Create( - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TypeEnv), - TypeEnv); - } - - TUnboxedValueVector CreateOptionalOfOptional(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - NUdf::TUnboxedValue element = value % 3 == 0 - ? NUdf::TUnboxedValuePod(value).MakeOptional() - : NUdf::TUnboxedValuePod(); - if (value % 3 != 2) { - element = element.MakeOptional(); - } - values.emplace_back(std::move(element)); - } - return values; - } }; void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& right, TType* type) { @@ -1146,11 +1137,169 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_Double) { + Y_UNIT_TEST(NestedType_Optional_Struct) { + TTestContext context; + + auto optionalType = context.GetOptionalStructType(); + auto values = context.CreateOptionalStructs(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + + UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI") == structArray->field(2)); + UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL") == structArray->field(3)); + UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO") == structArray->field(4)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_Tuple) { + TTestContext context; + + auto optionalType = context.GetOptionalTupleType(); + auto values = context.CreateOptionalTuples(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_List) { + TTestContext context; + + auto optionalType = context.GetOptionalListType(); + auto values = context.CreateOptionalLists(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_Dict) { + TTestContext context; + + auto optionalType = context.GetOptionalDictType(); + auto values = context.CreateOptionalDicts(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(listArray->value_slice(0)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DOUBLE); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_Variant) { + TTestContext context; + + auto variantType = context.GetOptionalVariantType(); + auto values = context.CreateOptionalVariants(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FLOAT); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Optional_Tagged) { + TTestContext context; + + auto optionalType = context.GetOptionalTaggedType(); + auto values = context.CreateOptionalTaggeds(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_Optional) { TTestContext context; - auto optionalType = context.GetDoubleOptionalType(); - auto values = context.CreateDoubleOptionals(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalOptionalType(); + auto values = context.CreateOptionalOptionals(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1296,205 +1445,4 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } -Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { - Y_UNIT_TEST(OptionalListOfOptional) { - TTestContext context; - - auto listType = context.GetOptionalListOfOptional(); - Y_ABORT_UNLESS(IsArrowCompatible(listType)); - - auto values = context.CreateOptionalListOfOptional(100); - auto array = MakeArrowArray(values, listType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::LIST); - - auto listArray = static_pointer_cast(array); - UNIT_ASSERT(listArray->num_fields() == 1); - UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); - - auto i32Array = static_pointer_cast(listArray->values()); - auto index = 0; - auto innerIndex = 0; - for (const auto& value: values) { - if (!value.HasValue()) { - UNIT_ASSERT(listArray->IsNull(index)); - ++index; - continue; - } - - auto listValue = value.GetOptionalValue(); - - UNIT_ASSERT_VALUES_EQUAL(listValue.GetListLength(), static_cast(listArray->value_length(index))); - const auto iter = listValue.GetListIterator(); - for (NUdf::TUnboxedValue item; iter.Next(item);) { - if (!item.HasValue()) { - UNIT_ASSERT(i32Array->IsNull(innerIndex)); - } else { - UNIT_ASSERT(i32Array->Value(innerIndex) == item.GetOptionalValue().Get()); - } - ++innerIndex; - } - ++index; - } - } -} - -Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { - Y_UNIT_TEST(DictOptionalToTuple) { - TTestContext context; - - auto dictType = context.GetDictOptionalToTupleType(); - UNIT_ASSERT(IsArrowCompatible(dictType)); - - auto values = context.CreateDictOptionalToTuple(100); - auto array = MakeArrowArray(values, dictType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); - UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::STRUCT); - - auto wrapArray = static_pointer_cast(array); - UNIT_ASSERT_EQUAL(wrapArray->num_fields(), 2); - UNIT_ASSERT_EQUAL(wrapArray->field(0)->type_id(), arrow::Type::LIST); - - UNIT_ASSERT_EQUAL(wrapArray->field(1)->type_id(), arrow::Type::UINT64); - auto listArray = static_pointer_cast(wrapArray->field(0)); - UNIT_ASSERT_EQUAL(static_cast(listArray->length()), values.size()); - - UNIT_ASSERT_EQUAL(wrapArray->field(1)->type_id(), arrow::Type::UINT64); - auto customArray = static_pointer_cast(wrapArray->field(1)); - UNIT_ASSERT_EQUAL(static_cast(customArray->length()), values.size()); - - UNIT_ASSERT_EQUAL(listArray->value_type()->id(), arrow::Type::STRUCT); - auto structArray = static_pointer_cast(listArray->values()); - - UNIT_ASSERT_EQUAL(listArray->num_fields(), 1); - UNIT_ASSERT_EQUAL(structArray->num_fields(), 2); - UNIT_ASSERT_EQUAL(structArray->field(0)->type_id(), arrow::Type::DOUBLE); - UNIT_ASSERT_EQUAL(structArray->field(1)->type_id(), arrow::Type::STRUCT); - auto keysArray = static_pointer_cast(structArray->field(0)); - auto itemsArray = static_pointer_cast(structArray->field(1)); - UNIT_ASSERT_EQUAL(itemsArray->num_fields(), 2); - UNIT_ASSERT_EQUAL(itemsArray->field(0)->type_id(), arrow::Type::INT32); - UNIT_ASSERT_EQUAL(itemsArray->field(1)->type_id(), arrow::Type::UINT32); - auto i32Array = static_pointer_cast(itemsArray->field(0)); - auto ui32Array = static_pointer_cast(itemsArray->field(1)); - - ui64 index = 0; - for (const auto& value: values) { - UNIT_ASSERT(value.GetDictLength() == static_cast(listArray->value_length(index))); - for (auto subindex = listArray->value_offset(index); subindex < listArray->value_offset(index + 1); ++subindex) { - NUdf::TUnboxedValue key = keysArray->IsNull(subindex) - ? NUdf::TUnboxedValuePod() - : NUdf::TUnboxedValuePod(keysArray->Value(subindex)); - UNIT_ASSERT(value.Contains(key)); - NUdf::TUnboxedValue payloadValue = value.Lookup(key); - UNIT_ASSERT_EQUAL(payloadValue.GetElement(0).Get(), i32Array->Value(subindex)); - UNIT_ASSERT_EQUAL(payloadValue.GetElement(1).Get(), ui32Array->Value(subindex)); - } - ++index; - } - } - - Y_UNIT_TEST(OptionalOfOptional) { - TTestContext context; - - auto doubleOptionalType = context.GetOptionalOfOptionalType(); - UNIT_ASSERT(IsArrowCompatible(doubleOptionalType)); - - auto values = context.CreateOptionalOfOptional(100); - auto array = MakeArrowArray(values, doubleOptionalType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); - - auto index = 0; - for (auto value: values) { - std::shared_ptr currentArray = array; - int depth = 0; - - while (currentArray->type()->id() == arrow::Type::STRUCT) { - auto structArray = static_pointer_cast(currentArray); - UNIT_ASSERT_EQUAL(structArray->num_fields(), 1); - - if (structArray->IsNull(index)) { - break; - } - - ++depth; - - auto childArray = structArray->field(0); - if (childArray->type()->id() == arrow::Type::DENSE_UNION) { - break; - } - - currentArray = childArray; - } - - while (depth--) { - UNIT_ASSERT(value); - value = value.GetOptionalValue(); - } - - if (value.HasValue()) { - if (currentArray->type()->id() == arrow::Type::INT32) { - UNIT_ASSERT_EQUAL(value.Get(), static_pointer_cast(currentArray)->Value(index)); - } else { - UNIT_ASSERT(!currentArray->IsNull(index)); - } - } else { - UNIT_ASSERT(currentArray->IsNull(index)); - } - - ++index; - } - } -} - -Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ - Y_UNIT_TEST(OptionalListOfOptional) { - TTestContext context; - - auto listType = context.GetOptionalListOfOptional(); - Y_ABORT_UNLESS(IsArrowCompatible(listType)); - - auto values = context.CreateOptionalListOfOptional(100); - auto array = MakeArrowArray(values, listType); - auto restoredValues = ExtractUnboxedVector(array, listType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); - } - } - - Y_UNIT_TEST(DictOptionalToTuple) { - TTestContext context; - - auto dictType = context.GetDictOptionalToTupleType(); - UNIT_ASSERT(IsArrowCompatible(dictType)); - - auto values = context.CreateDictOptionalToTuple(100); - auto array = MakeArrowArray(values, dictType); - auto restoredValues = ExtractUnboxedVector(array, dictType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); - } - } - - Y_UNIT_TEST(OptionalOfOptional) { - TTestContext context; - - auto doubleOptionalType = context.GetOptionalOfOptionalType(); - UNIT_ASSERT(IsArrowCompatible(doubleOptionalType)); - - auto values = context.CreateOptionalOfOptional(100); - auto array = MakeArrowArray(values, doubleOptionalType); - auto restoredValues = ExtractUnboxedVector(array, doubleOptionalType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalType); - } - } -} - } // namespace NKikimr::NKqp::NFormats From 54641c94e6a75af9de5690994e3159b0f6959994 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 7 Nov 2025 22:04:33 +0300 Subject: [PATCH 21/25] Implemented many nested tests (thx cursor) --- .../ut/kqp_formats_arrow_ut.cpp | 1481 ++++++++++++++++- 1 file changed, 1390 insertions(+), 91 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 38f9466a34bf..5b43033ce609 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -244,6 +244,106 @@ struct TTestContext { return values; } + TType* GetStructNestedValueType() { + auto listType = GetListType(); + std::vector innerMembers = { + {"12", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"34", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + auto innerStructType = TStructType::Create(2, innerMembers.data(), TypeEnv); + + std::vector members = { + {"56", listType}, + {"78", innerStructType}, + {"910", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + return TStructType::Create(3, members.data(), TypeEnv); + } + + TUnboxedValueVector CreateStructsNestedValue(ui32 quantity) { + TUnboxedValueVector values; + auto lists = CreateLists(quantity); + + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto structValue = Vb.NewArray(3, items); + + items[0] = lists[value]; + + NUdf::TUnboxedValue* innerItems; + auto innerStructValue = Vb.NewArray(2, innerItems); + innerItems[0] = NUdf::TUnboxedValuePod(static_cast(value)); + innerItems[1] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[1] = std::move(innerStructValue); + + items[2] = NUdf::TUnboxedValuePod(static_cast(value)); + + values.emplace_back(std::move(structValue)); + } + return values; + } + + TType* GetStructOptionalValueType() { + std::vector members = { + {"opt1", TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv)}, + {"opt2", TOptionalType::Create(GetTypeOfSingular(TypeEnv), TypeEnv)}, + {"opt3", TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv)}, + {"optless", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + return TStructType::Create(4, members.data(), TypeEnv); + } + + TUnboxedValueVector CreateStructsOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto structValue = Vb.NewArray(4, items); + + if (value % 2 == 0) { + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + items[1] = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + items[0] = NUdf::TUnboxedValuePod(); + items[1] = NUdf::TUnboxedValuePod(); + } + + if (value % 3 == 0) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional().MakeOptional(); + } else if (value % 3 == 1) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + } else { + items[2] = NUdf::TUnboxedValuePod(); + } + + items[3] = NUdf::TUnboxedValuePod(static_cast(value)); + + values.emplace_back(std::move(structValue)); + } + return values; + } + + TType* GetStructTaggedValueType() { + std::vector members = { + {"1", TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "test", TypeEnv)}, + {"2", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"3", TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "tag2", TypeEnv)}, + }; + return TStructType::Create(3, members.data(), TypeEnv); + } + + TUnboxedValueVector CreateStructsTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto structValue = Vb.NewArray(3, items); + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[1] = NUdf::TUnboxedValuePod(static_cast(value)); + items[2] = NUdf::TUnboxedValuePod(static_cast(value)); + values.emplace_back(std::move(structValue)); + } + return values; + } + TType* GetTupleType() { TType* members[3] = { TDataType::Create(NUdf::TDataType::Id, TypeEnv), @@ -266,6 +366,87 @@ struct TTestContext { return values; } + TType* GetTupleNestedValueType() { + auto listType = GetListType(); + auto structType = GetStructType(); + TType* members[2] = { + listType, + structType, + }; + return TTupleType::Create(2, members, TypeEnv); + } + + TUnboxedValueVector CreateTuplesNestedValue(ui32 quantity) { + TUnboxedValueVector values; + + auto lists = CreateLists(quantity); + auto structs = CreateStructs(quantity); + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto tupleValue = Vb.NewArray(2, items); + items[0] = lists[value]; + items[1] = structs[value]; + values.push_back(std::move(tupleValue)); + } + return values; + } + + TType* GetTupleOptionalValueType() { + TType* members[3] = { + TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), + TOptionalType::Create(GetTypeOfSingular(TypeEnv), TypeEnv), + TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv) + }; + return TTupleType::Create(3, members, TypeEnv); + } + + TUnboxedValueVector CreateTuplesOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto tupleValue = Vb.NewArray(3, items); + + if (value % 2 == 0) { + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + items[1] = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + items[0] = NUdf::TUnboxedValuePod(); + items[1] = NUdf::TUnboxedValuePod(); + } + + if (value % 3 == 0) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional().MakeOptional(); + } else if (value % 3 == 1) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + } else { + items[2] = NUdf::TUnboxedValuePod(); + } + + values.push_back(std::move(tupleValue)); + } + return values; + } + + TType* GetTupleTaggedValueType() { + TType* members[2] = { + TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "test", TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv) + }; + return TTupleType::Create(2, members, TypeEnv); + } + + TUnboxedValueVector CreateTuplesTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto tupleValue = Vb.NewArray(2, items); + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[1] = NUdf::TUnboxedValuePod(static_cast(-value)); + values.push_back(std::move(tupleValue)); + } + return values; + } + TType* GetListType() { auto itemType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); return TListType::Create(itemType, TypeEnv); @@ -286,6 +467,95 @@ struct TTestContext { return values; } + TType* GetListNestedValueType() { + std::vector members = { + {"first", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"second", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + auto itemType = TStructType::Create(2, members.data(), TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsNestedValue(ui32 quantity) { + TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + TUnboxedValueVector items; + items.reserve(value); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue* structItem; + auto structItemValue = Vb.NewArray(2, structItem); + structItem[0] = NUdf::TUnboxedValuePod(static_cast(i)); + structItem[1] = NUdf::TUnboxedValuePod(static_cast(-i)); + items.push_back(std::move(structItemValue)); + } + auto listValue = Vb.NewList(items.data(), value); + values.emplace_back(std::move(listValue)); + } + return values; + } + + TType* GetListOptionalValueType() { + auto itemType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + TUnboxedValueVector items; + items.reserve(value); + for (ui64 i = 0; i < value; ++i) { + items.push_back((i % 2 == 0) ? NUdf::TUnboxedValuePod(static_cast(-i)).MakeOptional() : NUdf::TUnboxedValuePod()); + } + auto listValue = Vb.NewList(items.data(), value); + values.emplace_back(std::move(listValue)); + } + return values; + } + + TType* GetListOptionalVariantValueType() { + auto itemType = TOptionalType::Create(GetVariantOverTupleType(), TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsOptionalVariantValue(ui32 quantity) { + TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + TUnboxedValueVector items; + items.reserve(value); + for (ui64 i = 0; i < value; ++i) { + auto typeIndex = i % 4; + NUdf::TUnboxedValue item; + if (typeIndex == 0) { + item = NUdf::TUnboxedValuePod(i % 3 == 0); + } else if (typeIndex == 1) { + item = NUdf::TUnboxedValuePod(static_cast(-i)); + } else if (typeIndex == 2) { + item = NUdf::TUnboxedValuePod(static_cast(i)); + } else if (typeIndex == 3) { + item = NUdf::TUnboxedValuePod(static_cast(-i)); + } + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + items.emplace_back(std::move(wrapped)); + } + auto listValue = Vb.NewList(items.data(), value); + values.emplace_back(std::move(listValue)); + } + return values; + } + + TType* GetListTaggedValueType() { + auto itemType = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "test", TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsTaggedValue(ui32 quantity) { + return CreateLists(quantity); + } + TType* GetDictType() { TType* keyType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); @@ -307,11 +577,130 @@ struct TTestContext { return values; } - TType* GetDataOptionalType() { + TType* GetDictNestedKeyType() { + TType* tupleItems[2] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + }; + TType* keyType = TTupleType::Create(2, tupleItems, TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsNestedKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictNestedKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue* keyItems; + auto keyValue = Vb.NewArray(2, keyItems); + keyItems[0] = NUdf::TUnboxedValuePod(static_cast(i)); + keyItems[1] = NUdf::TUnboxedValuePod(static_cast(-i)); + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(keyValue), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetDictOptionalKeyType() { + TType* keyType = TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsOptionalKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictOptionalKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue key; + if (i % 3 == 0) { + key = NUdf::TUnboxedValuePod(static_cast(i)).MakeOptional().MakeOptional(); + } else if (i % 3 == 1) { + key = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + key = NUdf::TUnboxedValuePod(); + } + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetDictTaggedKeyType() { + TType* keyType = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "key_tag", TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsTaggedKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictTaggedKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue key = NUdf::TUnboxedValuePod(static_cast(i)); + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetDictOptionalVariantKeyType() { + TType* variantMembers[2] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv) + }; + auto tupleType = TTupleType::Create(2, variantMembers, TypeEnv); + auto variantType = TVariantType::Create(tupleType, TypeEnv); + TType* keyType = TOptionalType::Create(TOptionalType::Create(variantType, TypeEnv), TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsOptionalVariantKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictOptionalVariantKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue key; + if (i % 3 == 0) { + auto typeIndex = i % 2; + NUdf::TUnboxedValue variantItem; + if (typeIndex == 0) { + variantItem = NUdf::TUnboxedValuePod(static_cast(i)); + } else { + variantItem = NUdf::TUnboxedValuePod(static_cast(i)); + } + auto variantValue = Vb.NewVariant(typeIndex, std::move(variantItem)); + key = variantValue.MakeOptional().MakeOptional(); + } else if (i % 3 == 1) { + key = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + key = NUdf::TUnboxedValuePod(); + } + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetOptionalDataValueType() { return TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); } - TUnboxedValueVector CreateDataOptionals(ui32 quantity) { + TUnboxedValueVector CreateOptionalsDataValue(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { if (value % 2 == 0) { @@ -323,11 +712,11 @@ struct TTestContext { return values; } - TType* GetSingularOptionalType() { + TType* GetOptionalSingularValueType() { return TOptionalType::Create(GetTypeOfSingular(TypeEnv), TypeEnv); } - TUnboxedValueVector CreateSingularOptionals(ui32 quantity) { + TUnboxedValueVector CreateOptionalsSingularValueType(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { if (value % 2 == 0) { @@ -339,11 +728,11 @@ struct TTestContext { return values; } - TType* GetOptionalStructType() { + TType* GetOptionalStructValueType() { return TOptionalType::Create(GetStructType(), TypeEnv); } - TUnboxedValueVector CreateOptionalStructs(ui32 quantity) { + TUnboxedValueVector CreateOptionalsStructValue(ui32 quantity) { TUnboxedValueVector values = CreateStructs(quantity); for (size_t i = 0; i < values.size(); ++i) { values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); @@ -351,11 +740,11 @@ struct TTestContext { return values; } - TType* GetOptionalTupleType() { + TType* GetOptionalTupleValueType() { return TOptionalType::Create(GetTupleType(), TypeEnv); } - TUnboxedValueVector CreateOptionalTuples(ui32 quantity) { + TUnboxedValueVector CreateOptionalsTupleValue(ui32 quantity) { TUnboxedValueVector values = CreateTuples(quantity); for (size_t i = 0; i < values.size(); ++i) { values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); @@ -363,11 +752,11 @@ struct TTestContext { return values; } - TType* GetOptionalListType() { + TType* GetOptionalListValueType() { return TOptionalType::Create(GetListType(), TypeEnv); } - TUnboxedValueVector CreateOptionalLists(ui32 quantity) { + TUnboxedValueVector CreateOptionalsValueList(ui32 quantity) { TUnboxedValueVector values = CreateLists(quantity); for (size_t i = 0; i < values.size(); ++i) { values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); @@ -375,11 +764,11 @@ struct TTestContext { return values; } - TType* GetOptionalDictType() { + TType* GetOptionalDictValueType() { return TOptionalType::Create(GetDictType(), TypeEnv); } - TUnboxedValueVector CreateOptionalDicts(ui32 quantity) { + TUnboxedValueVector CreateOptionalsDictValue(ui32 quantity) { TUnboxedValueVector values = CreateDicts(quantity); for (size_t i = 0; i < values.size(); ++i) { values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); @@ -387,11 +776,11 @@ struct TTestContext { return values; } - TType* GetOptionalVariantType() { + TType* GetOptionalVariantValueType() { return TOptionalType::Create(GetVariantOverStructType(), TypeEnv); } - TUnboxedValueVector CreateOptionalVariants(ui32 quantity) { + TUnboxedValueVector CreateOptionalsVariantValue(ui32 quantity) { TUnboxedValueVector values = CreateVariantsOverStruct(quantity); for (size_t i = 0; i < values.size(); ++i) { values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); @@ -399,11 +788,11 @@ struct TTestContext { return values; } - TType* GetOptionalTaggedType() { + TType* GetOptionalTaggedValueType() { return TOptionalType::Create(GetTaggedType(), TypeEnv); } - TUnboxedValueVector CreateOptionalTaggeds(ui32 quantity) { + TUnboxedValueVector CreateOptionalsValueTagged(ui32 quantity) { TUnboxedValueVector values = CreateTaggeds(quantity); for (size_t i = 0; i < values.size(); ++i) { values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); @@ -411,11 +800,11 @@ struct TTestContext { return values; } - TType* GetOptionalOptionalType() { - return TOptionalType::Create(GetDataOptionalType(), TypeEnv); + TType* GetOptionalOptionalValueType() { + return TOptionalType::Create(GetOptionalDataValueType(), TypeEnv); } - TUnboxedValueVector CreateOptionalOptionals(ui32 quantity) { + TUnboxedValueVector CreateOptionalsOptionalValue(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { if (value % 3 == 0) { @@ -441,18 +830,140 @@ struct TTestContext { return values; } - TType* GetVariantOverStructType() { - TStructMember members[4] = { - {"0_i32", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"1_string", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"2_float", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"3_bool", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} - }; - auto structType = TStructType::Create(4, members, TypeEnv); - return TVariantType::Create(structType, TypeEnv); + TType* GetTaggedStructValueType() { + return TTaggedType::Create(GetStructType(), "struct_tag", TypeEnv); } - TUnboxedValueVector CreateVariantsOverStruct(ui32 quantity) { + TUnboxedValueVector CreateTaggedsStructValue(ui32 quantity) { + return CreateStructs(quantity); + } + + TType* GetTaggedTupleValueype() { + return TTaggedType::Create(GetTupleType(), "tuple_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsTupleValue(ui32 quantity) { + return CreateTuples(quantity); + } + + TType* GetTaggedListValueType() { + return TTaggedType::Create(GetListType(), "list_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsValueList(ui32 quantity) { + return CreateLists(quantity); + } + + TType* GetTaggedDictValueType() { + return TTaggedType::Create(GetDictType(), "dict_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsDictValue(ui32 quantity) { + return CreateDicts(quantity); + } + + TType* GetTaggedOptionalValueType() { + auto optionalType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + return TTaggedType::Create(optionalType, "opt_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetTaggedOptionalOptionalValueType() { + auto innerOptional = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + auto outerOptional = TOptionalType::Create(innerOptional, TypeEnv); + return TTaggedType::Create(outerOptional, "opt_opt_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsOptionalOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 3 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional().MakeOptional()); + } else if (value % 3 == 1) { + values.push_back(NUdf::TUnboxedValuePod().MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetTaggedTaggedValueType() { + auto innerTagged = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "inner_tag", TypeEnv); + return TTaggedType::Create(innerTagged, "outer_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value))); + } + return values; + } + + TType* GetTaggedOptionalTaggedValueType() { + auto innerTagged = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "inner_tag", TypeEnv); + auto optional = TOptionalType::Create(innerTagged, TypeEnv); + return TTaggedType::Create(optional, "outer_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsOptionalTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetTaggedTaggedOptionalTaggedTaggedValueType() { + auto baseType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + auto innerTagged1 = TTaggedType::Create(baseType, "inner1", TypeEnv); + auto innerTagged2 = TTaggedType::Create(innerTagged1, "inner2", TypeEnv); + auto optional = TOptionalType::Create(innerTagged2, TypeEnv); + auto outerTagged1 = TTaggedType::Create(optional, "outer1", TypeEnv); + auto outerTagged2 = TTaggedType::Create(outerTagged1, "outer2", TypeEnv); + return outerTagged2; + } + + TUnboxedValueVector CreateTaggedsTaggedOptionalTaggedTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetVariantOverStructType() { + TStructMember members[4] = { + {"0_i32", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"1_string", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"2_float", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"3_bool", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} + }; + auto structType = TStructType::Create(4, members, TypeEnv); + return TVariantType::Create(structType, TypeEnv); + } + + TUnboxedValueVector CreateVariantsOverStruct(ui32 quantity) { TUnboxedValueVector values; for (ui64 value = 0; value < quantity; ++value) { auto typeIndex = value % 4; @@ -548,6 +1059,86 @@ struct TTestContext { } return values; } + + TType* GetVariantComprehensiveType() { + // Variant over Tuple containing all type categories: + // Data, Optional, Optional, Singular, Struct, Tuple, List, Dict, Variant, Tagged + TType* members[10] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), + TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv), + GetTypeOfSingular(TypeEnv), + GetStructType(), + GetTupleType(), + GetListType(), + GetDictType(), + GetVariantOverTupleType(), + TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "tag", TypeEnv) + }; + auto tupleType = TTupleType::Create(10, members, TypeEnv); + return TVariantType::Create(tupleType, TypeEnv); + } + + TUnboxedValueVector CreateVariantsComprehensive(ui32 quantity) { + TUnboxedValueVector values; + auto structs = CreateStructs(5); + auto tuples = CreateTuples(5); + auto lists = CreateLists(5); + auto dicts = CreateDicts(5); + auto variants = CreateVariantsOverTuple(5); + + for (ui64 value = 0; value < quantity; ++value) { + auto typeIndex = value % 10; + NUdf::TUnboxedValue item; + + switch (typeIndex) { + case 0: + item = NUdf::TUnboxedValuePod(static_cast(value)); + break; + case 1: + if (value % 2 == 0) { + item = NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional(); + } else { + item = NUdf::TUnboxedValuePod(); + } + break; + case 2: + if (value % 3 == 0) { + item = NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional().MakeOptional(); + } else if (value % 3 == 1) { + item = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + item = NUdf::TUnboxedValuePod(); + } + break; + case 3: + item = NUdf::TUnboxedValuePod(); + break; + case 4: + item = structs[value % structs.size()]; + break; + case 5: + item = tuples[value % tuples.size()]; + break; + case 6: + item = lists[value % lists.size()]; + break; + case 7: + item = dicts[value % dicts.size()]; + break; + case 8: + item = variants[value % variants.size()]; + break; + case 9: + item = NUdf::TUnboxedValuePod(static_cast(value)); + break; + } + + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + values.emplace_back(std::move(wrapped)); + } + return values; + } }; void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& right, TType* type) { @@ -976,7 +1567,7 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } // Nested types - Y_UNIT_TEST(NestedType_List) { + Y_UNIT_TEST(NestedType_List_DataValue) { TTestContext context; auto listType = context.GetListType(); @@ -999,7 +1590,115 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Tuple) { + Y_UNIT_TEST(NestedType_List_NestedValue) { + TTestContext context; + + auto listType = context.GetListNestedValueType(); + auto values = context.CreateListsNestedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + + Y_UNIT_TEST(NestedType_List_OptionalValue) { + TTestContext context; + + auto listType = context.GetListOptionalValueType(); + auto values = context.CreateListsOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + + Y_UNIT_TEST(NestedType_List_OptionalVariantValue) { + TTestContext context; + + auto listType = context.GetListOptionalVariantValueType(); + auto values = context.CreateListsOptionalVariantValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + }; + } + + Y_UNIT_TEST(NestedType_List_TaggedValue) { + TTestContext context; + + auto listType = context.GetListTaggedValueType(); + auto values = context.CreateListsTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + + Y_UNIT_TEST(NestedType_Tuple_DataValue) { TTestContext context; auto tupleType = context.GetTupleType(); @@ -1024,54 +1723,367 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { UNIT_ASSERT_VALUES_EQUAL(static_cast(structArray->field(2)->length()), values.size()); for (size_t i = 0; i < values.size(); ++i) { - auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); - AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + + Y_UNIT_TEST(NestedType_Tuple_NestedValue) { + TTestContext context; + + auto tupleType = context.GetTupleNestedValueType(); + auto values = context.CreateTuplesNestedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::LIST); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + + Y_UNIT_TEST(NestedType_Tuple_OptionalValue) { + TTestContext context; + + auto tupleType = context.GetTupleOptionalValueType(); + auto values = context.CreateTuplesOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::STRUCT); + + auto secondStructArray = static_pointer_cast(structArray->field(1)); + UNIT_ASSERT_VALUES_EQUAL(secondStructArray->num_fields(), 1); + UNIT_ASSERT(secondStructArray->field(0)->type_id() == arrow::Type::NA); + + auto thirdStructArray = static_pointer_cast(structArray->field(2)); + UNIT_ASSERT_VALUES_EQUAL(thirdStructArray->num_fields(), 1); + UNIT_ASSERT(thirdStructArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + + Y_UNIT_TEST(NestedType_Tuple_TaggedValue) { + TTestContext context; + + auto tupleType = context.GetTupleTaggedValueType(); + auto values = context.CreateTuplesTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + + Y_UNIT_TEST(NestedType_Struct_DataValue) { + TTestContext context; + + auto structType = context.GetStructType(); + auto values = context.CreateStructs(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + + UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI") == structArray->field(2)); + UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL") == structArray->field(3)); + UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO") == structArray->field(4)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + for (int i = 0; i < structArray->num_fields(); ++i) { + UNIT_ASSERT_VALUES_EQUAL(structArray->field(i)->length(), values.size()); + } + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + + Y_UNIT_TEST(NestedType_Struct_NestedValue) { + TTestContext context; + + auto structType = context.GetStructNestedValueType(); + auto values = context.CreateStructsNestedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->GetFieldByName("56") && structArray->GetFieldByName("56") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("78") && structArray->GetFieldByName("78") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("910") && structArray->GetFieldByName("910") == structArray->field(2)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::LIST); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::INT32); + + auto innerStructArray = static_pointer_cast(structArray->field(1)); + UNIT_ASSERT_VALUES_EQUAL(innerStructArray->num_fields(), 2); + UNIT_ASSERT(innerStructArray->GetFieldByName("12") && innerStructArray->GetFieldByName("12") == innerStructArray->field(0)); + UNIT_ASSERT(innerStructArray->GetFieldByName("34") && innerStructArray->GetFieldByName("34") == innerStructArray->field(1)); + UNIT_ASSERT(innerStructArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(innerStructArray->field(1)->type_id() == arrow::Type::INT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + + Y_UNIT_TEST(NestedType_Struct_OptionalValue) { + TTestContext context; + + auto structType = context.GetStructOptionalValueType(); + auto values = context.CreateStructsOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 4); + + UNIT_ASSERT(structArray->GetFieldByName("opt1") && structArray->GetFieldByName("opt1") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("opt2") && structArray->GetFieldByName("opt2") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("opt3") && structArray->GetFieldByName("opt3") == structArray->field(2)); + UNIT_ASSERT(structArray->GetFieldByName("optless") && structArray->GetFieldByName("optless") == structArray->field(3)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::UINT64); + + auto optNullStructArray = static_pointer_cast(structArray->field(1)); + UNIT_ASSERT_VALUES_EQUAL(optNullStructArray->num_fields(), 1); + UNIT_ASSERT(optNullStructArray->field(0)->type_id() == arrow::Type::NA); + + auto optOptStructArray = static_pointer_cast(structArray->field(2)); + UNIT_ASSERT_VALUES_EQUAL(optOptStructArray->num_fields(), 1); + UNIT_ASSERT(optOptStructArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + + Y_UNIT_TEST(NestedType_Struct_TaggedValue) { + TTestContext context; + + auto structType = context.GetStructTaggedValueType(); + auto values = context.CreateStructsTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->GetFieldByName("1") && structArray->GetFieldByName("1") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("2") && structArray->GetFieldByName("2") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("3") && structArray->GetFieldByName("3") == structArray->field(2)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + + Y_UNIT_TEST(NestedType_Dict_DataKey) { + TTestContext context; + + auto dictType = context.GetDictType(); + auto values = context.CreateDicts(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DOUBLE); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } + + Y_UNIT_TEST(NestedType_Dict_NestedKey) { + TTestContext context; + + auto dictType = context.GetDictNestedKeyType(); + auto values = context.CreateDictsNestedKey(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto keyStructArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyStructArray->num_fields(), 2); + + UNIT_ASSERT(keyStructArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(keyStructArray->field(1)->type_id() == arrow::Type::INT8); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } + + Y_UNIT_TEST(NestedType_Dict_OptionalKey) { + TTestContext context; + + auto dictType = context.GetDictOptionalKeyType(); + auto values = context.CreateDictsOptionalKey(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto keyStructArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyStructArray->num_fields(), 1); + UNIT_ASSERT(keyStructArray->field(0)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); } } - Y_UNIT_TEST(NestedType_Struct) { + Y_UNIT_TEST(NestedType_Dict_TaggedKey) { TTestContext context; - auto structType = context.GetStructType(); - auto values = context.CreateStructs(TEST_ARRAY_NESTED_SIZE); + auto dictType = context.GetDictTaggedKeyType(); + auto values = context.CreateDictsTaggedKey(TEST_ARRAY_NESTED_SIZE); - UNIT_ASSERT(IsArrowCompatible(structType)); + UNIT_ASSERT(IsArrowCompatible(dictType)); - auto array = MakeArrowArray(values, structType); + auto array = MakeArrowArray(values, dictType); UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto structArray = static_pointer_cast(array); - UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); - - UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC") == structArray->field(0)); - UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF") == structArray->field(1)); - UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI") == structArray->field(2)); - UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL") == structArray->field(3)); - UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO") == structArray->field(4)); - - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); - UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); - UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); - UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); - - for (int i = 0; i < structArray->num_fields(); ++i) { - UNIT_ASSERT_VALUES_EQUAL(structArray->field(i)->length(), values.size()); - } + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); for (size_t i = 0; i < values.size(); ++i) { - auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); - AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); } } - Y_UNIT_TEST(NestedType_Dict) { + Y_UNIT_TEST(NestedType_Dict_OptionalVariantKey) { TTestContext context; - auto dictType = context.GetDictType(); - auto values = context.CreateDicts(TEST_ARRAY_NESTED_SIZE); + auto dictType = context.GetDictOptionalVariantKeyType(); + auto values = context.CreateDictsOptionalVariantKey(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(dictType)); @@ -1087,19 +2099,32 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { for (size_t i = 0; i < values.size(); ++i) { auto structArray = static_pointer_cast(listArray->value_slice(i)); UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DOUBLE); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + auto keyStructArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyStructArray->num_fields(), 1); + UNIT_ASSERT(keyStructArray->field(0)->type_id() == arrow::Type::STRUCT); + + auto keyInnerStructArray = static_pointer_cast(keyStructArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyInnerStructArray->num_fields(), 1); + UNIT_ASSERT(keyInnerStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(keyInnerStructArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 2); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::UINT8); + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); } } - Y_UNIT_TEST(NestedType_Optional_Data) { + Y_UNIT_TEST(NestedType_Optional_DataValue) { TTestContext context; - auto optionalType = context.GetDataOptionalType(); - auto values = context.CreateDataOptionals(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalDataValueType(); + auto values = context.CreateOptionalsDataValue(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1114,11 +2139,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_Singular) { + Y_UNIT_TEST(NestedType_Optional_SingularValue) { TTestContext context; - auto optionalType = context.GetSingularOptionalType(); - auto values = context.CreateSingularOptionals(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalSingularValueType(); + auto values = context.CreateOptionalsSingularValueType(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1137,11 +2162,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_Struct) { + Y_UNIT_TEST(NestedType_Optional_StructValue) { TTestContext context; - auto optionalType = context.GetOptionalStructType(); - auto values = context.CreateOptionalStructs(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalStructValueType(); + auto values = context.CreateOptionalsStructValue(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1171,11 +2196,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_Tuple) { + Y_UNIT_TEST(NestedType_Optional_TupleValue) { TTestContext context; - auto optionalType = context.GetOptionalTupleType(); - auto values = context.CreateOptionalTuples(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalTupleValueType(); + auto values = context.CreateOptionalsTupleValue(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1197,11 +2222,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_List) { + Y_UNIT_TEST(NestedType_Optional_ListValue) { TTestContext context; - auto optionalType = context.GetOptionalListType(); - auto values = context.CreateOptionalLists(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalListValueType(); + auto values = context.CreateOptionalsValueList(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1219,11 +2244,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_Dict) { + Y_UNIT_TEST(NestedType_Optional_DictValue) { TTestContext context; - auto optionalType = context.GetOptionalDictType(); - auto values = context.CreateOptionalDicts(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalDictValueType(); + auto values = context.CreateOptionalsDictValue(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1246,11 +2271,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_Variant) { + Y_UNIT_TEST(NestedType_Optional_VariantValue) { TTestContext context; - auto variantType = context.GetOptionalVariantType(); - auto values = context.CreateOptionalVariants(TEST_ARRAY_NESTED_SIZE); + auto variantType = context.GetOptionalVariantValueType(); + auto values = context.CreateOptionalsVariantValue(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(variantType)); @@ -1276,11 +2301,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { }; } - Y_UNIT_TEST(NestedType_Optional_Tagged) { + Y_UNIT_TEST(NestedType_Optional_TaggedValue) { TTestContext context; - auto optionalType = context.GetOptionalTaggedType(); - auto values = context.CreateOptionalTaggeds(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalTaggedValueType(); + auto values = context.CreateOptionalsValueTagged(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1295,11 +2320,11 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } - Y_UNIT_TEST(NestedType_Optional_Optional) { + Y_UNIT_TEST(NestedType_Optional_OptionalValue) { TTestContext context; - auto optionalType = context.GetOptionalOptionalType(); - auto values = context.CreateOptionalOptionals(TEST_ARRAY_NESTED_SIZE); + auto optionalType = context.GetOptionalOptionalValueType(); + auto values = context.CreateOptionalsOptionalValue(TEST_ARRAY_NESTED_SIZE); UNIT_ASSERT(IsArrowCompatible(optionalType)); @@ -1425,7 +2450,86 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } catch (...) {} } - Y_UNIT_TEST(NestedType_Tagged) { + Y_UNIT_TEST(NestedType_Variant_Comprehensive) { + TTestContext context; + + auto variantType = context.GetVariantComprehensiveType(); + auto values = context.CreateVariantsComprehensive(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 10); + + // Field 0: Data (i32) + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + + // Field 1: Optional + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT32); + + // Field 2: Optional> + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::STRUCT); + auto optOptStructArray = static_pointer_cast(unionArray->field(2)); + UNIT_ASSERT_VALUES_EQUAL(optOptStructArray->num_fields(), 1); + UNIT_ASSERT(optOptStructArray->field(0)->type_id() == arrow::Type::INT32); + + // Field 3: Void + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::STRUCT); + auto voidStructArray = static_pointer_cast(unionArray->field(3)); + UNIT_ASSERT_VALUES_EQUAL(voidStructArray->num_fields(), 0); + + // Field 4: Struct + UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(unionArray->field(4)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + // Field 5: Tuple + UNIT_ASSERT(unionArray->field(5)->type_id() == arrow::Type::STRUCT); + auto tupleArray = static_pointer_cast(unionArray->field(5)); + UNIT_ASSERT_VALUES_EQUAL(tupleArray->num_fields(), 3); + UNIT_ASSERT(tupleArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(tupleArray->field(1)->type_id() == arrow::Type::INT8); + UNIT_ASSERT(tupleArray->field(2)->type_id() == arrow::Type::UINT8); + + // Field 6: List + UNIT_ASSERT(unionArray->field(6)->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(unionArray->field(6)); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + // Field 7: Dict + UNIT_ASSERT(unionArray->field(7)->type_id() == arrow::Type::LIST); + auto dictListArray = static_pointer_cast(unionArray->field(7)); + UNIT_ASSERT(dictListArray->value_type()->id() == arrow::Type::STRUCT); + + // Field 8: Variant + UNIT_ASSERT(unionArray->field(8)->type_id() == arrow::Type::DENSE_UNION); + auto nestedUnionArray = static_pointer_cast(unionArray->field(8)); + UNIT_ASSERT_VALUES_EQUAL(nestedUnionArray->num_fields(), 4); + UNIT_ASSERT(nestedUnionArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(nestedUnionArray->field(1)->type_id() == arrow::Type::INT16); + UNIT_ASSERT(nestedUnionArray->field(2)->type_id() == arrow::Type::UINT16); + UNIT_ASSERT(nestedUnionArray->field(3)->type_id() == arrow::Type::INT32); + + // Field 9: Tagged + UNIT_ASSERT(unionArray->field(9)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Tagged_DataValue) { TTestContext context; auto taggedType = context.GetTaggedType(); @@ -1443,6 +2547,201 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); } } + + Y_UNIT_TEST(NestedType_Tagged_StructValue) { + TTestContext context; + + auto taggedType = context.GetTaggedStructValueType(); + auto values = context.CreateTaggedsStructValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_TupleValue) { + TTestContext context; + + auto taggedType = context.GetTaggedTupleValueype(); + auto values = context.CreateTaggedsTupleValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_ListValue) { + TTestContext context; + + auto taggedType = context.GetTaggedListValueType(); + auto values = context.CreateTaggedsValueList(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_DictValue) { + TTestContext context; + + auto taggedType = context.GetTaggedDictValueType(); + auto values = context.CreateTaggedsDictValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_OptionalValue) { + TTestContext context; + + auto taggedType = context.GetTaggedOptionalValueType(); + auto values = context.CreateTaggedsOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_OptionalOptionalValue) { + TTestContext context; + + auto taggedType = context.GetTaggedOptionalOptionalValueType(); + auto values = context.CreateTaggedsOptionalOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_TaggedValue) { + TTestContext context; + + auto taggedType = context.GetTaggedTaggedValueType(); + auto values = context.CreateTaggedsTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_OptionalTaggedValue) { + TTestContext context; + + auto taggedType = context.GetTaggedOptionalTaggedValueType(); + auto values = context.CreateTaggedsOptionalTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_TaggedOptionalTaggedTaggedValue) { + TTestContext context; + + auto taggedType = context.GetTaggedTaggedOptionalTaggedTaggedValueType(); + auto values = context.CreateTaggedsTaggedOptionalTaggedTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } } } // namespace NKikimr::NKqp::NFormats From 0eac064471e50a7fd71bd8bd5bab459814986370 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 7 Nov 2025 22:04:43 +0300 Subject: [PATCH 22/25] Fix bug with external optional --- .../ut/kqp_formats_ut_helpers.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index 788d387d706d..f2f50e92cb62 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -205,12 +205,21 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr ++depth; } + if (innerType->IsOptional()) { // depth + 1 == count of structs for types with validity bitmaps + innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); + ++depth; + } + + auto wrap = NeedWrapByExternalOptional(innerType); + auto isNull = innerArray->IsNull(row); + NUdf::TUnboxedValue value; - if (NeedWrapByExternalOptional(innerType) || !innerArray->IsNull(row)) { + if (wrap || !isNull) { value = NFormats::ExtractUnboxedValue(innerArray, row, innerType, holderFactory); - if (NeedWrapByExternalOptional(innerType)) { - --depth; - } + } + + if (wrap || isNull) { + --depth; } for (int i = 0; i < depth; ++i) { From 97b3ad5745c3f243bd243a18055a308bffee6a11 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Fri, 7 Nov 2025 23:00:46 +0300 Subject: [PATCH 23/25] Update tests with new nested convertations --- .../ut/arrow/kqp_result_set_formats_ut.cpp | 113 ++++-------------- 1 file changed, 26 insertions(+), 87 deletions(-) diff --git a/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp b/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp index f2d15a3a21ff..58916dbee26d 100644 --- a/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp +++ b/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp @@ -1482,7 +1482,7 @@ UuidNotNullValue: [ /** * More tests for different types with correctness and convertations between Arrow and UV : - * ydb/library/yql/dq/runtime/dq_arrow_helpers_ut.cpp + * ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp */ // Optional @@ -1565,58 +1565,8 @@ column1: -- is_valid: all not null } } - // Optional>>> - Y_UNIT_TEST(ArrowFormat_Types_Optional_3) { - auto kikimr = CreateKikimrRunner(/* withSampleTables */ true); - auto client = kikimr.GetQueryClient(); - - { - auto batches = ExecuteAndCombineBatches(client, R"( - SELECT Just(Just(Just(Key1))), Just(Just(Just(Name))) FROM Join2 - WHERE Key1 IN [104, 106, 108] - ORDER BY Key1; - )", /* assertSize */ false, 1); - - UNIT_ASSERT_C(!batches.empty(), "Batches must not be empty"); - - const auto& batch = batches.front(); - - UNIT_ASSERT_VALUES_EQUAL(batch->num_rows(), 3); - UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 2); - - ValidateOptionalColumn(batch->column(0), 3, false); - ValidateOptionalColumn(batch->column(1), 3, false); - - const TString expected = -R"(column0: -- is_valid: all not null - -- child 0 type: struct not null> - -- is_valid: all not null - -- child 0 type: struct - -- is_valid: all not null - -- child 0 type: uint32 - [ - 104, - 106, - 108 - ] -column1: -- is_valid: all not null - -- child 0 type: struct not null> - -- is_valid: all not null - -- child 0 type: struct - -- is_valid: all not null - -- child 0 type: binary - [ - 4E616D6533, - 4E616D6533, - null - ] -)"; - UNIT_ASSERT_VALUES_EQUAL(batch->ToString(), expected); - } - } - // Optional> - Y_UNIT_TEST(ArrowFormat_Types_Optional_4) { + Y_UNIT_TEST(ArrowFormat_Types_Optional_3) { auto kikimr = CreateKikimrRunner(/* withSampleTables */ false); auto client = kikimr.GetQueryClient(); @@ -1656,7 +1606,7 @@ R"(column0: -- is_valid: all not null } // Optional>> - Y_UNIT_TEST(ArrowFormat_Types_Optional_5) { + Y_UNIT_TEST(ArrowFormat_Types_Optional_4) { auto kikimr = CreateKikimrRunner(/* withSampleTables */ false); auto client = kikimr.GetQueryClient(); @@ -1676,7 +1626,7 @@ R"(column0: -- is_valid: all not null const TString expected = R"(column0: -- is_valid: all not null - -- child 0 type: struct not null> + -- child 0 type: struct> -- is_valid: all not null -- child 0 type: dense_union -- is_valid: all not null @@ -1899,29 +1849,24 @@ R"(column0: -- is_valid: all not null UNIT_ASSERT_VALUES_EQUAL(batch->num_rows(), 1); UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 1); - UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::STRUCT, "Column type must be arrow::Type::STRUCT"); + UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::LIST, "Column type must be arrow::Type::LIST"); const TString expected = -R"(column0: -- is_valid: all not null - -- child 0 type: map - [ - keys: +R"(column0: [ + -- is_valid: all not null + -- child 0 type: binary [ 61, 63, 62 ] - values: + -- child 1 type: int32 [ 1, 3, 2 ] - ] - -- child 1 type: uint64 - [ - 0 - ] + ] )"; UNIT_ASSERT_VALUES_EQUAL(batch->ToString(), expected); @@ -1944,30 +1889,24 @@ R"(column0: -- is_valid: all not null UNIT_ASSERT_VALUES_EQUAL(batch->num_rows(), 1); UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 1); - UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::STRUCT, "Column type must be arrow::Type::STRUCT"); + UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::LIST, "Column type must be arrow::Type::LIST"); const TString expected = -R"(column0: -- is_valid: all not null - -- child 0 type: list> - [ - -- is_valid: all not null - -- child 0 type: binary - [ - 61, - 62, - null - ] - -- child 1 type: int32 - [ - 1, - 2, - 3 - ] - ] - -- child 1 type: uint64 - [ - 0 - ] +R"(column0: [ + -- is_valid: all not null + -- child 0 type: binary + [ + 61, + 62, + null + ] + -- child 1 type: int32 + [ + 1, + 2, + 3 + ] + ] )"; UNIT_ASSERT_VALUES_EQUAL(batch->ToString(), expected); From 3ec5e4cd0eb78cc1f5e0ebf790826a6b70dc1ca7 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Mon, 10 Nov 2025 12:23:47 +0300 Subject: [PATCH 24/25] Double optional variant test --- .../ut/kqp_formats_arrow_ut.cpp | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp index 5b43033ce609..4ddc892f6c80 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -818,6 +818,18 @@ struct TTestContext { return values; } + TType* GetOptionalOptionalVariantType() { + return TOptionalType::Create(GetOptionalVariantValueType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsOptionalVariantValue(ui32 quantity) { + TUnboxedValueVector values = CreateOptionalsVariantValue(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 4 != 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + TType* GetTaggedType() { return TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "tag", TypeEnv); } @@ -2343,6 +2355,40 @@ Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { } } + Y_UNIT_TEST(NestedType_Optional_OptionalVariantValue) { + TTestContext context; + + auto optionalType = context.GetOptionalOptionalVariantType(); + auto values = context.CreateOptionalsOptionalVariantValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); + + auto innerStructArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(innerStructArray->num_fields(), 1); + UNIT_ASSERT(innerStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto innerUnionArray = static_pointer_cast(innerStructArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(innerUnionArray->num_fields(), 4); + UNIT_ASSERT(innerUnionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(innerUnionArray->field(1)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(innerUnionArray->field(2)->type_id() == arrow::Type::FLOAT); + UNIT_ASSERT(innerUnionArray->field(3)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + Y_UNIT_TEST(NestedType_Variant_Struct) { TTestContext context; From 4ff3894868fb9bba81a268efd022ca68496f230a Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Mon, 10 Nov 2025 12:24:26 +0300 Subject: [PATCH 25/25] pretty extract for optionals --- .../ut/kqp_formats_ut_helpers.cpp | 73 +++++++++---------- 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp index f2f50e92cb62..cec436ecb94c 100644 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -180,54 +180,47 @@ NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& arr NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, const NMiniKQL::TOptionalType* optionalType, const NMiniKQL::THolderFactory& holderFactory) { - auto innerOptionalType = SkipTaggedType(optionalType->GetItemType()); - if (NeedWrapByExternalOptional(innerOptionalType)) { - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - - auto innerArray = array; - auto innerType = static_cast(optionalType); - int depth = 0; - - while (innerArray->type_id() == arrow::Type::STRUCT) { - auto structArray = static_pointer_cast(innerArray); - YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); - - if (structArray->IsNull(row)) { - NUdf::TUnboxedValue value; - for (int i = 0; i < depth; ++i) { - value = value.MakeOptional(); - } - return value; - } + auto innerType = SkipTaggedType(optionalType->GetItemType()); + ui32 depth = 1; - innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); - innerArray = structArray->field(0); - ++depth; - } + while (innerType->IsOptional()) { + innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); + ++depth; + } - if (innerType->IsOptional()) { // depth + 1 == count of structs for types with validity bitmaps - innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); - ++depth; - } + // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer + // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) + // Thus, the depth == 2 for Optional> type + if (NeedWrapByExternalOptional(innerType)) { + ++depth; + } - auto wrap = NeedWrapByExternalOptional(innerType); - auto isNull = innerArray->IsNull(row); + auto innerArray = array; + NUdf::TUnboxedValue value; - NUdf::TUnboxedValue value; - if (wrap || !isNull) { - value = NFormats::ExtractUnboxedValue(innerArray, row, innerType, holderFactory); - } + for (ui32 i = 1; i < depth; ++i) { + YQL_ENSURE(innerArray->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto structArray = static_pointer_cast(innerArray); + YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); - if (wrap || isNull) { - --depth; + if (structArray->IsNull(row)) { + for (ui32 j = 1; j < i; ++j) { + value = value.MakeOptional(); + } + return value; } - for (int i = 0; i < depth; ++i) { - value = value.MakeOptional(); - } - return value; + innerArray = structArray->field(0); + } + + if (!innerArray->IsNull(row)) { + value = NFormats::ExtractUnboxedValue(innerArray, row, innerType, holderFactory); } - return NFormats::ExtractUnboxedValue(array, row, innerOptionalType, holderFactory).Release().MakeOptional(); + + for (ui32 i = 1; i < depth; ++i) { + value = value.MakeOptional(); + } + return value; } NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row,