From f7a297c4987a77bf3f71eb28325e680fbd5d1e9e Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Fri, 14 Nov 2025 13:56:21 +0000 Subject: [PATCH 1/8] Remove double BinaryJson parsing --- .../accessor/sub_columns/json_extractors.cpp | 29 ++----------------- .../accessor/sub_columns/json_extractors.h | 2 +- yql/essentials/types/binary_json/write.cpp | 28 ++++++++++++++++++ yql/essentials/types/binary_json/write.h | 7 +++++ 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp b/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp index d4b883a04827..8d0667fa1803 100644 --- a/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp +++ b/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp @@ -39,30 +39,11 @@ TConclusionStatus TKVExtractor::DoFill(TDataBuilder& dataBuilder, std::deque>& iterators, const TStringBuf key, NBinaryJson::TEntryCursor& value) const { + std::deque>& iterators, const TStringBuf key, const NBinaryJson::TEntryCursor& value) const { std::variant res; bool addRes = true; - if (value.GetType() == NBinaryJson::EEntryType::String) { - NJson::TJsonValue jsonValue(value.GetString()); - NJsonWriter::TBuf sout; - sout.WriteJsonValue(&jsonValue); - res = NBinaryJson::SerializeToBinaryJson(sout.Str(), false); - } else if (value.GetType() == NBinaryJson::EEntryType::Number) { - const double val = value.GetNumber(); - double integer; - if (modf(val, &integer)) { - res = NBinaryJson::SerializeToBinaryJson(std::to_string(val), false); - } else { - res = NBinaryJson::SerializeToBinaryJson(std::to_string((i64)integer), false); - } - } else if (value.GetType() == NBinaryJson::EEntryType::BoolFalse) { - static const TString falseString = "false"; - res = NBinaryJson::SerializeToBinaryJson(falseString, false); - } else if (value.GetType() == NBinaryJson::EEntryType::BoolTrue) { - static const TString trueString = "true"; - res = NBinaryJson::SerializeToBinaryJson(trueString, false); - } else if (value.GetType() == NBinaryJson::EEntryType::Container) { + if (value.GetType() == NBinaryJson::EEntryType::Container) { auto container = value.GetContainer(); if (FirstLevelOnly || container.GetType() == NBinaryJson::EContainerType::Array) { res = NBinaryJson::SerializeToBinaryJson(NBinaryJson::SerializeToJson(container), false); @@ -76,12 +57,8 @@ TConclusionStatus IJsonObjectExtractor::AddDataToBuilder(TDataBuilder& dataBuild } else { return TConclusionStatus::Fail("unexpected top value scalar in container iterator"); } - - } else if (value.GetType() == NBinaryJson::EEntryType::Null) { - static const TString nullString = "null"; - res = NBinaryJson::SerializeToBinaryJson(nullString, false); } else { - return TConclusionStatus::Fail("unexpected json value type: " + ::ToString((int)value.GetType())); + res = NBinaryJson::SerializeToBinaryJson(value); } if (addRes) { diff --git a/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.h b/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.h index 048288cb0988..3a035eb53a38 100644 --- a/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.h +++ b/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.h @@ -24,7 +24,7 @@ class IJsonObjectExtractor { } [[nodiscard]] TConclusionStatus AddDataToBuilder(TDataBuilder& dataBuilder, std::deque>& iterators, - const TStringBuf key, NBinaryJson::TEntryCursor& value) const; + const TStringBuf key, const NBinaryJson::TEntryCursor& value) const; public: virtual ~IJsonObjectExtractor() = default; diff --git a/yql/essentials/types/binary_json/write.cpp b/yql/essentials/types/binary_json/write.cpp index e830811bd34d..abbead0e66df 100644 --- a/yql/essentials/types/binary_json/write.cpp +++ b/yql/essentials/types/binary_json/write.cpp @@ -764,4 +764,32 @@ TBinaryJson SerializeToBinaryJson(const NUdf::TUnboxedValue& value) { return std::move(serializer).Serialize(); } +std::variant SerializeToBinaryJson(const NBinaryJson::TEntryCursor& value) { + TBinaryJsonCallbacks callbacks(/* throwException */ false, /* allowInf */ false); + + switch (value.GetType()) { + case NBinaryJson::EEntryType::BoolFalse: + callbacks.OnBoolean(false); + break; + case NBinaryJson::EEntryType::BoolTrue: + callbacks.OnBoolean(true); + break; + case NBinaryJson::EEntryType::Null: + callbacks.OnNull(); + break; + case NBinaryJson::EEntryType::String: + callbacks.OnString(value.GetString()); + break; + case NBinaryJson::EEntryType::Number: + callbacks.OnDouble(value.GetNumber()); + break; + case NBinaryJson::EEntryType::Container: + // TODO: Support recursive parsing and returning TBinaryJson + return TString("Container type is not a scalar value"); + } + + TBinaryJsonSerializer serializer(std::move(callbacks).GetResult()); + return std::move(serializer).Serialize(); +} + } // namespace NKikimr::NBinaryJson diff --git a/yql/essentials/types/binary_json/write.h b/yql/essentials/types/binary_json/write.h index 819cb0023759..ea5136891d85 100644 --- a/yql/essentials/types/binary_json/write.h +++ b/yql/essentials/types/binary_json/write.h @@ -1,6 +1,7 @@ #pragma once #include "format.h" +#include "read.h" #include @@ -21,4 +22,10 @@ std::variant SerializeToBinaryJson(const TStringBuf json, * @brief Translates DOM layout from `yql/library/dom` library into BinaryJson */ TBinaryJson SerializeToBinaryJson(const NYql::NUdf::TUnboxedValue& value); + +/** + * @brief Translates read cursor into a separate BinaryJson + */ +std::variant SerializeToBinaryJson(const NBinaryJson::TEntryCursor& value); + } // namespace NKikimr::NBinaryJson From 608b53cc0ae126e8284535c50bae69e82867601e Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Mon, 17 Nov 2025 15:30:29 +0000 Subject: [PATCH 2/8] Add array and object parsing --- .../types/binary_json/ut/extract_ut.cpp | 119 ++++++++++++++++++ yql/essentials/types/binary_json/ut/ya.make | 1 + yql/essentials/types/binary_json/write.cpp | 53 +++++++- 3 files changed, 167 insertions(+), 6 deletions(-) create mode 100644 yql/essentials/types/binary_json/ut/extract_ut.cpp diff --git a/yql/essentials/types/binary_json/ut/extract_ut.cpp b/yql/essentials/types/binary_json/ut/extract_ut.cpp new file mode 100644 index 000000000000..a65c9ba6c645 --- /dev/null +++ b/yql/essentials/types/binary_json/ut/extract_ut.cpp @@ -0,0 +1,119 @@ +#include "test_base.h" + +#include +#include + +using namespace NKikimr::NBinaryJson; + +class TBinaryJsonExtractTest: public TBinaryJsonTestBase { +public: + TBinaryJsonExtractTest() + : TBinaryJsonTestBase() + { + } + + UNIT_TEST_SUITE(TBinaryJsonExtractTest); + UNIT_TEST(TestTopLevelScalar); + UNIT_TEST(TestTopLevelArray); + UNIT_TEST(TestTopLevelMap); + UNIT_TEST(TestComplexCases); + UNIT_TEST_SUITE_END(); + + void TestTopLevelScalar() { + const TVector testCases = { + {"1"}, + {"1.25"}, + {"-2.36"}, + {"0"}, + {"\"string\""}, + {"\"\""}, + {"null"}, + {"true"}, + {"false"}, + }; + + for (const auto& testCase : testCases) { + const auto binaryJson = std::get(SerializeToBinaryJson(testCase)); + const auto reader = TBinaryJsonReader::Make(binaryJson); + const auto container = reader->GetRootCursor(); + const auto cursor = container.GetElement(0); + const auto extractedBinaryJson = std::get(SerializeToBinaryJson(cursor)); + + UNIT_ASSERT_EQUAL(SerializeToJson(binaryJson), SerializeToJson(extractedBinaryJson)); + } + } + + void TestTopLevelArray() { + const TVector> testCases = { + {"[1]", "1"}, + {R"(["string"])", R"("string")"}, + {"[true]", "true"}, + {"[-1.2]", "-1.2"}, + {"[null]", "null"}, + }; + + for (const auto& testCase : testCases) { + const auto originalBinaryJson = std::get(SerializeToBinaryJson(testCase.first)); + const auto reader = TBinaryJsonReader::Make(originalBinaryJson); + const auto container = reader->GetRootCursor(); + const auto cursor = container.GetArrayIterator().Next(); + const auto extractedBinaryJson = std::get(SerializeToBinaryJson(cursor)); + + const auto expectedBinaryJson = std::get(SerializeToBinaryJson(testCase.second)); + + UNIT_ASSERT_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); + } + } + + void TestTopLevelMap() { + const TVector> testCases = { + {R"({"root": "a"})", R"("a")"}, + {R"({"root": ""})", R"("")"}, + {R"({"root": 1})", "1"}, + {R"({"root": -1.2})", "-1.2"}, + {R"({"root": true})", "true"}, + {R"({"root": false})", "false"}, + {R"({"root": null})", "null"}, + {R"({"root": {}})", "{}"}, + {R"({"root": []})", "[]"}, + }; + + for (const auto& testCase : testCases) { + const auto originalBinaryJson = std::get(SerializeToBinaryJson(testCase.first)); + const auto reader = TBinaryJsonReader::Make(originalBinaryJson); + const auto container = reader->GetRootCursor(); + const auto cursor = container.GetObjectIterator().Next(); + const auto extractedBinaryJson = std::get(SerializeToBinaryJson(cursor.second)); + + const auto expectedBinaryJson = std::get(SerializeToBinaryJson(testCase.second)); + + UNIT_ASSERT_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); + } + } + + void TestComplexCases() { + // clang-format off + const TVector> testCases = { + {R"({"root": {"a": "b", "c": {"d": 1, "e": [1, 2.1, -3, "f", true, false, null, {}, {"g": []}]}}})", + R"({"a": "b", "c": {"d": 1, "e": [1, 2.1, -3, "f", true, false, null, {}, {"g": []}]}})"}, + + {R"({"root": [[], [1, 1, 1, 2], {"a": true}, null, [], [[[]]], [[42]]]})", + R"([[], [1, 1, 1, 2], {"a": true}, null, [], [[[]]], [[42]]])"}, + }; + // clang-format on + + for (const auto& testCase : testCases) { + const auto originalBinaryJson = std::get(SerializeToBinaryJson(testCase.first)); + const auto reader = TBinaryJsonReader::Make(originalBinaryJson); + const auto container = reader->GetRootCursor(); + const auto cursor = container.GetObjectIterator().Next(); + const auto extractedBinaryJson = std::get(SerializeToBinaryJson(cursor.second)); + + const auto expectedBinaryJson = std::get(SerializeToBinaryJson(testCase.second)); + + UNIT_ASSERT_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); + } + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TBinaryJsonExtractTest); diff --git a/yql/essentials/types/binary_json/ut/ya.make b/yql/essentials/types/binary_json/ut/ya.make index 49ef9c24a04a..cc312342736c 100644 --- a/yql/essentials/types/binary_json/ut/ya.make +++ b/yql/essentials/types/binary_json/ut/ya.make @@ -6,6 +6,7 @@ SRCS( entry_ut.cpp test_base.cpp valid_ut.cpp + extract_ut.cpp ) IF (SANITIZER_TYPE == "thread" OR WITH_VALGRIND) diff --git a/yql/essentials/types/binary_json/write.cpp b/yql/essentials/types/binary_json/write.cpp index abbead0e66df..0ea9edfc598d 100644 --- a/yql/essentials/types/binary_json/write.cpp +++ b/yql/essentials/types/binary_json/write.cpp @@ -764,9 +764,7 @@ TBinaryJson SerializeToBinaryJson(const NUdf::TUnboxedValue& value) { return std::move(serializer).Serialize(); } -std::variant SerializeToBinaryJson(const NBinaryJson::TEntryCursor& value) { - TBinaryJsonCallbacks callbacks(/* throwException */ false, /* allowInf */ false); - +TString SerializeReadCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const NBinaryJson::TEntryCursor& value) { switch (value.GetType()) { case NBinaryJson::EEntryType::BoolFalse: callbacks.OnBoolean(false); @@ -783,9 +781,52 @@ std::variant SerializeToBinaryJson(const NBinaryJson::TEnt case NBinaryJson::EEntryType::Number: callbacks.OnDouble(value.GetNumber()); break; - case NBinaryJson::EEntryType::Container: - // TODO: Support recursive parsing and returning TBinaryJson - return TString("Container type is not a scalar value"); + case NBinaryJson::EEntryType::Container: { + auto container = value.GetContainer(); + if (container.GetType() == NBinaryJson::EContainerType::Array) { + callbacks.OnOpenArray(); + + auto it = container.GetArrayIterator(); + while (it.HasNext()) { + auto value = it.Next(); + if (auto error = SerializeReadCursorToBinaryJson(callbacks, value); !error.empty()) { + return error; + } + } + + callbacks.OnCloseArray(); + + } else if (container.GetType() == NBinaryJson::EContainerType::Object) { + callbacks.OnOpenMap(); + + auto it = container.GetObjectIterator(); + while (it.HasNext()) { + auto [key, value] = it.Next(); + if (key.GetType() != NBinaryJson::EEntryType::String) { + return TString("Unexpected non-string key"); + } + + callbacks.OnMapKey(key.GetString()); + if (auto error = SerializeReadCursorToBinaryJson(callbacks, value); !error.empty()) { + return error; + } + } + + callbacks.OnCloseMap(); + } else { + return TString("Unexpected top value scalar in container iterator"); + } + } + } + + return {}; +} + +std::variant SerializeToBinaryJson(const NBinaryJson::TEntryCursor& value) { + TBinaryJsonCallbacks callbacks(/* throwException */ false, /* allowInf */ false); + + if (auto error = SerializeReadCursorToBinaryJson(callbacks, value); !error.empty()) { + return error; } TBinaryJsonSerializer serializer(std::move(callbacks).GetResult()); From 1fba8e70edcfd5d6a82b9e0ac54cb3c3b98d7ac2 Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Mon, 17 Nov 2025 15:33:50 +0000 Subject: [PATCH 3/8] Refactoring --- yql/essentials/types/binary_json/write.cpp | 80 +++++++++++----------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/yql/essentials/types/binary_json/write.cpp b/yql/essentials/types/binary_json/write.cpp index 0ea9edfc598d..7f4a08fa6cca 100644 --- a/yql/essentials/types/binary_json/write.cpp +++ b/yql/essentials/types/binary_json/write.cpp @@ -727,44 +727,8 @@ template return simdjson::SUCCESS; #undef RETURN_IF_NOT_SUCCESS } -} // namespace - -std::variant SerializeToBinaryJsonImpl(const TStringBuf json, bool allowInf) { - std::variant res; - TBinaryJsonCallbacks callbacks(/* throwException */ false, allowInf); - const simdjson::padded_string paddedJson(json); - simdjson::ondemand::parser parser; - try { - auto doc = parser.iterate(paddedJson); - if (auto status = doc.error(); status != simdjson::SUCCESS) { - res = TString(simdjson::error_message(status)); - return res; - } - if (auto status = SimdJsonToJsonIndex(doc.value_unsafe(), callbacks); status != simdjson::SUCCESS) { - res = TString(simdjson::error_message(status)); - return res; - } - } catch (const simdjson::simdjson_error& e) { - res = TString(e.what()); - return res; - } - TBinaryJsonSerializer serializer(std::move(callbacks).GetResult()); - res = std::move(serializer).Serialize(); - return res; -} - -std::variant SerializeToBinaryJson(const TStringBuf json, bool allowInf) { - return SerializeToBinaryJsonImpl(json, allowInf); -} - -TBinaryJson SerializeToBinaryJson(const NUdf::TUnboxedValue& value) { - TBinaryJsonCallbacks callbacks(/* throwException */ false, /* allowInf */ false); - DomToJsonIndex(value, callbacks); - TBinaryJsonSerializer serializer(std::move(callbacks).GetResult()); - return std::move(serializer).Serialize(); -} -TString SerializeReadCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const NBinaryJson::TEntryCursor& value) { +TString SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const NBinaryJson::TEntryCursor& value) { switch (value.GetType()) { case NBinaryJson::EEntryType::BoolFalse: callbacks.OnBoolean(false); @@ -789,7 +753,7 @@ TString SerializeReadCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const N auto it = container.GetArrayIterator(); while (it.HasNext()) { auto value = it.Next(); - if (auto error = SerializeReadCursorToBinaryJson(callbacks, value); !error.empty()) { + if (auto error = SerializeEntryCursorToBinaryJson(callbacks, value); !error.empty()) { return error; } } @@ -807,7 +771,7 @@ TString SerializeReadCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const N } callbacks.OnMapKey(key.GetString()); - if (auto error = SerializeReadCursorToBinaryJson(callbacks, value); !error.empty()) { + if (auto error = SerializeEntryCursorToBinaryJson(callbacks, value); !error.empty()) { return error; } } @@ -821,11 +785,47 @@ TString SerializeReadCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const N return {}; } +} // namespace + +std::variant SerializeToBinaryJsonImpl(const TStringBuf json, bool allowInf) { + std::variant res; + TBinaryJsonCallbacks callbacks(/* throwException */ false, allowInf); + const simdjson::padded_string paddedJson(json); + simdjson::ondemand::parser parser; + try { + auto doc = parser.iterate(paddedJson); + if (auto status = doc.error(); status != simdjson::SUCCESS) { + res = TString(simdjson::error_message(status)); + return res; + } + if (auto status = SimdJsonToJsonIndex(doc.value_unsafe(), callbacks); status != simdjson::SUCCESS) { + res = TString(simdjson::error_message(status)); + return res; + } + } catch (const simdjson::simdjson_error& e) { + res = TString(e.what()); + return res; + } + TBinaryJsonSerializer serializer(std::move(callbacks).GetResult()); + res = std::move(serializer).Serialize(); + return res; +} + +std::variant SerializeToBinaryJson(const TStringBuf json, bool allowInf) { + return SerializeToBinaryJsonImpl(json, allowInf); +} + +TBinaryJson SerializeToBinaryJson(const NUdf::TUnboxedValue& value) { + TBinaryJsonCallbacks callbacks(/* throwException */ false, /* allowInf */ false); + DomToJsonIndex(value, callbacks); + TBinaryJsonSerializer serializer(std::move(callbacks).GetResult()); + return std::move(serializer).Serialize(); +} std::variant SerializeToBinaryJson(const NBinaryJson::TEntryCursor& value) { TBinaryJsonCallbacks callbacks(/* throwException */ false, /* allowInf */ false); - if (auto error = SerializeReadCursorToBinaryJson(callbacks, value); !error.empty()) { + if (auto error = SerializeEntryCursorToBinaryJson(callbacks, value); !error.empty()) { return error; } From dfd73df8e6b74868d4d92480c0952f6f0f525ce4 Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Tue, 18 Nov 2025 08:59:18 +0000 Subject: [PATCH 4/8] Fix review notes --- yql/essentials/types/binary_json/ut/extract_ut.cpp | 9 +++++++++ yql/essentials/types/binary_json/write.cpp | 3 +++ 2 files changed, 12 insertions(+) diff --git a/yql/essentials/types/binary_json/ut/extract_ut.cpp b/yql/essentials/types/binary_json/ut/extract_ut.cpp index a65c9ba6c645..e94d857a27a7 100644 --- a/yql/essentials/types/binary_json/ut/extract_ut.cpp +++ b/yql/essentials/types/binary_json/ut/extract_ut.cpp @@ -13,12 +13,21 @@ class TBinaryJsonExtractTest: public TBinaryJsonTestBase { } UNIT_TEST_SUITE(TBinaryJsonExtractTest); + UNIT_TEST(TestInvalidCursor); UNIT_TEST(TestTopLevelScalar); UNIT_TEST(TestTopLevelArray); UNIT_TEST(TestTopLevelMap); UNIT_TEST(TestComplexCases); UNIT_TEST_SUITE_END(); + void TestInvalidCursor() { + TEntryCursor invalidCursor({}, TEntry(EEntryType(31), 0)); + auto result = SerializeToBinaryJson(invalidCursor); + UNIT_ASSERT(std::holds_alternative(result)); + + UNIT_ASSERT_EQUAL("Unexpected entry type", std::get(result)); + } + void TestTopLevelScalar() { const TVector testCases = { {"1"}, diff --git a/yql/essentials/types/binary_json/write.cpp b/yql/essentials/types/binary_json/write.cpp index 7f4a08fa6cca..be9988adda15 100644 --- a/yql/essentials/types/binary_json/write.cpp +++ b/yql/essentials/types/binary_json/write.cpp @@ -780,7 +780,10 @@ TString SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const } else { return TString("Unexpected top value scalar in container iterator"); } + break; } + default: + return TString("Unexpected entry type"); } return {}; From 10a497b9c5889a49bef8a96f865eaace56915b9c Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Tue, 18 Nov 2025 09:13:15 +0000 Subject: [PATCH 5/8] Fix review notes --- yql/essentials/types/binary_json/ut/extract_ut.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yql/essentials/types/binary_json/ut/extract_ut.cpp b/yql/essentials/types/binary_json/ut/extract_ut.cpp index e94d857a27a7..f2afc7bc06aa 100644 --- a/yql/essentials/types/binary_json/ut/extract_ut.cpp +++ b/yql/essentials/types/binary_json/ut/extract_ut.cpp @@ -59,6 +59,8 @@ class TBinaryJsonExtractTest: public TBinaryJsonTestBase { {"[true]", "true"}, {"[-1.2]", "-1.2"}, {"[null]", "null"}, + {"[[1,2]]", "[1,2]"}, + {R"([{"a": "b"}])", R"({"a": "b"})"}, }; for (const auto& testCase : testCases) { From 85196e1d44c6b0290fc045e0abd6007285358eb1 Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Wed, 19 Nov 2025 09:00:57 +0000 Subject: [PATCH 6/8] Fix review notes --- .../types/binary_json/ut/extract_ut.cpp | 12 ++++----- yql/essentials/types/binary_json/write.cpp | 26 ++++++++----------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/yql/essentials/types/binary_json/ut/extract_ut.cpp b/yql/essentials/types/binary_json/ut/extract_ut.cpp index f2afc7bc06aa..2e4e47fb143d 100644 --- a/yql/essentials/types/binary_json/ut/extract_ut.cpp +++ b/yql/essentials/types/binary_json/ut/extract_ut.cpp @@ -21,11 +21,11 @@ class TBinaryJsonExtractTest: public TBinaryJsonTestBase { UNIT_TEST_SUITE_END(); void TestInvalidCursor() { - TEntryCursor invalidCursor({}, TEntry(EEntryType(31), 0)); + TEntryCursor invalidCursor({}, TEntry(EEntryType(15), 0)); auto result = SerializeToBinaryJson(invalidCursor); UNIT_ASSERT(std::holds_alternative(result)); - UNIT_ASSERT_EQUAL("Unexpected entry type", std::get(result)); + UNIT_ASSERT_VALUES_EQUAL("Undefined value 15 in NKikimr::NBinaryJson::EEntryType. ", std::get(result)); } void TestTopLevelScalar() { @@ -48,7 +48,7 @@ class TBinaryJsonExtractTest: public TBinaryJsonTestBase { const auto cursor = container.GetElement(0); const auto extractedBinaryJson = std::get(SerializeToBinaryJson(cursor)); - UNIT_ASSERT_EQUAL(SerializeToJson(binaryJson), SerializeToJson(extractedBinaryJson)); + UNIT_ASSERT_VALUES_EQUAL(SerializeToJson(binaryJson), SerializeToJson(extractedBinaryJson)); } } @@ -72,7 +72,7 @@ class TBinaryJsonExtractTest: public TBinaryJsonTestBase { const auto expectedBinaryJson = std::get(SerializeToBinaryJson(testCase.second)); - UNIT_ASSERT_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); + UNIT_ASSERT_VALUES_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); } } @@ -98,7 +98,7 @@ class TBinaryJsonExtractTest: public TBinaryJsonTestBase { const auto expectedBinaryJson = std::get(SerializeToBinaryJson(testCase.second)); - UNIT_ASSERT_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); + UNIT_ASSERT_VALUES_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); } } @@ -122,7 +122,7 @@ class TBinaryJsonExtractTest: public TBinaryJsonTestBase { const auto expectedBinaryJson = std::get(SerializeToBinaryJson(testCase.second)); - UNIT_ASSERT_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); + UNIT_ASSERT_VALUES_EQUAL(SerializeToJson(expectedBinaryJson), SerializeToJson(extractedBinaryJson)); } } }; diff --git a/yql/essentials/types/binary_json/write.cpp b/yql/essentials/types/binary_json/write.cpp index be9988adda15..61d947e9403b 100644 --- a/yql/essentials/types/binary_json/write.cpp +++ b/yql/essentials/types/binary_json/write.cpp @@ -728,7 +728,7 @@ template #undef RETURN_IF_NOT_SUCCESS } -TString SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const NBinaryJson::TEntryCursor& value) { +void SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const NBinaryJson::TEntryCursor& value) { switch (value.GetType()) { case NBinaryJson::EEntryType::BoolFalse: callbacks.OnBoolean(false); @@ -753,9 +753,7 @@ TString SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const auto it = container.GetArrayIterator(); while (it.HasNext()) { auto value = it.Next(); - if (auto error = SerializeEntryCursorToBinaryJson(callbacks, value); !error.empty()) { - return error; - } + SerializeEntryCursorToBinaryJson(callbacks, value); } callbacks.OnCloseArray(); @@ -767,26 +765,22 @@ TString SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const while (it.HasNext()) { auto [key, value] = it.Next(); if (key.GetType() != NBinaryJson::EEntryType::String) { - return TString("Unexpected non-string key"); + ythrow yexception() << "Unexpected non-string key: " << key.GetType(); } callbacks.OnMapKey(key.GetString()); - if (auto error = SerializeEntryCursorToBinaryJson(callbacks, value); !error.empty()) { - return error; - } + SerializeEntryCursorToBinaryJson(callbacks, value); } callbacks.OnCloseMap(); } else { - return TString("Unexpected top value scalar in container iterator"); + ythrow yexception() << "Unexpected type in container iterator: " << container.GetType(); } break; } default: - return TString("Unexpected entry type"); + ythrow yexception() << "Unexpected entry type: " << value.GetType(); } - - return {}; } } // namespace @@ -826,10 +820,12 @@ TBinaryJson SerializeToBinaryJson(const NUdf::TUnboxedValue& value) { } std::variant SerializeToBinaryJson(const NBinaryJson::TEntryCursor& value) { - TBinaryJsonCallbacks callbacks(/* throwException */ false, /* allowInf */ false); + TBinaryJsonCallbacks callbacks(/* throwException */ true, /* allowInf */ false); - if (auto error = SerializeEntryCursorToBinaryJson(callbacks, value); !error.empty()) { - return error; + try { + SerializeEntryCursorToBinaryJson(callbacks, value); + } catch (const yexception& ex) { + return TString(ex.what()); } TBinaryJsonSerializer serializer(std::move(callbacks).GetResult()); From c8f8bb1d33e42462b341e81cc470726940cba732 Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Wed, 19 Nov 2025 09:54:39 +0000 Subject: [PATCH 7/8] Fix review notes --- yql/essentials/types/binary_json/write.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yql/essentials/types/binary_json/write.cpp b/yql/essentials/types/binary_json/write.cpp index 61d947e9403b..1d753cd19486 100644 --- a/yql/essentials/types/binary_json/write.cpp +++ b/yql/essentials/types/binary_json/write.cpp @@ -765,7 +765,7 @@ void SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const NBi while (it.HasNext()) { auto [key, value] = it.Next(); if (key.GetType() != NBinaryJson::EEntryType::String) { - ythrow yexception() << "Unexpected non-string key: " << key.GetType(); + throw yexception() << "Unexpected non-string key: " << key.GetType(); } callbacks.OnMapKey(key.GetString()); @@ -774,12 +774,12 @@ void SerializeEntryCursorToBinaryJson(TBinaryJsonCallbacks& callbacks, const NBi callbacks.OnCloseMap(); } else { - ythrow yexception() << "Unexpected type in container iterator: " << container.GetType(); + throw yexception() << "Unexpected type in container iterator: " << container.GetType(); } break; } default: - ythrow yexception() << "Unexpected entry type: " << value.GetType(); + throw yexception() << "Unexpected entry type: " << value.GetType(); } } } // namespace From db6d2f64974bfe417fa1557ec4924b3a334ac87b Mon Sep 17 00:00:00 2001 From: Vladilen Muzychenko Date: Fri, 21 Nov 2025 11:10:20 +0000 Subject: [PATCH 8/8] Fix first level double parsing --- ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp b/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp index 8d0667fa1803..e89217a46c15 100644 --- a/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp +++ b/ydb/core/formats/arrow/accessor/sub_columns/json_extractors.cpp @@ -46,7 +46,7 @@ TConclusionStatus IJsonObjectExtractor::AddDataToBuilder(TDataBuilder& dataBuild if (value.GetType() == NBinaryJson::EEntryType::Container) { auto container = value.GetContainer(); if (FirstLevelOnly || container.GetType() == NBinaryJson::EContainerType::Array) { - res = NBinaryJson::SerializeToBinaryJson(NBinaryJson::SerializeToJson(container), false); + res = NBinaryJson::SerializeToBinaryJson(value); // TODO: add support for arrays if needed // } else if (container.GetType() == NBinaryJson::EContainerType::Array) { // iterators.emplace_back(std::make_unique(container.GetArrayIterator(), key));