From 361f546fded2ed899630b68b58d6113777a9a9f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lisen=20=E6=9D=A8?= Date: Tue, 30 Jan 2024 14:36:39 +0800 Subject: [PATCH 1/5] support tracking bucket changes in two level hash table --- src/Common/HashTable/TimeBucketHashMap.h | 14 ++++ src/Common/HashTable/TimeBucketHashTable.h | 57 ++++++++++++++++ src/Common/HashTable/TwoLevelHashMap.h | 14 ++++ src/Common/HashTable/TwoLevelHashTable.h | 46 +++++++++++++ src/Common/HashTable/TwoLevelStringHashMap.h | 14 ++++ .../HashTable/TwoLevelStringHashTable.h | 66 +++++++++++++++++++ 6 files changed, 211 insertions(+) diff --git a/src/Common/HashTable/TimeBucketHashMap.h b/src/Common/HashTable/TimeBucketHashMap.h index 172d1d1c192..685ede30af4 100644 --- a/src/Common/HashTable/TimeBucketHashMap.h +++ b/src/Common/HashTable/TimeBucketHashMap.h @@ -33,6 +33,20 @@ class TimeBucketHashMapTable p.second.forEachValue(func); } + template + void ALWAYS_INLINE forEachValueOfUpdatedBuckets(Func && func, bool reset_updated = false) + { + for (auto & p : this->impls) + { + if (this->isUpdatedBucket(p.first)) + { + p.second.forEachValue(func); + if (reset_updated) + this->resetUpdated(p.first); + } + } + } + typename Cell::Mapped & ALWAYS_INLINE operator[](const Key & x) { LookupResult it; diff --git a/src/Common/HashTable/TimeBucketHashTable.h b/src/Common/HashTable/TimeBucketHashTable.h index 77c1cdbe8aa..9bff2271aa3 100644 --- a/src/Common/HashTable/TimeBucketHashTable.h +++ b/src/Common/HashTable/TimeBucketHashTable.h @@ -108,7 +108,9 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty using ConstLookupResult = typename Impl::ConstLookupResult; /// FIXME, choose a better perf data structure + /// Usually we don't have too many time buckets std::map impls; + std::unordered_map bucket_updated_flags; Impl sentinel; TimeBucketHashTable() { } @@ -263,6 +265,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty { auto window = windowKey(key_holder); impls[window].emplace(key_holder, it, inserted, hash_value); + bucket_updated_flags[window] = true; /// updated } LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) @@ -289,6 +292,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty { DB::writeIntBinary(p.first); p.second.write(wb); + DB::writeBoolText(bucket_updated_flags[p.first], wb); } } @@ -309,7 +313,12 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty /// Write key and key-value separator DB::writeIntText(p.first, wb); DB::writeChar(KEY_VALUE_SEPARATOR, wb); + /// + DB::writeChar('<', wb); p.second.writeText(wb); + DB::writeChar(',', wb); + DB::writeBoolText(bucket_updated_flags[p.first], wb); + DB::writeChar('>', wb); } DB::writeChar(END_BUCKET_MARKER, wb); } @@ -327,6 +336,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty assert(key != 0); assert(!impls.contains(key)); impls[key].read(rb); + DB::readBoolText(bucket_updated_flags[key], rb); } } @@ -349,7 +359,12 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty assert(key != 0); assert(!impls.contains(key)); + /// + DB::assertChar('<', rb); impls[key].readText(rb); + DB::assertChar(',', rb); + DB::readBoolText(bucket_updated_flags[key], rb); + DB::assertChar('>', rb); } DB::assertChar(END_BUCKET_MARKER, rb); } @@ -402,6 +417,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty last_removed_watermark = it->first; ++removed; + 
bucket_updated_flags.erase(it->first); it = impls.erase(it); } else @@ -438,4 +454,45 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty return buckets; } + + bool isUpdatedBucket(Int64 bucket_) const + { + auto it = bucket_updated_flags.find(bucket_); + if (it != bucket_updated_flags.end()) + return it->second; + + return false; + } + + void resetUpdated(Int64 bucket_) + { + auto it = bucket_updated_flags.find(bucket_); + if (it != bucket_updated_flags.end()) + it->second = false; + } + + void writeBucketUpdatedFlags(DB::WriteBuffer & wb) const + { + DB::writeVarUInt(bucket_updated_flags.size(), wb); + for (const auto & [bucket, updated] : bucket_updated_flags) + { + DB::writeIntBinary(bucket, wb); + DB::writeBoolText(updated, wb); + } + } + + void readBucketUpdatedFlags(DB::ReadBuffer & rb) + { + size_t size = 0; + DB::readVarUInt(size, rb); + bucket_updated_flags.clear(); + Int64 bucket = 0; + bool updated = false; + for (size_t i = 0; i < size; ++i) + { + DB::readIntBinary(bucket, rb); + DB::readBoolText(updated, rb); + bucket_updated_flags.emplace(bucket, updated); + } + } }; diff --git a/src/Common/HashTable/TwoLevelHashMap.h b/src/Common/HashTable/TwoLevelHashMap.h index 3e618ca0a50..5c87d5e6eb0 100644 --- a/src/Common/HashTable/TwoLevelHashMap.h +++ b/src/Common/HashTable/TwoLevelHashMap.h @@ -38,6 +38,20 @@ class TwoLevelHashMapTable : public TwoLevelHashTableimpls[i].forEachValue(func); } + template + void ALWAYS_INLINE forEachValueOfUpdatedBuckets(Func && func, bool reset_updated = false) + { + for (auto i = 0u; i < this->NUM_BUCKETS; ++i) + { + if (this->isUpdatedBucket(i)) + { + this->impls[i].forEachValue(func); + if (reset_updated) + this->resetUpdated(i); + } + } + } + template void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func) { diff --git a/src/Common/HashTable/TwoLevelHashTable.h b/src/Common/HashTable/TwoLevelHashTable.h index 7e865cb48da..4dd13e6e7e4 100644 --- a/src/Common/HashTable/TwoLevelHashTable.h +++ b/src/Common/HashTable/TwoLevelHashTable.h @@ -90,6 +90,7 @@ class TwoLevelHashTable : using ConstLookupResult = typename Impl::ConstLookupResult; Impl impls[NUM_BUCKETS]; + bool bucket_updated_flags[NUM_BUCKETS] = {false}; TwoLevelHashTable() = default; @@ -119,6 +120,7 @@ class TwoLevelHashTable : size_t hash_value = cell->getHash(src); size_t buck = getBucketFromHash(hash_value); impls[buck].insertUniqueNonZero(cell, hash_value); + bucket_updated_flags[buck] = true; } } @@ -271,6 +273,7 @@ class TwoLevelHashTable : { size_t buck = getBucketFromHash(hash_value); impls[buck].emplace(key_holder, it, inserted, hash_value); + bucket_updated_flags[buck] = true; } LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) @@ -292,7 +295,10 @@ class TwoLevelHashTable : void write(DB::WriteBuffer & wb) const { for (UInt32 i = 0; i < NUM_BUCKETS; ++i) + { impls[i].write(wb); + DB::writeBoolText(bucket_updated_flags[i], wb); + } } void writeText(DB::WriteBuffer & wb) const @@ -301,14 +307,22 @@ class TwoLevelHashTable : { if (i != 0) DB::writeChar(',', wb); + /// + DB::writeChar('<', wb); impls[i].writeText(wb); + DB::writeChar(',', wb); + DB::writeBoolText(bucket_updated_flags[i], wb); + DB::writeChar('>', wb); } } void read(DB::ReadBuffer & rb) { for (UInt32 i = 0; i < NUM_BUCKETS; ++i) + { impls[i].read(rb); + DB::readBoolText(bucket_updated_flags[i], rb); + } } void readText(DB::ReadBuffer & rb) @@ -317,7 +331,13 @@ class TwoLevelHashTable : { if (i != 0) DB::assertChar(',', rb); + + /// + DB::assertChar('<', rb); 
impls[i].readText(rb); + DB::assertChar(',', rb); + DB::readBoolText(bucket_updated_flags[i], rb); + DB::assertChar('>', rb); } } @@ -365,5 +385,31 @@ class TwoLevelHashTable : std::iota(bucket_ids.begin(), bucket_ids.end(), 0); return bucket_ids; } + + bool isUpdatedBucket(Int64 bucket_) const + { + return bucket_updated_flags[bucket_]; + } + + void resetUpdated(Int64 bucket_) + { + bucket_updated_flags[bucket_] = false; + } + + void writeBucketUpdatedFlags(DB::WriteBuffer & wb) const + { + DB::writeVarUInt(NUM_BUCKETS, wb); + for (const auto & elem : bucket_updated_flags) + DB::writeBoolText(elem, wb); + } + + void readBucketUpdatedFlags(DB::ReadBuffer & rb) + { + size_t size = 0; + DB::readVarUInt(size, rb); + assert(size == NUM_BUCKETS); + for (auto & elem : bucket_updated_flags) + DB::readBoolText(elem, rb); + } /// proton : ends }; diff --git a/src/Common/HashTable/TwoLevelStringHashMap.h b/src/Common/HashTable/TwoLevelStringHashMap.h index a351543edb0..9f2c5ba00d3 100644 --- a/src/Common/HashTable/TwoLevelStringHashMap.h +++ b/src/Common/HashTable/TwoLevelStringHashMap.h @@ -29,6 +29,20 @@ class TwoLevelStringHashMap : public TwoLevelStringHashTableimpls[i].forEachValue(func); } + template + void ALWAYS_INLINE forEachValueOfUpdatedBuckets(Func && func, bool reset_updated = false) + { + for (auto i = 0u; i < this->NUM_BUCKETS; ++i) + { + if (this->isUpdatedBucket(i)) + { + this->impls[i].forEachValue(func); + if (reset_updated) + this->resetUpdated(i); + } + } + } + template void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func) { diff --git a/src/Common/HashTable/TwoLevelStringHashTable.h b/src/Common/HashTable/TwoLevelStringHashTable.h index e0485f5aaa6..e74ae676143 100644 --- a/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/src/Common/HashTable/TwoLevelStringHashTable.h @@ -39,6 +39,7 @@ class TwoLevelStringHashTable : private boost::noncopyable using ConstLookupResult = typename Impl::ConstLookupResult; Impl impls[NUM_BUCKETS]; + bool bucket_updated_flags[NUM_BUCKETS] = {false}; TwoLevelStringHashTable() {} @@ -53,24 +54,28 @@ class TwoLevelStringHashTable : private boost::noncopyable size_t hash_value = v.getHash(src.m1); size_t buck = getBucketFromHash(hash_value); impls[buck].m1.insertUniqueNonZero(&v, hash_value); + bucket_updated_flags[buck] = true; } for (auto & v : src.m2) { size_t hash_value = v.getHash(src.m2); size_t buck = getBucketFromHash(hash_value); impls[buck].m2.insertUniqueNonZero(&v, hash_value); + bucket_updated_flags[buck] = true; } for (auto & v : src.m3) { size_t hash_value = v.getHash(src.m3); size_t buck = getBucketFromHash(hash_value); impls[buck].m3.insertUniqueNonZero(&v, hash_value); + bucket_updated_flags[buck] = true; } for (auto & v : src.ms) { size_t hash_value = v.getHash(src.ms); size_t buck = getBucketFromHash(hash_value); impls[buck].ms.insertUniqueNonZero(&v, hash_value); + bucket_updated_flags[buck] = true; } } @@ -84,6 +89,9 @@ class TwoLevelStringHashTable : private boost::noncopyable const size_t sz = x.size; if (sz == 0) { + if constexpr (std::is_same_v) + self.bucket_updated_flags[0] = true; + keyHolderDiscardKey(key_holder); return func(self.impls[0].m0, VoidKey{}, 0); } @@ -94,6 +102,9 @@ class TwoLevelStringHashTable : private boost::noncopyable // string keys. Put them to the generic table. 
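For context, a minimal sketch of how the updated-bucket tracking added above is meant to be consumed (illustrative only; the map type, the summing logic and the include are assumptions, not part of this patch). The idea is to visit only the buckets whose flag was set by emplace()/mergeTo() since the last finalization, and to clear those flags afterwards so the next round revisits only freshly changed buckets.

#include <Common/HashTable/TwoLevelHashMap.h>

/// Sum the values of groups living in buckets that changed since the previous
/// call, then mark those buckets clean again.
UInt64 sumChangedGroups(TwoLevelHashMap<UInt64, UInt64> & map)
{
    UInt64 sum = 0;
    map.forEachValueOfUpdatedBuckets(
        [&](const auto & /*key*/, auto & value) { sum += value; },
        /*reset_updated=*/ true);
    return sum;
}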
auto res = hash(x); auto buck = getBucketFromHash(res); + if constexpr (std::is_same_v) + self.bucket_updated_flags[buck] = true; + return func(self.impls[buck].ms, std::forward(key_holder), res); } @@ -126,6 +137,9 @@ class TwoLevelStringHashTable : private boost::noncopyable } auto res = hash(k8); auto buck = getBucketFromHash(res); + if constexpr (std::is_same_v) + self.bucket_updated_flags[buck] = true; + keyHolderDiscardKey(key_holder); return func(self.impls[buck].m1, k8, res); } @@ -137,6 +151,9 @@ class TwoLevelStringHashTable : private boost::noncopyable n[1] >>= s; auto res = hash(k16); auto buck = getBucketFromHash(res); + if constexpr (std::is_same_v) + self.bucket_updated_flags[buck] = true; + keyHolderDiscardKey(key_holder); return func(self.impls[buck].m2, k16, res); } @@ -148,6 +165,9 @@ class TwoLevelStringHashTable : private boost::noncopyable n[2] >>= s; auto res = hash(k24); auto buck = getBucketFromHash(res); + if constexpr (std::is_same_v) + self.bucket_updated_flags[buck] = true; + keyHolderDiscardKey(key_holder); return func(self.impls[buck].m3, k24, res); } @@ -155,6 +175,9 @@ class TwoLevelStringHashTable : private boost::noncopyable { auto res = hash(x); auto buck = getBucketFromHash(res); + if constexpr (std::is_same_v) + self.bucket_updated_flags[buck] = true; + return func(self.impls[buck].ms, std::forward(key_holder), res); } } @@ -179,7 +202,10 @@ class TwoLevelStringHashTable : private boost::noncopyable void write(DB::WriteBuffer & wb) const { for (UInt32 i = 0; i < NUM_BUCKETS; ++i) + { impls[i].write(wb); + DB::writeBoolText(bucket_updated_flags[i], wb); + } } void writeText(DB::WriteBuffer & wb) const @@ -188,14 +214,22 @@ class TwoLevelStringHashTable : private boost::noncopyable { if (i != 0) DB::writeChar(',', wb); + /// + DB::writeChar('<', wb); impls[i].writeText(wb); + DB::writeChar(',', wb); + DB::writeBoolText(bucket_updated_flags[i], wb); + DB::writeChar('>', wb); } } void read(DB::ReadBuffer & rb) { for (UInt32 i = 0; i < NUM_BUCKETS; ++i) + { impls[i].read(rb); + DB::readBoolText(bucket_updated_flags[i], rb); + } } void readText(DB::ReadBuffer & rb) @@ -205,6 +239,12 @@ class TwoLevelStringHashTable : private boost::noncopyable if (i != 0) DB::assertChar(',', rb); impls[i].readText(rb); + /// + DB::assertChar('<', rb); + impls[i].readText(rb); + DB::assertChar(',', rb); + DB::readBoolText(bucket_updated_flags[i], rb); + DB::assertChar('>', rb); } } @@ -252,4 +292,30 @@ class TwoLevelStringHashTable : private boost::noncopyable std::iota(bucket_ids.begin(), bucket_ids.end(), 0); return bucket_ids; } + + bool isUpdatedBucket(Int64 bucket_) const + { + return bucket_updated_flags[bucket_]; + } + + void resetUpdated(Int64 bucket_) + { + bucket_updated_flags[bucket_] = false; + } + + void writeBucketUpdatedFlags(DB::WriteBuffer & wb) const + { + DB::writeVarUInt(NUM_BUCKETS, wb); + for (const auto & elem : bucket_updated_flags) + DB::writeBoolText(elem, wb); + } + + void readBucketUpdatedFlags(DB::ReadBuffer & rb) + { + size_t size = 0; + DB::readVarUInt(size, rb); + assert(size == NUM_BUCKETS); + for (auto & elem : bucket_updated_flags) + DB::readBoolText(elem, rb); + } }; From a298a8e166c201069fa9decd3d345606ae1dd3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lisen=20=E6=9D=A8?= Date: Tue, 30 Jan 2024 14:44:53 +0800 Subject: [PATCH 2/5] add expanded data in aggregate state to track updated and retracted --- cmake/autogenerated_versions.txt | 2 +- src/Common/HashMapsTemplate.h | 26 +- src/Common/serde.h | 21 + 
src/Interpreters/InterpreterSelectQuery.cpp | 3 +- src/Interpreters/Streaming/AggregateDataEx.h | 124 + .../Streaming/AggregationUtils.cpp | 113 + src/Interpreters/Streaming/AggregationUtils.h | 27 + src/Interpreters/Streaming/Aggregator.cpp | 2525 ++++++++--------- src/Interpreters/Streaming/Aggregator.h | 236 +- .../Streaming/AggregatingHelper.cpp | 73 +- .../Transforms/Streaming/AggregatingHelper.h | 11 +- .../Streaming/AggregatingTransform.cpp | 4 +- .../AggregatingTransformWithSubstream.cpp | 8 +- .../Streaming/GlobalAggregatingTransform.cpp | 82 +- .../Streaming/GlobalAggregatingTransform.h | 2 + ...lobalAggregatingTransformWithSubstream.cpp | 64 +- .../GlobalAggregatingTransformWithSubstream.h | 2 + 17 files changed, 1798 insertions(+), 1525 deletions(-) create mode 100644 src/Interpreters/Streaming/AggregateDataEx.h create mode 100644 src/Interpreters/Streaming/AggregationUtils.cpp create mode 100644 src/Interpreters/Streaming/AggregationUtils.h diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 29ccf0cc41c..2f61abb85dc 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,7 +2,7 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 2) +SET(VERSION_REVISION 3) SET(VERSION_MAJOR 1) SET(VERSION_MINOR 4) SET(VERSION_PATCH 1) diff --git a/src/Common/HashMapsTemplate.h b/src/Common/HashMapsTemplate.h index 8eb33d1d1d4..53df5ecd69f 100644 --- a/src/Common/HashMapsTemplate.h +++ b/src/Common/HashMapsTemplate.h @@ -4,7 +4,8 @@ #include #include #include -#include +#include +#include namespace DB { @@ -24,9 +25,14 @@ void serializeHashMap(const Map & map, MappedSerializer && mapped_serializer, Wr }); } -template +template void deserializeHashMap(Map & map, MappedDeserializer && mapped_deserializer, Arena & pool, ReadBuffer & rb) { + using Mapped = std::decay_t::mapped_type; + + constexpr bool is_string_hash_map + = std::is_same_v, StringHashMap> || std::is_same_v, TwoLevelStringHashMap>; + /// For StringHashMap or TwoLevelStringHashMap, it requires StringRef key padded 8 keys(left and right). 
/// So far, the Arena's MemoryChunk is always padding right 15, so we just pad left 8 here if constexpr (is_string_hash_map) @@ -60,6 +66,20 @@ void deserializeHashMap(Map & map, MappedDeserializer && mapped_deserializer, Ar pool.setPaddingLeft(0); } +template +void serializeTwoLevelHashMap(const Map & map, MappedSerializer && mapped_serializer, WriteBuffer & wb) +{ + serializeHashMap(map, std::move(mapped_serializer), wb); + map.writeBucketUpdatedFlags(wb); +} + +template +void deserializeTwoLevelHashMap(Map & map, MappedDeserializer && mapped_deserializer, Arena & pool, ReadBuffer & rb) +{ + deserializeHashMap(map, std::move(mapped_deserializer), pool, rb); + map.readBucketUpdatedFlags(rb); /// recover buckets updated status +} + /// HashMapsTemplate is a taken from HashJoin class and make it standalone /// and could be shared among different components @@ -187,7 +207,7 @@ struct HashMapsTemplate #define M(NAME) \ case HashType::NAME: { \ assert(NAME); \ - deserializeHashMap(*NAME, mapped_deserializer, pool, rb); \ + deserializeHashMap(*NAME, mapped_deserializer, pool, rb); \ return; \ } APPLY_FOR_HASH_KEY_VARIANTS(M) diff --git a/src/Common/serde.h b/src/Common/serde.h index ce44f491337..d6e51e17dc0 100644 --- a/src/Common/serde.h +++ b/src/Common/serde.h @@ -27,6 +27,27 @@ void ALWAYS_INLINE deserialize(S & s, RB & rb, VersionType version, Args &&... a s.deserialize(rb, version, std::forward(args)...); } +/// With owned versions +template +concept Serializable + = requires(const S & s, WB & wb, Args &&... args) { s.serialize(wb, std::forward(args)...); }; + +template +concept Deserializable + = requires(S & s, RB & rb, Args &&... args) { s.deserialize(rb, std::forward(args)...); }; + +template S> +void ALWAYS_INLINE serialize(const S & s, WB & wb, Args &&... args) +{ + s.serialize(wb, std::forward(args)...); +} + +template S> +void ALWAYS_INLINE deserialize(S & s, RB & rb, Args &&... args) +{ + s.deserialize(rb, std::forward(args)...); +} + /// macro tag to indicate the data members or struct or class will /// be serialized / deserialized via network or file system IO. 
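As a quick illustration of the unversioned overloads introduced above (a sketch; `Point` is a made-up type and the snippet assumes it sits inside namespace DB with IO/WriteHelpers.h and IO/ReadHelpers.h included): any type exposing member serialize(WriteBuffer &) / deserialize(ReadBuffer &) satisfies the new Serializable / Deserializable concepts and can be passed straight to the free helpers, while state that still needs explicit versioning keeps using the pre-existing versioned overloads.

struct Point
{
    Int32 x = 0;
    Int32 y = 0;

    void serialize(WriteBuffer & wb) const
    {
        writeIntBinary(x, wb);
        writeIntBinary(y, wb);
    }

    void deserialize(ReadBuffer & rb)
    {
        readIntBinary(x, rb);
        readIntBinary(y, rb);
    }
};

/// serialize(point, wb) and deserialize(point, rb) now resolve to the
/// concept-constrained overloads, no VersionType argument required.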
/// Hence, data structure versioning / backward / forward compatibility diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index ae40014e4ba..66d99a2c0fb 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -3261,7 +3261,8 @@ void InterpreterSelectQuery::executeStreamingAggregation( streaming_group_by, delta_col_pos, window_keys_num, - query_info.streaming_window_params); + query_info.streaming_window_params, + data_stream_semantic_pair.isChangelogOutput()); auto merge_threads = max_streams; auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads diff --git a/src/Interpreters/Streaming/AggregateDataEx.h b/src/Interpreters/Streaming/AggregateDataEx.h new file mode 100644 index 00000000000..2b969018a7d --- /dev/null +++ b/src/Interpreters/Streaming/AggregateDataEx.h @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +using AggregateDataPtr = char *; +using ConstAggregateDataPtr = const char *; + +namespace Streaming +{ +SERDE struct UpdatedDataEx +{ + static ALWAYS_INLINE UpdatedDataEx & data(AggregateDataPtr __restrict place) { return *reinterpret_cast(place); } + static ALWAYS_INLINE const UpdatedDataEx & data(ConstAggregateDataPtr __restrict place) { return *reinterpret_cast(place); } + + static ALWAYS_INLINE bool isEmpty(ConstAggregateDataPtr __restrict place) { return data(place).final_count == 0; } + static ALWAYS_INLINE bool isUpdated(ConstAggregateDataPtr __restrict place) { return data(place).updated_since_last_finalization; } + static ALWAYS_INLINE void setUpdated(AggregateDataPtr __restrict place) { data(place).updated_since_last_finalization = true; } + static ALWAYS_INLINE void resetUpdated(AggregateDataPtr __restrict place) { data(place).updated_since_last_finalization = false; } + + static void addBatch(size_t row_begin, size_t row_end, AggregateDataPtr * places, const IColumn * delta_col) + { + if (delta_col == nullptr) + { + for (size_t i = row_begin; i < row_end; ++i) + if (places[i]) + data(places[i]).add(); + } + else + { + const auto & delta_flags = assert_cast(*delta_col).getData(); + for (size_t i = row_begin; i < row_end; ++i) + { + if (places[i]) + { + if (delta_flags[i] >= 0) + data(places[i]).add(); + else + data(places[i]).negate(); + } + } + } + } + + static void addBatchSinglePlace(size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn * delta_col) + { + if (!place) + return; + + auto & metadata = data(place); + if (delta_col == nullptr) + metadata.final_count += row_end - row_begin; + else + { + const auto & delta_flags = assert_cast(*delta_col).getData(); + metadata.final_count = std::accumulate(delta_flags.begin(), delta_flags.end(), metadata.final_count); + } + + metadata.updated_since_last_finalization = true; + } + + static void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & wb) + { + const auto & data_ex = data(place); + writeIntBinary(data_ex.final_count, wb); + writeBoolText(data_ex.updated_since_last_finalization, wb); + } + + static void deserialize(AggregateDataPtr __restrict place, ReadBuffer & rb) + { + auto & data_ex = data(place); + readIntBinary(data_ex.final_count, rb); + readBoolText(data_ex.updated_since_last_finalization, rb); + } + + ALWAYS_INLINE void add() + { + ++final_count; + updated_since_last_finalization = true; + } + + ALWAYS_INLINE void negate() + { + --final_count; + updated_since_last_finalization = true; + } + + 
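    /// Note on placement (see createAggregateStates() in Aggregator.cpp): when expanded-data
    /// tracking is enabled, the Aggregator placement-constructs this struct at the very head
    /// of every per-group aggregate state, i.e. the arena allocation is laid out as
    ///     | UpdatedDataEx (or RetractedDataEx) | state_1 | pad_1 | state_2 | ...
    /// so data(place) simply reinterprets the first bytes of the allocation, while the
    /// regular aggregate function states start at offsets_of_aggregate_states[*].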
/// Used for tracking the group is empty or not + UInt32 final_count = 0; + + /// Used for tracking the group is updated or not + bool updated_since_last_finalization = true; +}; + +SERDE struct RetractedDataEx : UpdatedDataEx +{ + static ALWAYS_INLINE AggregateDataPtr & getRetracted(AggregateDataPtr & place) { return reinterpret_cast(place)->retracted_data; } + static ALWAYS_INLINE bool hasRetracted(ConstAggregateDataPtr __restrict place) { return reinterpret_cast(place)->retracted_data; } + + template + static ALWAYS_INLINE AggregateDataPtr & getData(AggregateDataPtr & place) + { + if constexpr (use_retracted_data) + return getRetracted(place); + else + return place; + } + + /// Used for tracking group changes + AggregateDataPtr retracted_data = nullptr; +}; + +enum class ExpandedDataType : uint8_t +{ + None = 0, + Updated = 1, /// Allow tracking group is empty or updated + UpdatedWithRetracted = 2, /// Allow tracking group is empty or updated and changes +}; + +} +} diff --git a/src/Interpreters/Streaming/AggregationUtils.cpp b/src/Interpreters/Streaming/AggregationUtils.cpp new file mode 100644 index 00000000000..b40851b65e6 --- /dev/null +++ b/src/Interpreters/Streaming/AggregationUtils.cpp @@ -0,0 +1,113 @@ +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + +namespace Streaming +{ +OutputBlockColumns prepareOutputBlockColumns( + const Aggregator::Params & params, + const Aggregator::AggregateFunctionsPlainPtrs & aggregate_functions, + const Block & res_header, + Arenas & aggregates_pools, + bool final, + size_t rows) +{ + MutableColumns key_columns(params.keys_size); + MutableColumns aggregate_columns(params.aggregates_size); + MutableColumns final_aggregate_columns(params.aggregates_size); + Aggregator::AggregateColumnsData aggregate_columns_data(params.aggregates_size); + + for (size_t i = 0; i < params.keys_size; ++i) + { + key_columns[i] = res_header.safeGetByPosition(i).type->createColumn(); + key_columns[i]->reserve(rows); + } + + for (size_t i = 0; i < params.aggregates_size; ++i) + { + if (!final) + { + const auto & aggregate_column_name = params.aggregates[i].column_name; + aggregate_columns[i] = res_header.getByName(aggregate_column_name).type->createColumn(); + + /// The ColumnAggregateFunction column captures the shared ownership of the arena with the aggregate function states. + ColumnAggregateFunction & column_aggregate_func = assert_cast(*aggregate_columns[i]); + + for (auto & pool : aggregates_pools) + column_aggregate_func.addArena(pool); + + aggregate_columns_data[i] = &column_aggregate_func.getData(); + aggregate_columns_data[i]->reserve(rows); + } + else + { + final_aggregate_columns[i] = aggregate_functions[i]->getReturnType()->createColumn(); + final_aggregate_columns[i]->reserve(rows); + + if (aggregate_functions[i]->isState()) + { + auto callback = [&](IColumn & subcolumn) + { + /// The ColumnAggregateFunction column captures the shared ownership of the arena with aggregate function states. + if (auto * column_aggregate_func = typeid_cast(&subcolumn)) + for (auto & pool : aggregates_pools) + column_aggregate_func->addArena(pool); + }; + + callback(*final_aggregate_columns[i]); + final_aggregate_columns[i]->forEachSubcolumnRecursively(callback); + } + } + } + + if (key_columns.size() != params.keys_size) + throw Exception{"Aggregate. 
Unexpected key columns size.", ErrorCodes::LOGICAL_ERROR}; + + std::vector raw_key_columns; + raw_key_columns.reserve(key_columns.size()); + for (auto & column : key_columns) + raw_key_columns.push_back(column.get()); + + return { + .key_columns = std::move(key_columns), + .raw_key_columns = std::move(raw_key_columns), + .aggregate_columns = std::move(aggregate_columns), + .final_aggregate_columns = std::move(final_aggregate_columns), + .aggregate_columns_data = std::move(aggregate_columns_data), + }; +} + +Block finalizeBlock(const Aggregator::Params & params, const Block & res_header, OutputBlockColumns && out_cols, bool final, size_t rows) +{ + auto && [key_columns, raw_key_columns, aggregate_columns, final_aggregate_columns, aggregate_columns_data] = out_cols; + + Block res = res_header.cloneEmpty(); + + for (size_t i = 0; i < params.keys_size; ++i) + res.getByPosition(i).column = std::move(key_columns[i]); + + for (size_t i = 0; i < params.aggregates_size; ++i) + { + const auto & aggregate_column_name = params.aggregates[i].column_name; + if (final) + res.getByName(aggregate_column_name).column = std::move(final_aggregate_columns[i]); + else + res.getByName(aggregate_column_name).column = std::move(aggregate_columns[i]); + } + + /// Change the size of the columns-constants in the block. + size_t columns = res_header.columns(); + for (size_t i = 0; i < columns; ++i) + if (isColumnConst(*res.getByPosition(i).column)) + res.getByPosition(i).column = res.getByPosition(i).column->cut(0, rows); + + return res; +} +} +} diff --git a/src/Interpreters/Streaming/AggregationUtils.h b/src/Interpreters/Streaming/AggregationUtils.h new file mode 100644 index 00000000000..6f6875e72fc --- /dev/null +++ b/src/Interpreters/Streaming/AggregationUtils.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace DB::Streaming +{ + +struct OutputBlockColumns +{ + MutableColumns key_columns; + std::vector raw_key_columns; + MutableColumns aggregate_columns; + MutableColumns final_aggregate_columns; + Aggregator::AggregateColumnsData aggregate_columns_data; +}; + + +OutputBlockColumns prepareOutputBlockColumns( + const Aggregator::Params & params, + const Aggregator::AggregateFunctionsPlainPtrs & aggregate_functions, + const Block & res_header, + Arenas & aggregates_pools, + bool final, + size_t rows); + +Block finalizeBlock(const Aggregator::Params & params, const Block & res_header, OutputBlockColumns && out_cols, bool final, size_t rows); +} diff --git a/src/Interpreters/Streaming/Aggregator.cpp b/src/Interpreters/Streaming/Aggregator.cpp index 7273c6ab81d..f1937f482d7 100644 --- a/src/Interpreters/Streaming/Aggregator.cpp +++ b/src/Interpreters/Streaming/Aggregator.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,9 @@ inline void initDataVariants( result.keys_size = params.keys_size; result.key_sizes = key_sizes; result.init(method_chosen); + + if (params.tracking_changes) + result.resetRetractedPool(); } Columns materializeKeyColumns(Columns & columns, ColumnRawPtrs & key_columns, const Aggregator::Params & params, bool is_low_cardinality) @@ -103,6 +107,73 @@ Columns materializeKeyColumns(Columns & columns, ColumnRawPtrs & key_columns, co return materialized_columns; } + +Arena * getArena(AggregatedDataVariants & variants, AggregateStateType type) +{ + if (type == AggregateStateType::OnlyRetracted) + return variants.retracted_pool.get(); + else + return variants.aggregates_pool; +} + +template +BlocksList concurrentBucketConvert(ThreadPool * thread_pool, 
const std::vector & buckets, Arena * arena, Arenas & pools, BucketConverter && bucket_converter) +{ + std::atomic next_bucket_idx_to_merge = 0; + auto converter = [&](Arena * pool, ThreadGroupStatusPtr thread_group, const std::atomic_flag * cancelled) { + SCOPE_EXIT_SAFE( + if (thread_group) + CurrentThread::detachQueryIfNotDetached(); + ); + + if (thread_group) + CurrentThread::attachToIfDetached(thread_group); + + BlocksList blocks; + while (true) + { + if (cancelled && cancelled->test()) + break; + + UInt32 bucket_idx = next_bucket_idx_to_merge.fetch_add(1); + if (bucket_idx >= buckets.size()) + break; + + auto bucket = buckets[bucket_idx]; + blocks.splice(blocks.end(), bucket_converter(bucket, pool)); + } + return blocks; + }; + + size_t num_threads = thread_pool ? std::min(thread_pool->getMaxThreads(), buckets.size()) : 1; + if (num_threads <= 1) + return converter(arena, nullptr, nullptr); + + /// Process in parallel + for (size_t i = pools.size(); i < num_threads; ++i) + pools.push_back(std::make_shared()); + + auto results = std::make_shared>(); + results->resize(num_threads); + thread_pool->setMaxThreads(num_threads); + { + std::atomic_flag cancelled; + SCOPE_EXIT_SAFE(cancelled.test_and_set();); + + for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) + thread_pool->scheduleOrThrowOnError([&pools, thread_id, group = CurrentThread::getGroup(), results, &converter, &cancelled] { + (*results)[thread_id] = converter(pools[thread_id].get(), group, &cancelled); + }); + + thread_pool->wait(); + } + + BlocksList blocks; + for (auto & result : *results) + blocks.splice(blocks.end(), std::move(result)); + + return blocks; +} } AggregatedDataVariants::~AggregatedDataVariants() @@ -120,6 +191,31 @@ AggregatedDataVariants::~AggregatedDataVariants() } } +void AggregatedDataVariants::reset() +{ + assert(aggregator); + /// Clear states + if (!aggregator->all_aggregates_has_trivial_destructor) + aggregator->destroyAllAggregateStates(*this); + + /// Clear hash map + switch (type) + { + case AggregatedDataVariants::Type::EMPTY: break; + case AggregatedDataVariants::Type::without_key: break; + + #define M(NAME, IS_TWO_LEVEL) \ + case AggregatedDataVariants::Type::NAME: NAME.reset(); break; + APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) + #undef M + } + invalidate(); + + /// Reset pool + resetAggregatesPool(); + retracted_pool.reset(); +} + void AggregatedDataVariants::convertToTwoLevel() { if (aggregator) @@ -143,6 +239,17 @@ void AggregatedDataVariants::convertToTwoLevel() } } +void AggregatedDataVariants::serialize(WriteBuffer & wb, const Aggregator & aggregator_) const +{ + aggregator_.checkpoint(*this, wb); +} + +void AggregatedDataVariants::deserialize(ReadBuffer & rb, const Aggregator & aggregator_) +{ + aggregator = &aggregator_; + aggregator_.recover(*this, rb); +} + Block Aggregator::getHeader(bool final) const { return params.getHeader(final); @@ -282,8 +389,21 @@ Aggregator::Aggregator(const Params & params_) : params(params_), log(&Poco::Lo total_size_of_aggregate_states = 0; all_aggregates_has_trivial_destructor = true; + if (params.tracking_changes) + { + total_size_of_aggregate_states = sizeof(RetractedDataEx); + align_aggregate_states = alignof(RetractedDataEx); + expanded_data_type = ExpandedDataType::UpdatedWithRetracted; + } + else if (params.tracking_updated) + { + total_size_of_aggregate_states = sizeof(UpdatedDataEx); + align_aggregate_states = alignof(UpdatedDataEx); + expanded_data_type = ExpandedDataType::Updated; + } + // aggregate_states will be aligned as below: 
- // |<-- state_1 -->|<-- pad_1 -->|<-- state_2 -->|<-- pad_2 -->| ..... + // |<-- [ExpandedDataEx] -->||<-- state_1 -->|<-- pad_1 -->|<-- state_2 -->|<-- pad_2 -->| ..... // // pad_N will be used to match alignment requirement for each next state. // The address of state_1 is aligned based on maximum alignment requirements in states @@ -650,12 +770,43 @@ AggregatedDataVariants::Type Aggregator::chooseAggregationMethodTimeBucketTwoLev } /// proton: ends -template +template void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const { + /// Initialize reserved UpdatedDataEx + assert(aggregate_data); + if constexpr (!skip_expanded_data) + { + if (expanded_data_type == ExpandedDataType::UpdatedWithRetracted) + new (aggregate_data) RetractedDataEx(); + else if (expanded_data_type == ExpandedDataType::Updated) + new (aggregate_data) UpdatedDataEx(); + } + + if constexpr (use_compiled_functions) + { + assert(compiled_aggregate_functions_holder); + const auto & compiled_aggregate_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; + compiled_aggregate_functions.create_aggregate_states_function(aggregate_data); + +#if defined(MEMORY_SANITIZER) + + /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place. + for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index) + { + if (!is_aggregate_function_compiled[aggregate_function_index]) + continue; + + auto aggregate_data_with_offset = aggregate_data + offsets_of_aggregate_states[aggregate_function_index]; + auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData(); + __msan_unpoison(aggregate_data_with_offset, data_size); + } +#endif + } + for (size_t j = 0; j < params.aggregates_size; ++j) { - if constexpr (skip_compiled_aggregate_functions) + if constexpr (use_compiled_functions) if (is_aggregate_function_compiled[j]) continue; @@ -671,7 +822,7 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const { for (size_t rollback_j = 0; rollback_j < j; ++rollback_j) { - if constexpr (skip_compiled_aggregate_functions) + if constexpr (use_compiled_functions) if (is_aggregate_function_compiled[j]) continue; @@ -719,23 +870,18 @@ template AggregateDataPtr overflow_row) const { typename Method::State state(key_columns, key_sizes, aggregation_state_cache); + assert(!no_more_keys); - if (!no_more_keys) - { #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - return executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row); - } - else -#endif - { - return executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row); - } + /// TODO: So far not support compiled functions with expanded data + if (compiled_aggregate_functions_holder && !hasExpandedData()) + { + return executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row); } else +#endif { - return executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row); + return executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row); } } @@ -750,7 +896,7 @@ template AggregateDataPtr overflow_row) const { /// Optimization for special case when there are no aggregate functions. 
- if (params.aggregates_size == 0) + if (params.aggregates_size == 0 && !hasExpandedData()) { if constexpr (no_more_keys) return false; @@ -778,7 +924,7 @@ template } } - if (!has_arrays) + if (!has_arrays && !hasExpandedData()) { for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) { @@ -789,8 +935,9 @@ template inst->state_offset, [&](AggregateDataPtr & aggregate_data) { - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); + auto data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(data); + aggregate_data = data; }, state.getKeyData(), inst->batch_arguments, @@ -821,66 +968,24 @@ template { AggregateDataPtr aggregate_data = nullptr; - if constexpr (!no_more_keys) - { - auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); - - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (emplace_result.isInserted()) - { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. - emplace_result.setMapped(nullptr); - - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - -#if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - { - const auto & compiled_aggregate_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; - compiled_aggregate_functions.create_aggregate_states_function(aggregate_data); - if (compiled_aggregate_functions.functions_count != aggregate_functions.size()) - { - static constexpr bool skip_compiled_aggregate_functions = true; - createAggregateStates(aggregate_data); - } - -#if defined(MEMORY_SANITIZER) - - /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place. - for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index) - { - if (!is_aggregate_function_compiled[aggregate_function_index]) - continue; + assert(!no_more_keys); + auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); - auto aggregate_data_with_offset = aggregate_data + offsets_of_aggregate_states[aggregate_function_index]; - auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData(); - __msan_unpoison(aggregate_data_with_offset, data_size); - } -#endif - } - else -#endif - { - createAggregateStates(aggregate_data); - } + /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. + if (emplace_result.isInserted()) + { + /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. + emplace_result.setMapped(nullptr); - emplace_result.setMapped(aggregate_data); - } - else - aggregate_data = emplace_result.getMapped(); + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); - assert(aggregate_data != nullptr); + emplace_result.setMapped(aggregate_data); } else - { - /// Add only if the key already exists. 
- auto find_result = state.findKey(method.data, i, *aggregates_pool); - if (find_result.isFound()) - aggregate_data = find_result.getMapped(); - else - aggregate_data = overflow_row; - } + aggregate_data = emplace_result.getMapped(); + assert(aggregate_data != nullptr); places[i] = aggregate_data; } @@ -938,6 +1043,9 @@ template } } + if (hasExpandedData()) + UpdatedDataEx::addBatch(row_begin, row_end, places.get(), aggregate_instructions ? aggregate_instructions->delta_column : nullptr); + return need_finalization; } @@ -1026,28 +1134,12 @@ template } } - return should_finalize; -} + if (hasExpandedData()) + UpdatedDataEx::addBatchSinglePlace(row_begin, row_end, res, aggregate_instructions ? aggregate_instructions->delta_column : nullptr); -void NO_INLINE Aggregator::executeOnIntervalWithoutKeyImpl( - AggregatedDataWithoutKey & res, - size_t row_begin, - size_t row_end, - AggregateFunctionInstruction * aggregate_instructions, - Arena * arena, - const IColumn * delta_col) -{ - /// Adding values - for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) - { - if (inst->offsets) - inst->batch_that->addBatchSinglePlaceFromInterval(inst->offsets[row_begin], inst->offsets[row_end - 1], res + inst->state_offset, inst->batch_arguments, arena, -1, delta_col); - else - inst->batch_that->addBatchSinglePlaceFromInterval(row_begin, row_end, res + inst->state_offset, inst->batch_arguments, arena, -1, delta_col); - } + return should_finalize; } - void Aggregator::prepareAggregateInstructions(Columns columns, AggregateColumns & aggregate_columns, Columns & materialized_columns, AggregateFunctionInstructions & aggregate_functions_instructions, NestedColumnsHolder & nested_columns_holder) const { @@ -1170,14 +1262,14 @@ std::pair Aggregator::executeOnBlock( /// For the case when there are no keys (all aggregate into one row). 
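To make the delta handling above concrete, a small self-contained sketch of the bookkeeping done by UpdatedDataEx under changelog (+1 append / -1 retract) semantics; the function name and the placement-new on a local buffer are illustrative, the real header is constructed by createAggregateStates() inside the arena.

#include <Interpreters/Streaming/AggregateDataEx.h>
#include <cassert>
#include <new>

void changelogCountSketch()
{
    /// One group's expanded header, constructed the same way createAggregateStates() does it.
    alignas(DB::Streaming::UpdatedDataEx) char place[sizeof(DB::Streaming::UpdatedDataEx)];
    new (place) DB::Streaming::UpdatedDataEx();

    auto & group = DB::Streaming::UpdatedDataEx::data(place);
    group.add();     /// +1: a row joined the group
    group.add();     /// +1
    group.negate();  /// -1: one row was retracted
    group.negate();  /// -1: the last row was retracted

    assert(DB::Streaming::UpdatedDataEx::isEmpty(place));   /// net count is back to 0
    assert(DB::Streaming::UpdatedDataEx::isUpdated(place)); /// but the group did change
    DB::Streaming::UpdatedDataEx::resetUpdated(place);      /// cleared once the group is finalized
}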
if (result.type == AggregatedDataVariants::Type::without_key) { - /// TODO: Enable compilation after investigation -// #if USE_EMBEDDED_COMPILER -// if (compiled_aggregate_functions_holder) -// { -// executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); -// } -// else -// #endif + /// TODO: So far not support compiled functions with expanded data +#if USE_EMBEDDED_COMPILER + if (compiled_aggregate_functions_holder && !hasExpandedData()) + { + need_finalization = executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); + } + else +#endif { need_finalization = executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); } @@ -1274,283 +1366,70 @@ Block Aggregator::convertOneBucketToBlockImpl( Arena * arena, bool final, bool clear_states, - size_t bucket) const + Int64 bucket, + AggregateStateType type) const { - Block block = prepareBlockAndFill(data_variants, final, clear_states, method.data.impls[bucket].size(), - [bucket, &method, arena, this] ( - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - bool final_, - bool clear_states_) - { - convertToBlockImpl(method, method.data.impls[bucket], - key_columns, aggregate_columns, final_aggregate_columns, arena, final_, clear_states_); - }); - - block.info.bucket_num = static_cast(bucket); + Block block = convertToBlockImpl(method, method.data.impls[bucket], arena, data_variants.aggregates_pools, final, method.data.impls[bucket].size(), clear_states, type); + block.info.bucket_num = static_cast(bucket); + method.data.resetUpdated(bucket); /// finalized return block; } -Block Aggregator::convertOneBucketToBlock(AggregatedDataVariants & variants, bool final, ConvertAction action, size_t bucket) const +template +void Aggregator::writeToTemporaryFileImpl( + AggregatedDataVariants & data_variants, + Method & method, + NativeWriter & out) const { - auto method = variants.type; - Block block; - bool clear_states = shouldClearStates(action, final); - if (false) {} // NOLINT -#define M(NAME) \ - else if (method == AggregatedDataVariants::Type::NAME) \ - block = convertOneBucketToBlockImpl(variants, *variants.NAME, variants.aggregates_pool, final, clear_states, bucket); - - APPLY_FOR_VARIANTS_TIME_BUCKET_TWO_LEVEL(M) -#undef M - else - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); + size_t max_temporary_block_size_rows = 0; + size_t max_temporary_block_size_bytes = 0; - return block; -} + auto update_max_sizes = [&](const Block & block) + { + size_t block_size_rows = block.rows(); + size_t block_size_bytes = block.bytes(); -Block Aggregator::mergeAndConvertOneBucketToBlock( - ManyAggregatedDataVariants & variants, bool final, ConvertAction action, size_t bucket) const -{ - auto prepared_data_ptr = prepareVariantsToMerge(variants); - if (prepared_data_ptr->empty()) - return {}; + if (block_size_rows > max_temporary_block_size_rows) + max_temporary_block_size_rows = block_size_rows; + if (block_size_bytes > max_temporary_block_size_bytes) + max_temporary_block_size_bytes = block_size_bytes; + }; - auto & merged_data = *prepared_data_ptr->at(0); - auto method = merged_data.type; - Arena * arena = merged_data.aggregates_pool; - bool clear_states = shouldClearStates(action, final); - Block block; + for (auto bucket : 
method.data.buckets()) + { + Block block = convertOneBucketToBlockImpl(data_variants, method, data_variants.aggregates_pool, false, false, bucket); + out.write(block); + update_max_sizes(block); + } - if (false) {} // NOLINT -#define M(NAME) \ - else if (method == AggregatedDataVariants::Type::NAME) \ - { \ - mergeBucketImpl(*prepared_data_ptr, final, clear_states, bucket, arena); \ - block = convertOneBucketToBlockImpl(merged_data, *merged_data.NAME, arena, final, clear_states, bucket); \ + if (params.overflow_row) + { + Block block = prepareBlockAndFillWithoutKey(data_variants, false, true, false); + out.write(block); + update_max_sizes(block); } - APPLY_FOR_VARIANTS_ALL_TWO_LEVEL(M) -#undef M + /// Pass ownership of the aggregate functions states: + /// `data_variants` will not destroy them in the destructor, they are now owned by ColumnAggregateFunction objects. + data_variants.aggregator = nullptr; - return block; + LOG_DEBUG(log, "Max size of temporary block: {} rows, {}.", max_temporary_block_size_rows, ReadableSize(max_temporary_block_size_bytes)); } -BlocksList -Aggregator::mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, ConvertAction action, size_t max_threads) const -{ - auto prepared_data_ptr = prepareVariantsToMerge(data_variants); - if (prepared_data_ptr->empty()) - return {}; - if (unlikely(params.overflow_row)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); +bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const +{ + if (!no_more_keys && params.max_rows_to_group_by && result_size > params.max_rows_to_group_by) + { + switch (params.group_by_overflow_mode) + { + case OverflowMode::THROW: + throw Exception("Limit for rows to GROUP BY exceeded: has " + toString(result_size) + + " rows, maximum: " + toString(params.max_rows_to_group_by), + ErrorCodes::TOO_MANY_ROWS); - bool clear_states = shouldClearStates(action, final); - BlocksList blocks; - auto & first = *prepared_data_ptr->at(0); - if (first.type == AggregatedDataVariants::Type::without_key) - blocks.emplace_back(mergeAndConvertWithoutKeyToBlock(*prepared_data_ptr, final, clear_states)); - else if (first.isTwoLevel()) - blocks.splice(blocks.end(), mergeAndConvertTwoLevelToBlocks(*prepared_data_ptr, final, max_threads, clear_states)); - else - blocks.emplace_back(mergeAndConvertSingleLevelToBlock(*prepared_data_ptr, final, clear_states)); - - if (clear_states) - clearDataVariants(first); - - return blocks; -} - -Block Aggregator::mergeAndConvertWithoutKeyToBlock(ManyAggregatedDataVariants & non_empty_data, bool final, bool clear_states) const -{ - auto & first = *non_empty_data.at(0); - assert(first.type == AggregatedDataVariants::Type::without_key); - mergeWithoutKeyDataImpl(non_empty_data, clear_states); - return prepareBlockAndFillWithoutKey(first, final, false, clear_states); -} - -Block Aggregator::mergeAndConvertSingleLevelToBlock(ManyAggregatedDataVariants & non_empty_data, bool final, bool clear_states) const -{ - auto & first = *non_empty_data.at(0); - if (false) - { - } // NOLINT -#define M(NAME) \ - else if (first.type == AggregatedDataVariants::Type::NAME) \ - mergeSingleLevelDataImpl(non_empty_data, clear_states); - - APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M) -#undef M - else throw Exception("Unknown single level aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - - return prepareBlockAndFillSingleLevel(first, final, clear_states); -} - -BlocksList 
Aggregator::mergeAndConvertTwoLevelToBlocks( - ManyAggregatedDataVariants & non_empty_data, bool final, size_t max_threads, bool clear_states) const -{ - auto & first = *non_empty_data.at(0); - assert(first.isTwoLevel()); -#define M(NAME) \ - else if (first.type == AggregatedDataVariants::Type::NAME) return mergeAndConvertTwoLevelToBlocksImpl< \ - decltype(first.NAME)::element_type>(non_empty_data, final, max_threads, clear_states); - - if (false) - { - } // NOLINT - APPLY_FOR_VARIANTS_ALL_TWO_LEVEL(M) -#undef M - else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); -} - -template -BlocksList Aggregator::mergeAndConvertTwoLevelToBlocksImpl( - ManyAggregatedDataVariants & non_empty_data, bool final, size_t max_threads, bool clear_states) const -{ - auto & first = *non_empty_data.at(0); - - std::vector buckets; - if (first.isStaticBucketTwoLevel()) - buckets = getDataVariant(first).data.buckets(); - else - { - assert(first.isTimeBucketTwoLevel()); - std::set buckets_set; - for (auto & data_variants : non_empty_data) - { - auto tmp_buckets = getDataVariant(*data_variants).data.buckets(); - buckets_set.insert(tmp_buckets.begin(), tmp_buckets.end()); - } - buckets.assign(buckets_set.begin(), buckets_set.end()); - } - - std::atomic next_bucket_idx_to_merge = 0; - auto converter = [&](size_t thread_id, ThreadGroupStatusPtr thread_group, const std::atomic_flag * cancelled) { - SCOPE_EXIT_SAFE(if (thread_group) CurrentThread::detachQueryIfNotDetached();); - if (thread_group) - CurrentThread::attachToIfDetached(thread_group); - - BlocksList blocks; - while (true) - { - if (cancelled && cancelled->test()) - break; - - UInt32 bucket_idx = next_bucket_idx_to_merge.fetch_add(1); - if (bucket_idx >= buckets.size()) - break; - - auto bucket = buckets[bucket_idx]; - - /// Merge one bucket into first one - Arena * arena = first.aggregates_pools.at(thread_id).get(); - mergeBucketImpl(non_empty_data, final, clear_states, bucket, arena); - auto & method = getDataVariant(first); - if (method.data.impls[bucket].empty()) - continue; - - /// Convert one bucket of first one - blocks.emplace_back(convertOneBucketToBlockImpl(first, method, arena, final, clear_states, bucket)); - } - - if (clear_states) - { - for (size_t i = 1; i < non_empty_data.size(); ++i) - clearDataVariants(*non_empty_data[i]); - } - - return blocks; - }; - - auto num_threads = std::min(max_threads, buckets.size()); - if (num_threads <= 1) - return converter(0, nullptr, nullptr); - - /// Process in parallel - /// proton FIXME : separate final vs non-final converting. For non-final converting, we don't need - /// each arena for each thread. 
- for (size_t i = first.aggregates_pools.size(); i < num_threads; ++i) - first.aggregates_pools.push_back(std::make_shared()); - - auto results = std::make_shared>(); - results->resize(num_threads); - ThreadPool thread_pool(num_threads); - { - std::atomic_flag cancelled; - SCOPE_EXIT_SAFE(cancelled.test_and_set();); - - for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) - thread_pool.scheduleOrThrowOnError([thread_id, group = CurrentThread::getGroup(), results, &converter, &cancelled] { - (*results)[thread_id] = converter(thread_id, group, &cancelled); - }); - - thread_pool.wait(); - } - - BlocksList blocks; - for (auto & result : *results) - blocks.splice(blocks.end(), std::move(result)); - - return blocks; -} - -template -void Aggregator::writeToTemporaryFileImpl( - AggregatedDataVariants & data_variants, - Method & method, - NativeWriter & out) const -{ - size_t max_temporary_block_size_rows = 0; - size_t max_temporary_block_size_bytes = 0; - - auto update_max_sizes = [&](const Block & block) - { - size_t block_size_rows = block.rows(); - size_t block_size_bytes = block.bytes(); - - if (block_size_rows > max_temporary_block_size_rows) - max_temporary_block_size_rows = block_size_rows; - if (block_size_bytes > max_temporary_block_size_bytes) - max_temporary_block_size_bytes = block_size_bytes; - }; - - for (auto bucket : method.data.buckets()) - { - Block block = convertOneBucketToBlockImpl(data_variants, method, data_variants.aggregates_pool, false, false, bucket); - out.write(block); - update_max_sizes(block); - } - - if (params.overflow_row) - { - Block block = prepareBlockAndFillWithoutKey(data_variants, false, true, false); - out.write(block); - update_max_sizes(block); - } - - /// Pass ownership of the aggregate functions states: - /// `data_variants` will not destroy them in the destructor, they are now owned by ColumnAggregateFunction objects. - data_variants.aggregator = nullptr; - - LOG_DEBUG(log, "Max size of temporary block: {} rows, {}.", max_temporary_block_size_rows, ReadableSize(max_temporary_block_size_bytes)); -} - - -bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const -{ - if (!no_more_keys && params.max_rows_to_group_by && result_size > params.max_rows_to_group_by) - { - switch (params.group_by_overflow_mode) - { - case OverflowMode::THROW: - throw Exception("Limit for rows to GROUP BY exceeded: has " + toString(result_size) - + " rows, maximum: " + toString(params.max_rows_to_group_by), - ErrorCodes::TOO_MANY_ROWS); - - case OverflowMode::BREAK: - return false; + case OverflowMode::BREAK: + return false; case OverflowMode::ANY: no_more_keys = true; @@ -1566,51 +1445,51 @@ bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const template -void Aggregator::convertToBlockImpl( - Method & method, - Table & data, - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - Arena * arena, - bool final, - bool clear_states) const +Block Aggregator::convertToBlockImpl( + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows, bool clear_states, AggregateStateType type) const { if (data.empty()) - return; - - if (key_columns.size() != params.keys_size) - throw Exception{"Aggregate. 
Unexpected key columns size.", ErrorCodes::LOGICAL_ERROR}; + { + auto && out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, rows); + return {finalizeBlock(params, getHeader(final), std::move(out_cols), final, rows)}; + } - std::vector raw_key_columns; - raw_key_columns.reserve(key_columns.size()); - for (auto & column : key_columns) - raw_key_columns.push_back(column.get()); + Block res; if (final) { #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) + /// TODO: So far not support compiled functions with expanded data + if (compiled_aggregate_functions_holder && !hasExpandedData()) { static constexpr bool use_compiled_functions = !Method::low_cardinality_optimization; - convertToBlockImplFinal(method, data, std::move(raw_key_columns), final_aggregate_columns, arena, clear_states); + assert(type == AggregateStateType::Normal); + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); } else #endif { - convertToBlockImplFinal(method, data, std::move(raw_key_columns), final_aggregate_columns, arena, clear_states); + if (type == AggregateStateType::OnlyUpdated) + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); + else if (type == AggregateStateType::OnlyRetracted) + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); + else + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); } } else { - convertToBlockImplNotFinal(method, data, std::move(raw_key_columns), aggregate_columns); + assert(type == AggregateStateType::Normal); + res = convertToBlockImplNotFinal(method, data, aggregates_pools, rows); } /// In order to release memory early. /// proton: starts. For streaming aggr, we hold on to the states - if (clear_states) + if (clear_states && type == AggregateStateType::Normal) data.clearAndShrink(); /// proton: ends + + return res; } @@ -1618,7 +1497,8 @@ template inline void Aggregator::insertAggregatesIntoColumns( Mapped & mapped, MutableColumns & final_aggregate_columns, - Arena * arena) const + Arena * arena, + bool clear_states) const { /** Final values of aggregate functions are inserted to columns. * Then states of aggregate functions, that are not longer needed, are destroyed. @@ -1657,7 +1537,7 @@ inline void Aggregator::insertAggregatesIntoColumns( /// proton: starts /// For streaming aggregation, we hold up to the states - if (params.keep_state) + if (!clear_states) { if (exception) std::rethrow_exception(exception); @@ -1690,76 +1570,14 @@ inline void Aggregator::insertAggregatesIntoColumns( std::rethrow_exception(exception); } - -template -void NO_INLINE Aggregator::convertToBlockImplFinal( - Method & method, - Table & data, - std::vector key_columns, - MutableColumns & final_aggregate_columns, - Arena * arena, - bool clear_states) const +template +Block Aggregator::insertResultsIntoColumns(PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena, bool clear_states) const { - if constexpr (Method::low_cardinality_optimization) - { - if (data.hasNullKeyData()) - { - key_columns[0]->insertDefault(); - insertAggregatesIntoColumns(data.getNullKeyData(), final_aggregate_columns, arena); - } - } - - auto shuffled_key_sizes = method.shuffleKeyColumns(key_columns, key_sizes); - const auto & key_sizes_ref = shuffled_key_sizes ? 
*shuffled_key_sizes : key_sizes; - - PaddedPODArray places; - places.reserve(data.size()); - - data.forEachValue([&](const auto & key, auto & mapped) - { - /// Ingore invalid mapped, there are two cases: - /// 1) mapped was destroyed (it's a bug) - /// 2) no mapped states for retracted data (means it's an new group key, but no retracted data) - if (!mapped) - return; - - /// For UDA with own emit strategy, there are two special cases to be handled: - /// 1. not all groups need to be emitted. therefore proton needs to pick groups - /// that should emits, and only emit those groups while keep other groups unchanged. - /// 2. a single block trigger multiple emits. In this case, proton need insert the - /// same key multiple times for each emit result of this group. - - /// for non-UDA or UDA without emit strategy, 'should_emit' is always true. - /// For UDA with emit strategy, it is true only if the group should emit. - size_t emit_times = 1; - if (params.group_by == Params::GroupBy::USER_DEFINED) - { - assert(aggregate_functions.size() == 1); - emit_times = aggregate_functions[0]->getEmitTimes(mapped + offsets_of_aggregate_states[0]); - } - - if (emit_times > 0) - { - /// duplicate key for each emit - for (size_t i = 0; i < emit_times; i++) - method.insertKeyIntoColumns(key, key_columns, key_sizes_ref); - - places.emplace_back(mapped); - - /// Mark the cell as destroyed so it will not be destroyed in destructor. - /// proton: starts. Here we push the `mapped` to `places`, for streaming - /// case, we don't want aggregate function to destroy the places - if (clear_states) - mapped = nullptr; - } - }); - std::exception_ptr exception; size_t aggregate_functions_destroy_index = 0; try { -#if USE_EMBEDDED_COMPILER if constexpr (use_compiled_functions) { /** For JIT compiled functions we need to resize columns before pass them into compiled code. 
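/// A minimal illustrative sketch, not part of this change set, of the per-group filtering that the
/// AggregateStateType values imply in the conversion dispatch above (it assumes only the
/// UpdatedDataEx / RetractedDataEx helpers named in this diff; the lambda itself is hypothetical):
///
///     auto group_selected = [](AggregateStateType type, AggregateDataPtr mapped) -> bool
///     {
///         switch (type)
///         {
///             case AggregateStateType::OnlyUpdated:   return UpdatedDataEx::isUpdated(mapped);
///             case AggregateStateType::OnlyRetracted: return RetractedDataEx::hasRetracted(mapped);
///             default:                                return mapped != nullptr; /// Normal: every live group
///         }
///     };
///
/// The OnlyUpdated conversion additionally resets the per-group updated flag once a group has been
/// emitted, so the next incremental conversion only sees groups touched after this one.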
@@ -1774,7 +1592,7 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( if (!is_aggregate_function_compiled[i]) continue; - auto & final_aggregate_column = final_aggregate_columns[i]; + auto & final_aggregate_column = out_cols.final_aggregate_columns[i]; final_aggregate_column = final_aggregate_column->cloneResized(places.size()); columns_data.emplace_back(getColumnData(final_aggregate_column.get())); } @@ -1782,7 +1600,6 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( auto insert_aggregates_into_columns_function = compiled_functions.insert_aggregates_into_columns_function; insert_aggregates_into_columns_function(0, places.size(), columns_data.data(), places.data()); } -#endif for (; aggregate_functions_destroy_index < params.aggregates_size;) { @@ -1795,7 +1612,7 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( } } - auto & final_aggregate_column = final_aggregate_columns[aggregate_functions_destroy_index]; + auto & final_aggregate_column = out_cols.final_aggregate_columns[aggregate_functions_destroy_index]; size_t offset = offsets_of_aggregate_states[aggregate_functions_destroy_index]; /** We increase aggregate_functions_destroy_index because by function contract if insertResultIntoBatch @@ -1877,137 +1694,126 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( if (exception) std::rethrow_exception(exception); + + return finalizeBlock(params, getHeader(/* final */ true), std::move(out_cols), /* final */ true, places.size()); } -template -void NO_INLINE Aggregator::convertToBlockImplNotFinal( - Method & method, - Table & data, - std::vector key_columns, - AggregateColumnsData & aggregate_columns) const +template +Block NO_INLINE Aggregator::convertToBlockImplFinal( + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows, bool clear_states) const { + constexpr bool final = true; + auto out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, rows); + if constexpr (Method::low_cardinality_optimization) { if (data.hasNullKeyData()) { - key_columns[0]->insertDefault(); - - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_columns[i]->push_back(data.getNullKeyData() + offsets_of_aggregate_states[i]); - - data.getNullKeyData() = nullptr; + assert(type == AggregateStateType::Normal); + out_cols.key_columns[0]->insertDefault(); + insertAggregatesIntoColumns(data.getNullKeyData(), out_cols.final_aggregate_columns, arena, clear_states); } } - auto shuffled_key_sizes = method.shuffleKeyColumns(key_columns, key_sizes); - const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes; + auto shuffled_key_sizes = method.shuffleKeyColumns(out_cols.raw_key_columns, key_sizes); + const auto & key_sizes_ref = shuffled_key_sizes ? 
*shuffled_key_sizes : key_sizes; + + PaddedPODArray places; + places.reserve(rows); + + constexpr bool only_updated = (type == AggregateStateType::OnlyUpdated); + constexpr bool only_retracted = (type == AggregateStateType::OnlyRetracted); data.forEachValue([&](const auto & key, auto & mapped) { - method.insertKeyIntoColumns(key, key_columns, key_sizes_ref); + if constexpr (only_updated) + { + if (!UpdatedDataEx::isUpdated(mapped)) + return; - /// reserved, so push_back does not throw exceptions - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_columns[i]->push_back(mapped + offsets_of_aggregate_states[i]); + /// Finalized it for current coverting + UpdatedDataEx::resetUpdated(mapped); + } + else if constexpr (only_retracted) + { + if (!RetractedDataEx::hasRetracted(mapped)) + return; + } - /// proton: starts. For streaming aggr, we hold on to the states - /// Since it is not final, we shall never clear the state - /// if (!params.keep_state) - /// mapped = nullptr; - /// proton: ends. - }); -} + auto & place = RetractedDataEx::getData(mapped); + /// For UDA with own emit strategy, there are two special cases to be handled: + /// 1. not all groups need to be emitted. therefore proton needs to pick groups + /// that should emits, and only emit those groups while keep other groups unchanged. + /// 2. a single block trigger multiple emits. In this case, proton need insert the + /// same key multiple times for each emit result of this group. -template -Block Aggregator::prepareBlockAndFill( - AggregatedDataVariants & data_variants, - bool final, - bool clear_states, - size_t rows, - Filler && filler) const -{ - MutableColumns key_columns(params.keys_size); - MutableColumns aggregate_columns(params.aggregates_size); - MutableColumns final_aggregate_columns(params.aggregates_size); - AggregateColumnsData aggregate_columns_data(params.aggregates_size); - - Block header = getHeader(final); - - for (size_t i = 0; i < params.keys_size; ++i) - { - key_columns[i] = header.safeGetByPosition(i).type->createColumn(); - key_columns[i]->reserve(rows); - } + /// for non-UDA or UDA without emit strategy, 'should_emit' is always true. + /// For UDA with emit strategy, it is true only if the group should emit. + size_t emit_times = 1; + if (params.group_by == Params::GroupBy::USER_DEFINED) + { + assert(aggregate_functions.size() == 1); + emit_times = aggregate_functions[0]->getEmitTimes(place + offsets_of_aggregate_states[0]); + } - for (size_t i = 0; i < params.aggregates_size; ++i) - { - if (!final) + if (emit_times > 0) { - const auto & aggregate_column_name = params.aggregates[i].column_name; - aggregate_columns[i] = header.getByName(aggregate_column_name).type->createColumn(); + /// duplicate key for each emit + for (size_t i = 0; i < emit_times; i++) + method.insertKeyIntoColumns(key, out_cols.raw_key_columns, key_sizes_ref); - /// The ColumnAggregateFunction column captures the shared ownership of the arena with the aggregate function states. - ColumnAggregateFunction & column_aggregate_func = assert_cast(*aggregate_columns[i]); + places.emplace_back(place); - /// proton: starts - column_aggregate_func.setKeepState(params.keep_state); - /// proton: ends + /// Mark the cell as destroyed so it will not be destroyed in destructor. + /// proton: starts. 
Here we push the `place` to `places`, for streaming + /// case, we don't want aggregate function to destroy the places + if (clear_states) + place = nullptr; + } + }); - /// Add arenas to ColumnAggregateFunction, which can result in moving ownership to it if reference count - /// get dropped in other places - for (auto & pool : data_variants.aggregates_pools) - column_aggregate_func.addArena(pool); + return insertResultsIntoColumns(places, std::move(out_cols), arena, clear_states); +} - aggregate_columns_data[i] = &column_aggregate_func.getData(); - aggregate_columns_data[i]->reserve(rows); - } - else +template +Block NO_INLINE Aggregator::convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t rows) const +{ + constexpr bool final = false; + auto out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, rows); + + if constexpr (Method::low_cardinality_optimization) + { + if (data.hasNullKeyData()) { - final_aggregate_columns[i] = aggregate_functions[i]->getReturnType()->createColumn(); - final_aggregate_columns[i]->reserve(rows); + out_cols.raw_key_columns[0]->insertDefault(); - if (aggregate_functions[i]->isState()) - { - /// The ColumnAggregateFunction column captures the shared ownership of the arena with aggregate function states. - if (auto * column_aggregate_func = typeid_cast(final_aggregate_columns[i].get())) - for (auto & pool : data_variants.aggregates_pools) - column_aggregate_func->addArena(pool); + for (size_t i = 0; i < params.aggregates_size; ++i) + out_cols.aggregate_columns_data[i]->push_back(data.getNullKeyData() + offsets_of_aggregate_states[i]); - /// Aggregate state can be wrapped into array if aggregate function ends with -Resample combinator. - final_aggregate_columns[i]->forEachSubcolumn([&data_variants](IColumn::WrappedPtr & subcolumn) - { - if (auto * column_aggregate_func = typeid_cast(subcolumn.get())) - for (auto & pool : data_variants.aggregates_pools) - column_aggregate_func->addArena(pool); - }); - } + data.getNullKeyData() = nullptr; } } - filler(key_columns, aggregate_columns_data, final_aggregate_columns, final, clear_states); - - Block res = header.cloneEmpty(); - - for (size_t i = 0; i < params.keys_size; ++i) - res.getByPosition(i).column = std::move(key_columns[i]); + auto shuffled_key_sizes = method.shuffleKeyColumns(out_cols.raw_key_columns, key_sizes); + const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes; - for (size_t i = 0; i < params.aggregates_size; ++i) + data.forEachValue([&](const auto & key, auto & mapped) { - const auto & aggregate_column_name = params.aggregates[i].column_name; - if (final) - res.getByName(aggregate_column_name).column = std::move(final_aggregate_columns[i]); - else - res.getByName(aggregate_column_name).column = std::move(aggregate_columns[i]); - } + method.insertKeyIntoColumns(key, out_cols.raw_key_columns, key_sizes_ref); - /// Change the size of the columns-constants in the block. - size_t columns = header.columns(); - for (size_t i = 0; i < columns; ++i) - if (isColumnConst(*res.getByPosition(i).column)) - res.getByPosition(i).column = res.getByPosition(i).column->cut(0, rows); + /// reserved, so push_back does not throw exceptions + for (size_t i = 0; i < params.aggregates_size; ++i) + out_cols.aggregate_columns_data[i]->push_back(mapped + offsets_of_aggregate_states[i]); - return res; + /// proton: starts. 
For streaming aggr, we hold on to the states + /// Since it is not final, we shall never clear the state + /// if (!params.keep_state) + /// mapped = nullptr; + /// proton: ends. + }); + + return finalizeBlock(params, getHeader(final), std::move(out_cols), final, rows); } void Aggregator::addSingleKeyToAggregateColumns( @@ -2034,71 +1840,50 @@ void Aggregator::addArenasToAggregateColumns( } } -void Aggregator::createStatesAndFillKeyColumnsWithSingleKey( - AggregatedDataVariants & data_variants, - Columns & key_columns, - size_t key_row, - MutableColumns & final_key_columns) const -{ - AggregateDataPtr place = data_variants.aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(place); - data_variants.without_key = place; - - for (size_t i = 0; i < params.keys_size; ++i) - { - final_key_columns[i]->insertFrom(*key_columns[i].get(), key_row); - } -} - -Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows, bool clear_states) const +Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows, bool clear_states, AggregateStateType type) const { - /// proton: starts. - if (!data_variants.without_key) - { - data_variants.invalidate(); - return {}; - } - /// proton: ends. - + auto res_header = getHeader(final); size_t rows = 1; + auto && out_cols = prepareOutputBlockColumns(params, aggregate_functions, res_header, data_variants.aggregates_pools, final, rows); + auto && [key_columns, raw_key_columns, aggregate_columns, final_aggregate_columns, aggregate_columns_data] = out_cols; - auto filler = [&data_variants, this]( - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - bool final_, - bool clear_states_) - { - if (data_variants.type == AggregatedDataVariants::Type::without_key || params.overflow_row) - { - AggregatedDataWithoutKey & data = data_variants.without_key; + /// TODO: support overflow row ? + assert(!is_overflows); + assert(!params.overflow_row); + assert(data_variants.type == AggregatedDataVariants::Type::without_key); - if (!data) - throw Exception("Wrong data variant passed.", ErrorCodes::LOGICAL_ERROR); + if ((type == AggregateStateType::OnlyUpdated && !UpdatedDataEx::isUpdated(data_variants.without_key)) + || (type == AggregateStateType::OnlyRetracted && !RetractedDataEx::hasRetracted(data_variants.without_key))) + return res_header.cloneEmpty(); - if (!final_) - { - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_columns[i]->push_back(data + offsets_of_aggregate_states[i]); + AggregatedDataWithoutKey & data = [&]() -> AggregateDataPtr & { + if (type == AggregateStateType::OnlyUpdated) + { + UpdatedDataEx::resetUpdated( data_variants.without_key); + return data_variants.without_key; + } + else if (type == AggregateStateType::OnlyRetracted) + return RetractedDataEx::getRetracted(data_variants.without_key); + else + return data_variants.without_key; + }(); - /// proton: starts - if (clear_states_) - data = nullptr; - /// proton: ends - } - else - { - /// Always single-thread. It's safe to pass current arena from 'aggregates_pool'. 
- insertAggregatesIntoColumns(data, final_aggregate_columns, data_variants.aggregates_pool); - } + if (!data) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong data variant passed."); - if (params.overflow_row) - for (size_t i = 0; i < params.keys_size; ++i) - key_columns[i]->insertDefault(); - } - }; + if (!final) + { + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_columns_data[i]->push_back(data + offsets_of_aggregate_states[i]); + data = nullptr; + } + else + { + /// Always single-thread. It's safe to pass current arena from 'aggregates_pool'. + insertAggregatesIntoColumns(data, final_aggregate_columns, getArena(data_variants, type), clear_states); + } - Block block = prepareBlockAndFill(data_variants, final, clear_states, rows, filler); + Block block = finalizeBlock(params, res_header, std::move(out_cols), final, rows); if (is_overflows) block.info.is_overflows = true; @@ -2106,143 +1891,65 @@ Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_va return block; } -Block Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states) const +Block Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, AggregateStateType type) const { - size_t rows = data_variants.sizeWithoutOverflowRow(); - - auto filler = [&data_variants, this]( - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - bool final_, - bool clear_states_) - { - #define M(NAME) \ - else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ - convertToBlockImpl(*data_variants.NAME, data_variants.NAME->data, \ - key_columns, aggregate_columns, final_aggregate_columns, data_variants.aggregates_pool, final_, clear_states_); - - if (false) {} // NOLINT - APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M) - #undef M - else - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - }; + const size_t rows = data_variants.sizeWithoutOverflowRow(); +#define M(NAME) \ + else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ + return convertToBlockImpl(*data_variants.NAME, data_variants.NAME->data, getArena(data_variants, type), data_variants.aggregates_pools, final, rows, clear_states, type); - return prepareBlockAndFill(data_variants, final, clear_states, rows, filler); + if (false) {} // NOLINT + APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) +#undef M + else throw Exception(ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT, "Unknown aggregated data variant."); } - -BlocksList Aggregator::prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, size_t max_threads, bool clear_states) const +BlocksList Aggregator::prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads, AggregateStateType type) const { + /// TODO Make a custom threshold. + /// TODO Use the shared thread pool with the `merge` function. std::unique_ptr thread_pool; - if (max_threads > 1 && data_variants.sizeWithoutOverflowRow() > 100000 /// TODO Make a custom threshold. - && data_variants.isStaticBucketTwoLevel()) /// TODO Use the shared thread pool with the `merge` function. 
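/// A hedged usage sketch of the AggregateStateType parameter on the conversion paths above (the
/// calls are illustrative, only the signatures come from this diff). Passing clear_states = false
/// keeps the streaming aggregation states alive for later emits:
///
///     /// full snapshot of all groups
///     Block all     = prepareBlockAndFillSingleLevel(variants, /*final*/ true, /*clear_states*/ false, AggregateStateType::Normal);
///     /// only groups whose states changed since the last finalization
///     Block changed = prepareBlockAndFillSingleLevel(variants, /*final*/ true, /*clear_states*/ false, AggregateStateType::OnlyUpdated);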
+ if (max_threads > 1 && data_variants.sizeWithoutOverflowRow() > 100000 + && final && type == AggregateStateType::Normal) /// use single thread for non-final or retracted data or updated data thread_pool = std::make_unique(max_threads); + if (false) {} // NOLINT #define M(NAME) \ else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ - return prepareBlocksAndFillTwoLevelImpl(data_variants, *data_variants.NAME, final, clear_states, thread_pool.get()); + return prepareBlocksAndFillTwoLevelImpl(data_variants, *data_variants.NAME, final, clear_states, thread_pool.get(), type); - if (false) {} // NOLINT APPLY_FOR_VARIANTS_ALL_TWO_LEVEL(M) #undef M else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); } - template BlocksList Aggregator::prepareBlocksAndFillTwoLevelImpl( AggregatedDataVariants & data_variants, Method & method, bool final, bool clear_states, - ThreadPool * thread_pool) const + ThreadPool * thread_pool, + AggregateStateType type) const { - size_t max_threads = thread_pool ? thread_pool->getMaxThreads() : 1; - /// proton FIXME : separate final vs non-final converting. For non-final converting, we don't need - /// each arena for each thread. - for (size_t i = data_variants.aggregates_pools.size(); i < max_threads; ++i) - data_variants.aggregates_pools.push_back(std::make_shared()); - - auto buckets = method.data.buckets(); - std::atomic next_bucket_idx_to_merge = 0; - - auto converter = [&](size_t thread_id, ThreadGroupStatusPtr thread_group) - { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); - if (thread_group) - CurrentThread::attachToIfDetached(thread_group); - - BlocksList blocks; - while (true) - { - UInt32 bucket_idx = next_bucket_idx_to_merge.fetch_add(1); - - if (bucket_idx >= buckets.size()) - break; - - auto bucket = buckets[bucket_idx]; - if (method.data.impls[bucket].empty()) - continue; - - /// Select Arena to avoid race conditions - Arena * arena = data_variants.aggregates_pools.at(thread_id).get(); - blocks.emplace_back(convertOneBucketToBlockImpl(data_variants, method, arena, final, clear_states, bucket)); - } - return blocks; - }; - - /// packaged_task is used to ensure that exceptions are automatically thrown into the main stream. - - std::vector> tasks(max_threads); - - try - { - for (size_t thread_id = 0; thread_id < max_threads; ++thread_id) - { - tasks[thread_id] = std::packaged_task( - [group = CurrentThread::getGroup(), thread_id, &converter] { return converter(thread_id, group); }); - - if (thread_pool) - thread_pool->scheduleOrThrowOnError([thread_id, &tasks] { tasks[thread_id](); }); - else - tasks[thread_id](); - } - } - catch (...) - { - /// If this is not done, then in case of an exception, tasks will be destroyed before the threads are completed, and it will be bad. 
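/// Sketch of the conversion threading policy introduced above (the helper name pick_threads is
/// hypothetical; the thresholds mirror the guard in this hunk):
///
///     size_t pick_threads(size_t max_threads, size_t result_rows, bool final, AggregateStateType type)
///     {
///         /// Parallel conversion only pays off for large, final, full (Normal) conversions;
///         /// non-final, OnlyUpdated and OnlyRetracted conversions stay single threaded.
///         if (max_threads > 1 && result_rows > 100000 && final && type == AggregateStateType::Normal)
///             return max_threads;
///         return 1;
///     }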
- if (thread_pool) - thread_pool->wait(); - - throw; - } - - if (thread_pool) - thread_pool->wait(); - - BlocksList blocks; - - for (auto & task : tasks) - { - if (!task.valid()) - continue; - - blocks.splice(blocks.end(), task.get_future().get()); - } - - return blocks; + return concurrentBucketConvert( + thread_pool, + method.data.buckets(), + getArena(data_variants, type), + data_variants.aggregates_pools, + [&](Int64 bucket, Arena * arena) -> BlocksList { + /// Skip no changed bucket if only updated is requested + if (type == AggregateStateType::OnlyUpdated && !method.data.isUpdatedBucket(bucket)) + return {}; + + return {convertOneBucketToBlockImpl(data_variants, method, arena, final, clear_states, bucket, type)}; + }); } - -BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, bool final, ConvertAction action, size_t max_threads) const +BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const { - LOG_TRACE(log, "Converting aggregated data to blocks"); + LOG_DEBUG(log, "Converting aggregated data to blocks"); Stopwatch watch; @@ -2252,29 +1959,19 @@ BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, b if (data_variants.empty()) return blocks; - bool clear_states = shouldClearStates(action, final); - - if (data_variants.without_key) - /// When without_key is setup, it doesn't necessary mean no GROUP BY keys, it may be overflow - blocks.emplace_back(prepareBlockAndFillWithoutKey( - data_variants, final, data_variants.type != AggregatedDataVariants::Type::without_key, clear_states)); + if (unlikely(params.overflow_row)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - if (data_variants.type != AggregatedDataVariants::Type::without_key) - { - if (data_variants.isTwoLevel()) - blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, max_threads, clear_states)); - else - blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states)); - } + if (data_variants.type == AggregatedDataVariants::Type::without_key) + blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, false, clear_states)); + else if (!data_variants.isTwoLevel()) + blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states)); + else + blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, max_threads)); /// proton: starts. if (clear_states) - { - /// `data_variants` will not destroy the states of aggregate functions in the destructor, - /// since already cleared up in `prepareBlocksAndFill...()` - data_variants.aggregator = nullptr; - clearDataVariants(data_variants); - } + data_variants.reset(); /// proton: ends size_t rows = 0; @@ -2287,7 +1984,7 @@ BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, b } double elapsed_seconds = watch.elapsedSeconds(); - LOG_INFO(log, + LOG_DEBUG(log, "Converted aggregated data to blocks. {} rows, {} in {} sec. 
({:.3f} rows/sec., {}/sec.)", rows, ReadableSize(bytes), elapsed_seconds, rows / elapsed_seconds, @@ -2296,7 +1993,6 @@ BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, b return blocks; } - template void NO_INLINE Aggregator::mergeDataNullKey( Table & table_dst, @@ -2352,7 +2048,6 @@ void NO_INLINE Aggregator::mergeDataImpl( auto func = [&](AggregateDataPtr & __restrict dst, AggregateDataPtr & __restrict src, bool inserted) { - /// proton: starts if (inserted) { /// If there are multiple sources, there are more than one AggregatedDataVariant. Aggregator always creates a new AggregatedDataVariant and merge all other @@ -2360,72 +2055,12 @@ void NO_INLINE Aggregator::mergeDataImpl( /// If it does not alloc new memory for the 'dst' (i.e. aggregate state of the new AggregatedDataVariant which get destroyed after finalize()) but reuse /// that from the 'src' to store the final aggregated result, it will cause the data from other AggregatedDataVariant will be merged multiple times and /// generate incorrect aggregated result. - dst = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); -#if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - { - const auto & compiled_aggregate_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; - compiled_aggregate_functions.create_aggregate_states_function(dst); - if (compiled_aggregate_functions.functions_count != aggregate_functions.size()) - { - static constexpr bool skip_compiled_aggregate_functions = true; - createAggregateStates(dst); - } - -#if defined(MEMORY_SANITIZER) - - /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place. - for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index) - { - if (!is_aggregate_function_compiled[aggregate_function_index]) - continue; - - auto aggregate_data_with_offset = dst + offsets_of_aggregate_states[aggregate_function_index]; - auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData(); - __msan_unpoison(aggregate_data_with_offset, data_size); - } -#endif - } - else -#endif - { - createAggregateStates(dst); - } - } - /// proton: ends - -#if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - { - const auto & compiled_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; - compiled_functions.merge_aggregate_states_function(dst, src); - - if (compiled_aggregate_functions_holder->compiled_aggregate_functions.functions_count != params.aggregates_size) - { - for (size_t i = 0; i < params.aggregates_size; ++i) - { - if (!is_aggregate_function_compiled[i]) - aggregate_functions[i]->merge(dst + offsets_of_aggregate_states[i], src + offsets_of_aggregate_states[i], arena); - } - -// for (size_t i = 0; i < params.aggregates_size; ++i) -// { -// /// proton: starts -// if (!is_aggregate_function_compiled[i] && !params.streaming) -// aggregate_functions[i]->destroy(src + offsets_of_aggregate_states[i]); -// /// proton: ends -// } - } - } - else -#endif - { - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->merge(dst + offsets_of_aggregate_states[i], src + offsets_of_aggregate_states[i], arena); + auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + dst = aggregate_data; } - if (clear_states) - destroyAggregateStates(src); + 
mergeAggregateStates(dst, src, arena, clear_states); }; if constexpr (std::is_same_v) @@ -2433,152 +2068,65 @@ void NO_INLINE Aggregator::mergeDataImpl( else table_src.mergeToViaEmplace(table_dst, func, std::move(key_handler)); + /// In order to release memory early. if (clear_states) table_src.clearAndShrink(); /// proton: ends } - -template -void NO_INLINE Aggregator::mergeDataNoMoreKeysImpl( - Table & table_dst, - AggregatedDataWithoutKey & overflows, - Table & table_src, - Arena * arena, - bool clear_states) const +void NO_INLINE Aggregator::mergeWithoutKeyDataImpl(ManyAggregatedDataVariants & non_empty_data, bool clear_states) const { - /// Note : will create data for NULL key if not exist - if constexpr (Method::low_cardinality_optimization) - mergeDataNullKey(table_dst, table_src, arena, clear_states); + AggregatedDataVariantsPtr & res = non_empty_data[0]; - table_src.mergeToViaFind(table_dst, [&](AggregateDataPtr dst, AggregateDataPtr & src, bool found) + /// We merge all aggregation results to the first. + for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) { - AggregateDataPtr res_data = found ? dst : overflows; - - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->merge( - res_data + offsets_of_aggregate_states[i], - src + offsets_of_aggregate_states[i], - arena); + AggregatedDataVariants & current = *non_empty_data[result_num]; + mergeAggregateStates(res->without_key, current.without_key, res->aggregates_pool, clear_states); - /// proton : starts + /// In order to release memory early. if (clear_states) - destroyAggregateStates(src); - /// proton : ends - }); - - if (clear_states) - table_src.clearAndShrink(); + current.reset(); + } } -template -void NO_INLINE Aggregator::mergeDataOnlyExistingKeysImpl( - Table & table_dst, - Table & table_src, - Arena * arena, - bool clear_states) const +template +void NO_INLINE Aggregator::mergeSingleLevelDataImpl(ManyAggregatedDataVariants & non_empty_data, bool clear_states) const { - /// Note : will create data for NULL key if not exist - if constexpr (Method::low_cardinality_optimization) - mergeDataNullKey(table_dst, table_src, arena, clear_states); + AggregatedDataVariantsPtr & res = non_empty_data[0]; + bool no_more_keys = false; - table_src.mergeToViaFind(table_dst, - [&](AggregateDataPtr dst, AggregateDataPtr & src, bool found) + /// We merge all aggregation results to the first. + for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) { - if (!found) - return; - - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->merge( - dst + offsets_of_aggregate_states[i], - src + offsets_of_aggregate_states[i], - arena); - - /// proton : starts - if (clear_states) - destroyAggregateStates(src); - }); - - if (clear_states) - table_src.clearAndShrink(); - /// proton : ends -} - - -void NO_INLINE Aggregator::mergeWithoutKeyDataImpl(ManyAggregatedDataVariants & non_empty_data, bool clear_states) const -{ - AggregatedDataVariantsPtr & res = non_empty_data[0]; - - /// We merge all aggregation results to the first. - for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) - { - /// proton: starts. - mergeAggregateStates(res->without_key, non_empty_data[result_num]->without_key, res->aggregates_pool, clear_states); - /// proton: ends. 
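/// A hedged sketch of the bucket-level skip that the OnlyUpdated path enables for two-level
/// (bucketed) tables; isUpdatedBucket and convertOneBucketToBlockImpl are names used in this
/// diff, while the explicit loop is only an illustration of what the concurrentBucketConvert
/// callback does per bucket:
///
///     BlocksList delta;
///     for (auto bucket : method.data.buckets())
///     {
///         if (!method.data.isUpdatedBucket(bucket))
///             continue; /// no group in this bucket changed since the last emit
///
///         delta.emplace_back(convertOneBucketToBlockImpl(
///             data_variants, method, arena, /*final*/ true, /*clear_states*/ false, bucket,
///             AggregateStateType::OnlyUpdated));
///     }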
- } -} - - -template -void NO_INLINE Aggregator::mergeSingleLevelDataImpl(ManyAggregatedDataVariants & non_empty_data, bool clear_states) const -{ - AggregatedDataVariantsPtr & res = non_empty_data[0]; - bool no_more_keys = false; - - /// We merge all aggregation results to the first. - for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) - { - if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys)) - break; + if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys)) + break; AggregatedDataVariants & current = *non_empty_data[result_num]; - if (!no_more_keys) - { + assert(!no_more_keys); #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - mergeDataImpl( - getDataVariant(*res).data, - getDataVariant(current).data, - res->aggregates_pool, - clear_states); - } - else -#endif - { - mergeDataImpl( - getDataVariant(*res).data, - getDataVariant(current).data, - res->aggregates_pool, - clear_states); - } - } - else if (res->without_key) + if (compiled_aggregate_functions_holder) { - mergeDataNoMoreKeysImpl( + mergeDataImpl( getDataVariant(*res).data, - res->without_key, getDataVariant(current).data, res->aggregates_pool, clear_states); } else + #endif { - mergeDataOnlyExistingKeysImpl( + mergeDataImpl( getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool, clear_states); } + /// In order to release memory early. if (clear_states) - { - /// `current` will not destroy the states of aggregate functions in the destructor, - /// since already cleared up in `mergeData...Impl()` - current.aggregator = nullptr; - clearDataVariants(current); - } + current.reset(); } } @@ -2588,9 +2136,98 @@ void NO_INLINE Aggregator::mergeSingleLevelDataImpl(ManyAggregatedDataVariants & APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M) #undef M + +BlocksList +Aggregator::mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const +{ + auto prepared_data_ptr = prepareVariantsToMerge(data_variants); + if (prepared_data_ptr->empty()) + return {}; + + if (unlikely(params.overflow_row)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); + + BlocksList blocks; + auto & first = *prepared_data_ptr->at(0); + if (first.type == AggregatedDataVariants::Type::without_key) + { + mergeWithoutKeyDataImpl(*prepared_data_ptr, clear_states); + blocks.emplace_back(prepareBlockAndFillWithoutKey(first, final, false, clear_states)); + } + else if (!first.isTwoLevel()) + { + if (false) { } // NOLINT +#define M(NAME) \ + else if (first.type == AggregatedDataVariants::Type::NAME) \ + mergeSingleLevelDataImpl(*prepared_data_ptr, clear_states); + + APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M) +#undef M + else throw Exception("Unknown single level aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); + + blocks.emplace_back(prepareBlockAndFillSingleLevel(first, final, clear_states)); + } + else + { + auto total_size = std::accumulate(prepared_data_ptr->begin(), prepared_data_ptr->end(), 0ull, [](size_t size, const auto & variants) { + return size + variants->sizeWithoutOverflowRow(); + }); + /// TODO Make a custom threshold. + /// TODO Use the shared thread pool with the `merge` function. 
+ std::unique_ptr thread_pool; + if (max_threads > 1 && total_size > 100000 && final) + thread_pool = std::make_unique(max_threads); + + if (false) { } // NOLINT +#define M(NAME) \ + else if (first.type == AggregatedDataVariants::Type::NAME) \ + blocks = mergeAndConvertTwoLevelToBlocksImpl(*prepared_data_ptr, final, clear_states, thread_pool.get()); + + APPLY_FOR_VARIANTS_ALL_TWO_LEVEL(M) +#undef M + else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); + } + + if (clear_states) + { + for (auto & variants : *prepared_data_ptr) + variants->reset(); + } + + return blocks; +} + +template +BlocksList Aggregator::mergeAndConvertTwoLevelToBlocksImpl( + ManyAggregatedDataVariants & non_empty_data, bool final, bool clear_states, ThreadPool * thread_pool) const +{ + auto & first = *non_empty_data.at(0); + + std::vector buckets; + if (first.isStaticBucketTwoLevel()) + buckets = getDataVariant(first).data.buckets(); + else + { + assert(first.isTimeBucketTwoLevel()); + std::unordered_set buckets_set; + for (auto & data_variants : non_empty_data) + { + auto tmp_buckets = getDataVariant(*data_variants).data.buckets(); + buckets_set.insert(tmp_buckets.begin(), tmp_buckets.end()); + } + buckets.assign(buckets_set.begin(), buckets_set.end()); + } + + return concurrentBucketConvert( + thread_pool, buckets, first.aggregates_pool, first.aggregates_pools, [&](Int64 bucket, Arena * arena) -> BlocksList { + mergeBucketImpl(non_empty_data, bucket, arena, clear_states); + return {convertOneBucketToBlockImpl(first, getDataVariant(first), arena, final, clear_states, bucket)}; + }); +} + template void NO_INLINE Aggregator::mergeBucketImpl( - ManyAggregatedDataVariants & data, bool final, bool clear_states, Int64 bucket, Arena * arena, std::atomic * is_cancelled) const + ManyAggregatedDataVariants & data, Int64 bucket, Arena * arena, bool clear_states, std::atomic * is_cancelled) const { /// We merge all aggregation results to the first. AggregatedDataVariantsPtr & res = data[0]; @@ -2618,6 +2255,9 @@ void NO_INLINE Aggregator::mergeBucketImpl( arena, clear_states); } + + /// Assume the current bucket has been finalized. + getDataVariant(current).data.resetUpdated(bucket); } } @@ -2731,6 +2371,7 @@ void NO_INLINE Aggregator::mergeStreamsImplCase( auto emplace_result = state.emplaceKey(data, i, *aggregates_pool); if (emplace_result.isInserted()) { + /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. 
emplace_result.setMapped(nullptr); aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); @@ -2974,7 +2615,7 @@ void Aggregator::mergeBlocks(BucketToBlocks bucket_to_blocks, AggregatedDataVari } -Block Aggregator::mergeBlocks(BlocksList & blocks, bool final, ConvertAction action) +Block Aggregator::mergeBlocks(BlocksList & blocks, bool final, bool clear_states, bool only_updated) { if (blocks.empty()) return {}; @@ -3036,7 +2677,6 @@ Block Aggregator::mergeBlocks(BlocksList & blocks, bool final, ConvertAction act throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); } - bool clear_states = shouldClearStates(action, final); Block block; if (result.type == AggregatedDataVariants::Type::without_key || is_overflows) block = prepareBlockAndFillWithoutKey(result, final, is_overflows, clear_states); @@ -3321,33 +2961,40 @@ std::vector Aggregator::bucketsBefore(const AggregatedDataVariants & resu /// 1) The keys can reside in hashmap or in arena /// 2) The state can reside in arena or in the aggregation function /// And there is a special one which is group without key -void Aggregator::checkpoint(const AggregatedDataVariants & data_variants, WriteBuffer & wb) +void Aggregator::checkpoint(const AggregatedDataVariants & data_variants, WriteBuffer & wb) const { + auto version = getVersion(); /// Serialization layout /// [version] + [states layout] - VersionType version = getVersion(); writeIntBinary(version, wb); if (version <= 1) - return doCheckpointLegacy(data_variants, wb); + return const_cast(this)->doCheckpointLegacy(data_variants, wb); - return doCheckpoint(data_variants, wb); + if (version <= 2) + return doCheckpointV2(data_variants, wb); + else + return doCheckpointV3(data_variants, wb); } -void Aggregator::recover(AggregatedDataVariants & data_variants, ReadBuffer & rb) +void Aggregator::recover(AggregatedDataVariants & data_variants, ReadBuffer & rb) const { /// Serialization layout /// [version] + [states layout] VersionType recovered_version = 0; readIntBinary(recovered_version, rb); + assert(recovered_version <= getVersion()); + /// So far, no broken changes from `recovered_version` to `version`. 
/// FIXME: Legacy layout needs to be cleaned after no use if (recovered_version <= 1) - return doRecoverLegacy(data_variants, rb); + return const_cast(this)->doRecoverLegacy(data_variants, rb); - /// Recover STATE V2 - return doRecover(data_variants, rb); + if (recovered_version <= 2) + return doRecoverV2(data_variants, rb); + else + return doRecoverV3(data_variants, rb); } void Aggregator::doCheckpointLegacy(const AggregatedDataVariants & data_variants, WriteBuffer & wb) @@ -3370,7 +3017,7 @@ void Aggregator::doCheckpointLegacy(const AggregatedDataVariants & data_variants /// FIXME, set a good max_threads /// For ConvertAction::Checkpoint, don't clear state `data_variants` - auto blocks = convertToBlocks(const_cast(data_variants), false, ConvertAction::Checkpoint, 8); + auto blocks = convertToBlocks(const_cast(data_variants), false, false, 8); /// assert(!blocks.empty()); @@ -3614,7 +3261,7 @@ void Aggregator::recoverStatesTwoLevel(AggregatedDataVariants & data_variants, B /// The complexity of checkpoint the state of Aggregator is a combination of the following 2 cases /// 1) without key states (without_key or overflow rows) /// 2) hash table states -void Aggregator::doCheckpoint(const AggregatedDataVariants & data_variants, WriteBuffer & wb) +void Aggregator::doCheckpointV2(const AggregatedDataVariants & data_variants, WriteBuffer & wb) const { /// Serialization layout, there are 2 cases: /// 1) Without key: [uint8][uint16][aggr-func-state-without-key] @@ -3652,7 +3299,7 @@ void Aggregator::doCheckpoint(const AggregatedDataVariants & data_variants, Writ else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); } -void Aggregator::doRecover(AggregatedDataVariants & data_variants, ReadBuffer & rb) +void Aggregator::doRecoverV2(AggregatedDataVariants & data_variants, ReadBuffer & rb) const { UInt8 inited = 0; readIntBinary(inited, rb); @@ -3687,61 +3334,24 @@ void Aggregator::doRecover(AggregatedDataVariants & data_variants, ReadBuffer & if (is_two_level && !data_variants.isTwoLevel()) data_variants.convertToTwoLevel(); - bool use_string_hash_map = data_variants.type == AggregatedDataVariants::Type::key_string - || data_variants.type == AggregatedDataVariants::Type::key_string_two_level - || data_variants.type == AggregatedDataVariants::Type::key_fixed_string - || data_variants.type == AggregatedDataVariants::Type::key_fixed_string_two_level; - /// [aggr-func-state-in-hash-map] if (false) { } // NOLINT #define M(NAME, IS_TWO_LEVEL) \ - else if (data_variants.type == AggregatedDataVariants::Type::NAME) { \ - if (use_string_hash_map) \ - DB::deserializeHashMap(data_variants.NAME->data, [this](auto & mapped, Arena & pool, ReadBuffer & rb_) { deserializeAggregateStates(mapped, rb_, &pool); }, *data_variants.aggregates_pool, rb); \ - else \ - DB::deserializeHashMap(data_variants.NAME->data, [this](auto & mapped, Arena & pool, ReadBuffer & rb_) { deserializeAggregateStates(mapped, rb_, &pool); }, *data_variants.aggregates_pool, rb); \ - } + else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ + DB::deserializeHashMap(data_variants.NAME->data, [this](auto & mapped, Arena & pool, ReadBuffer & rb_) { deserializeAggregateStates(mapped, rb_, &pool); }, *data_variants.aggregates_pool, rb); APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) #undef M else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); } -bool Aggregator::shouldClearStates(ConvertAction action, bool final_) const -{ - /// For streaming 
processing, data_variants.aggregator will never be nullptr once set - /// and we will never move the ownership of the states to `ColumnAggregateFunction` - /// unless we don't need keep the states - - switch (action) - { - case ConvertAction::DistributedMerge: - /// Distributed processing case. Only clear states on initiator - return final_; - case ConvertAction::WriteToTmpFS: - /// We are dumping all states to file system in case of memory is not efficient - /// In this case, we should not keep the states - return true; - case ConvertAction::Checkpoint: - /// Checkpoint is snapshot of in-memory states, we shall not clear the states - return false; - case ConvertAction::InternalMerge: - return false; - case ConvertAction::RetractedEmit: - return true; - case ConvertAction::StreamingEmit: - [[fallthrough]]; - default: - /// By default, streaming processing needs hold on to the states - return !params.keep_state; - } -} - VersionType Aggregator::getVersionFromRevision(UInt64 revision) const { - if (revision >= STATE_V2_MIN_REVISION) + if (revision >= STATE_V3_MIN_REVISION) + return static_cast(3); + else if (revision >= STATE_V2_MIN_REVISION) return static_cast(2); else throw Exception( @@ -3757,10 +3367,9 @@ template void NO_INLINE Aggregator::spliceBucketsImpl( AggregatedDataVariants & data_dest, AggregatedDataVariants & data_src, - bool final, - bool clear_states, const std::vector & gcd_buckets, - Arena * arena) const + Arena * arena, + bool clear_states) const { /// In order to merge state with same other keys of different gcd buckets, reset the window group keys to zero /// create a new key, where the window key part is 0, and the other key parts are the same as the original value. @@ -3809,22 +3418,24 @@ void NO_INLINE Aggregator::spliceBucketsImpl( } Block Aggregator::spliceAndConvertBucketsToBlock( - AggregatedDataVariants & variants, bool final, ConvertAction action, const std::vector & gcd_buckets) const + AggregatedDataVariants & variants, bool final, bool clear_states, const std::vector & gcd_buckets) const { - AggregatedDataVariants result_variants; - result_variants.aggregator = this; - initDataVariants(result_variants, method_chosen, key_sizes, params); - initStatesForWithoutKeyOrOverflow(result_variants); - - auto method = result_variants.type; - Arena * arena = result_variants.aggregates_pool; - bool clear_states = shouldClearStates(action, final); + assert(variants.isTimeBucketTwoLevel()); + if (false) {} // NOLINT #define M(NAME) \ - else if (method == AggregatedDataVariants::Type::NAME) \ + else if (variants.type == AggregatedDataVariants::Type::NAME) \ { \ - spliceBucketsImpl(result_variants, variants, final, clear_states, gcd_buckets, arena); \ - return convertOneBucketToBlockImpl(result_variants, *result_variants.NAME, arena, final, clear_states, 0); \ + if (gcd_buckets.size() > 1) \ + { \ + AggregatedDataVariants result_variants; \ + result_variants.aggregator = this; \ + initDataVariants(result_variants, method_chosen, key_sizes, params); \ + spliceBucketsImpl(result_variants, variants, gcd_buckets, result_variants.aggregates_pool, clear_states); \ + return convertOneBucketToBlockImpl(result_variants, *result_variants.NAME, result_variants.aggregates_pool, final, /*clear_states*/ true, 0); \ + } \ + else \ + return convertOneBucketToBlockImpl(variants, *variants.NAME, variants.aggregates_pool, final, clear_states, gcd_buckets[0]); \ } APPLY_FOR_VARIANTS_TIME_BUCKET_TWO_LEVEL(M) @@ -3836,30 +3447,31 @@ Block Aggregator::spliceAndConvertBucketsToBlock( } Block 
Aggregator::mergeAndSpliceAndConvertBucketsToBlock( - ManyAggregatedDataVariants & variants, bool final, ConvertAction action, const std::vector & gcd_buckets) const + ManyAggregatedDataVariants & variants, bool final, bool clear_states, const std::vector & gcd_buckets) const { - auto prepared_data = prepareVariantsToMerge(variants); + bool need_splice = gcd_buckets.size() > 1; + auto prepared_data = prepareVariantsToMerge(variants, /*always_merge_into_empty*/ need_splice); if (prepared_data->empty()) return {}; - AggregatedDataVariants result_variants; - result_variants.aggregator = this; - initDataVariants(result_variants, method_chosen, key_sizes, params); - initStatesForWithoutKeyOrOverflow(result_variants); - - auto method = result_variants.type; - Arena * arena = result_variants.aggregates_pool; - bool clear_states = shouldClearStates(action, final); + auto & first = *prepared_data->at(0); + assert(first.isTimeBucketTwoLevel()); + Arena * arena = first.aggregates_pool; if (false) {} // NOLINT #define M(NAME) \ - else if (method == AggregatedDataVariants::Type::NAME) \ + else if (first.type == AggregatedDataVariants::Type::NAME) \ { \ - using Method = decltype(result_variants.NAME)::element_type; \ + using Method = decltype(first.NAME)::element_type; \ for (auto bucket : gcd_buckets) \ - mergeBucketImpl(*prepared_data, final, clear_states, bucket, arena); \ - spliceBucketsImpl(result_variants, *prepared_data->at(0), final, clear_states, gcd_buckets, arena); \ - return convertOneBucketToBlockImpl(result_variants, *result_variants.NAME, arena, final, clear_states, 0); \ + mergeBucketImpl(*prepared_data, bucket, arena, clear_states); \ + if (need_splice) \ + { \ + spliceBucketsImpl(first, first, gcd_buckets, arena, /*clear_states*/ true); \ + return convertOneBucketToBlockImpl(first, *first.NAME, arena, final, /*clear_states*/ true, 0); \ + } \ + else \ + return convertOneBucketToBlockImpl(first, *first.NAME, arena, final, clear_states, gcd_buckets[0]); \ } APPLY_FOR_VARIANTS_TIME_BUCKET_TWO_LEVEL(M) @@ -3870,105 +3482,320 @@ Block Aggregator::mergeAndSpliceAndConvertBucketsToBlock( UNREACHABLE(); } -template -bool Aggregator::executeAndRetractImpl( - Method & method, - Arena * aggregates_pool, - Method & retracted_method, - Arena * retracted_pool, - size_t row_begin, - size_t row_end, - ColumnRawPtrs & key_columns, - AggregateFunctionInstruction * aggregate_instructions) const +void Aggregator::mergeAggregateStates(AggregateDataPtr & dst, AggregateDataPtr & src, Arena * arena, bool clear_states) const { - typename Method::State state(key_columns, key_sizes, aggregation_state_cache); - typename Method::State retracted_state(key_columns, key_sizes, nullptr); + assert(src); + assert(dst); - /// Optimization for special case when there are no aggregate functions. - if (params.aggregates_size == 0) + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->merge(dst + offsets_of_aggregate_states[i], src + offsets_of_aggregate_states[i], arena); + + if (clear_states) + destroyAggregateStates(src); +} + +void Aggregator::destroyAggregateStates(AggregateDataPtr & place) const +{ + if (place) { - if (params.delta_col_pos >= 0) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Changelog aggregating must have aggregate functions"); + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->destroy(place + offsets_of_aggregate_states[i]); - /// For all rows. 
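/// A hedged usage sketch of the two state helpers introduced above (dst_place / src_place are
/// illustrative variables; the helper signatures are the ones in this diff):
///
///     /// Fold src into dst; when clear_states is true the src states are destroyed
///     /// and src is reset to nullptr so a destructor will not touch them again.
///     mergeAggregateStates(dst_place, src_place, aggregates_pool, /*clear_states*/ true);
///
///     /// Explicit teardown of a state that will not be reused any more.
///     destroyAggregateStates(dst_place);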
- AggregateDataPtr place = aggregates_pool->alloc(0); - for (size_t i = row_begin; i < row_end; ++i) - { - auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); - if (emplace_result.isInserted()) - { - emplace_result.setMapped(place); - /// Only add new key - retracted_state.emplaceKey(retracted_method.data, i, *retracted_pool).setMapped(place); - } - } - return false; + place = nullptr; } +} - bool need_finalization = false; - - /// NOTE: only row_end-row_start is required, but: - /// - this affects only optimize_aggregation_in_order, - /// - this is just a pointer, so it should not be significant, - /// - and plus this will require other changes in the interface. - std::unique_ptr places(new AggregateDataPtr[row_end]); +void Aggregator::serializeAggregateStates(const AggregateDataPtr & place, WriteBuffer & wb) const +{ + UInt8 has_states = place ? 1 : 0; + writeIntBinary(has_states, wb); + if (has_states) + { + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->serialize(place + offsets_of_aggregate_states[i], wb); + } +} - /// For all rows. - for (size_t i = row_begin; i < row_end; ++i) +void Aggregator::deserializeAggregateStates(AggregateDataPtr & place, ReadBuffer & rb, Arena * arena) const +{ + UInt8 has_states; + readIntBinary(has_states, rb); + if (has_states) { - AggregateDataPtr aggregate_data = nullptr; + if (!place) + { + /// Allocate states for all aggregate functions + AggregateDataPtr aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + place = aggregate_data; + } - auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->deserialize(place + offsets_of_aggregate_states[i], rb, std::nullopt, arena); + } +} - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (emplace_result.isInserted()) - { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. - emplace_result.setMapped(nullptr); +void Aggregator::doCheckpointV3(const AggregatedDataVariants & data_variants, WriteBuffer & wb) const +{ + /// Serialization layout, there are 2 cases: + /// 1) Without key: [uint8][uint16][aggr-func-state-without-key] + /// 2) Otherwise: [uint8][uint16][aggr-func-state-for-overflow-row][is_two_level][aggr-func-state-in-hash-map] + bool inited = !data_variants.empty(); + writeBoolText(inited, wb); + if (!inited) + return; /// No aggregated data yet - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - /// TODO: support use_compiled_functions - createAggregateStates(aggregate_data); - emplace_result.setMapped(aggregate_data); + writeIntBinary(static_cast(data_variants.type), wb); - /// Save new group without retracted state (used for emit new key group) - /// FIXME: There is a bug when use hash table (key8 or key16), it use a optimzed FixedImplicitZeroHashMap that the empty mapped directly means zero (i.e. invalid insertion). - /// But in retract group scenario, we need to use an empty mapped to represent no ratracted value for new group - /// Use a non-optimized FixedHashMap ? or revisit retract implementation ? 
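/// A minimal round-trip sketch for the serialize/deserialize helpers above;
/// WriteBufferFromOwnString and ReadBufferFromString are the usual in-memory buffers in this
/// code base, everything else here is illustrative:
///
///     DB::WriteBufferFromOwnString out;
///     serializeAggregateStates(place, out);            /// [has_states flag] + per-function state
///
///     AggregateDataPtr restored = nullptr;
///     DB::ReadBufferFromString in(out.str());
///     deserializeAggregateStates(restored, in, arena); /// allocates + createAggregateStates on demand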
- retracted_state.emplaceKey(retracted_method.data, i, *retracted_pool).setMapped(nullptr); - } - else + writeIntBinary(static_cast(expanded_data_type), wb); + + auto state_serializer = [this](auto place, auto & wb_) { + assert(place); + if (expanded_data_type == ExpandedDataType::UpdatedWithRetracted) { - aggregate_data = emplace_result.getMapped(); + UpdatedDataEx::serialize(place, wb_); - /// Save changed group with retracted state (used for emit changed group) - auto retracted_result = retracted_state.emplaceKey(retracted_method.data, i, *retracted_pool); - if (retracted_result.isInserted()) - { - retracted_result.setMapped(nullptr); - auto retracted_data = retracted_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(retracted_data); - /// Copy aggregate data to retracted data before changed - mergeAggregateStates(retracted_data, aggregate_data, retracted_pool, /*clear_states*/ false); - retracted_result.setMapped(retracted_data); - } + auto & retracted_place = RetractedDataEx::getRetracted(place); + bool has_retracted = retracted_place != nullptr; + writeBoolText(has_retracted, wb_); + if (has_retracted) + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->serialize(retracted_place + offsets_of_aggregate_states[i], wb_); } + else if (expanded_data_type == ExpandedDataType::Updated) + UpdatedDataEx::serialize(place, wb_); - assert(aggregate_data != nullptr); - places[i] = aggregate_data; - } + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->serialize(place + offsets_of_aggregate_states[i], wb_); + }; - /// Add values to the aggregate functions. - for (size_t i = 0; i < aggregate_functions.size(); ++i) - { - AggregateFunctionInstruction * inst = aggregate_instructions + i; + /// [aggr-func-state-without-key] + assert(!params.overflow_row); + if (data_variants.type == AggregatedDataVariants::Type::without_key) + state_serializer(data_variants.without_key, wb); - if (inst->offsets) - inst->batch_that->addBatchArray(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); - else - inst->batch_that->addBatch(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool, -1, inst->delta_column); + /// [aggr-func-state-in-hash-map] +#define M(NAME, IS_TWO_LEVEL) \ + else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ + { \ + if constexpr (IS_TWO_LEVEL) \ + DB::serializeTwoLevelHashMap(data_variants.NAME->data, [&](const auto & mapped, WriteBuffer & wb_) { state_serializer(mapped, wb_); }, wb); \ + else \ + DB::serializeHashMap(data_variants.NAME->data, [&](const auto & mapped, WriteBuffer & wb_) { state_serializer(mapped, wb_); }, wb); \ + } - if (inst->batch_that->isUserDefined()) + APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) +#undef M + else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); +} + +void Aggregator::doRecoverV3(AggregatedDataVariants & data_variants, ReadBuffer & rb) const +{ + bool inited = !data_variants.empty(); + readBoolText(inited, rb); + if (!inited) + return; + + UInt8 recovered_data_variants_type_uint8; + readIntBinary(recovered_data_variants_type_uint8, rb); + AggregatedDataVariants::Type recovered_data_variants_type = static_cast(recovered_data_variants_type_uint8); + + data_variants.aggregator = this; + initDataVariants(data_variants, method_chosen, key_sizes, params); + /// Data variants is inited with single level 
hashmap, however the checkpoint states are 2 levels + /// which means data variants was converted to two level + if (data_variants.type != recovered_data_variants_type) + if (data_variants.isConvertibleToTwoLevel()) + data_variants.convertToTwoLevel(); + + if (data_variants.type != recovered_data_variants_type) + throw Exception( + ErrorCodes::RECOVER_CHECKPOINT_FAILED, + "Failed to recover aggregation checkpoint. Aggregated data variant type is not compatible, checkpointed={}, current={}", + magic_enum::enum_name(recovered_data_variants_type), + magic_enum::enum_name(method_chosen)); + + UInt8 recovered_expanded_data_type_uint8; + readIntBinary(recovered_expanded_data_type_uint8, rb); + ExpandedDataType recovered_expanded_data_type = static_cast(recovered_expanded_data_type_uint8); + if (recovered_expanded_data_type != expanded_data_type) + throw Exception( + ErrorCodes::RECOVER_CHECKPOINT_FAILED, + "Failed to recover aggregation checkpoint. Expanded data type is not the same, checkpointed={}, current={}", + magic_enum::enum_name(recovered_expanded_data_type), + magic_enum::enum_name(expanded_data_type)); + + auto state_deserializer = [this](auto & place, auto & rb_, Arena * arena) { + place = nullptr; /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. + auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + place = aggregate_data; + + if (expanded_data_type == ExpandedDataType::UpdatedWithRetracted) + { + UpdatedDataEx::deserialize(place, rb_); + + auto & retracted = RetractedDataEx::getRetracted(place); + bool has_retracted = false; + readBoolText(has_retracted, rb_); + if (has_retracted) + { + auto tmp_retracted = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(tmp_retracted); + retracted = tmp_retracted; + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->deserialize(retracted + offsets_of_aggregate_states[i], rb_, std::nullopt, arena); + } + } + else if (expanded_data_type == ExpandedDataType::Updated) + UpdatedDataEx::deserialize(place, rb_); + + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->deserialize(place + offsets_of_aggregate_states[i], rb_, std::nullopt, arena); + }; + + /// [aggr-func-state-without-key] + assert(!params.overflow_row); + if (data_variants.type == AggregatedDataVariants::Type::without_key) + state_deserializer(data_variants.without_key, rb, data_variants.aggregates_pool); + + /// [aggr-func-state-in-hash-map] +#define M(NAME, IS_TWO_LEVEL) \ + else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ + { \ + if constexpr (IS_TWO_LEVEL) \ + DB::deserializeTwoLevelHashMap(data_variants.NAME->data, [&](auto & mapped, Arena & pool, ReadBuffer & rb_) { state_deserializer(mapped, rb_, &pool); }, *data_variants.aggregates_pool, rb); \ + else \ + DB::deserializeHashMap(data_variants.NAME->data, [&](auto & mapped, Arena & pool, ReadBuffer & rb_) { state_deserializer(mapped, rb_, &pool); }, *data_variants.aggregates_pool, rb); \ + } + + APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) +#undef M + else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); +} + +bool Aggregator::checkAndProcessResult(AggregatedDataVariants & result, bool & no_more_keys) const +{ + size_t result_size = result.sizeWithoutOverflowRow(); + Int64 current_memory_usage = 0; + if (auto * 
memory_tracker_child = CurrentThread::getMemoryTracker()) + if (auto * memory_tracker = memory_tracker_child->getParent()) + current_memory_usage = memory_tracker->get(); + + /// Here all the results in the sum are taken into account, from different threads. + Int64 result_size_bytes = current_memory_usage - memory_usage_before_aggregation; + + bool worth_convert_to_two_level = worthConvertToTwoLevel( + params.group_by_two_level_threshold, result_size, params.group_by_two_level_threshold_bytes, result_size_bytes); + + /** Converting to a two-level data structure. + * It allows you to make, in the subsequent, an effective merge - either economical from memory or parallel. + */ + if (result.isConvertibleToTwoLevel() && worth_convert_to_two_level) + result.convertToTwoLevel(); + + /// Checking the constraints. + if (!checkLimits(result_size, no_more_keys)) + return true; + + /** Flush data to disk if too much RAM is consumed. + * Data can only be flushed to disk if a two-level aggregation structure is used. + */ + if (params.max_bytes_before_external_group_by + && result.isTwoLevel() + && current_memory_usage > static_cast(params.max_bytes_before_external_group_by) + && worth_convert_to_two_level) + { + size_t size = current_memory_usage + params.min_free_disk_space; + + std::string tmp_path = params.tmp_volume->getDisk()->getPath(); + + // enoughSpaceInDirectory() is not enough to make it right, since + // another process (or another thread of aggregator) can consume all + // space. + // + // But true reservation (IVolume::reserve()) cannot be used here since + // current_memory_usage does not take compression into account and + // will reserve way more that actually will be used. + // + // Hence, let's do a simple check. + if (!enoughSpaceInDirectory(tmp_path, size)) + throw Exception("Not enough space for external aggregation in " + tmp_path, ErrorCodes::NOT_ENOUGH_SPACE); + + writeToTemporaryFile(result, tmp_path); + } + + return false; +} + +template +bool Aggregator::executeAndRetractImpl( + Method & method, + Arena * aggregates_pool, + Arena * retracted_pool, + size_t row_begin, + size_t row_end, + ColumnRawPtrs & key_columns, + AggregateFunctionInstruction * aggregate_instructions) const +{ + typename Method::State state(key_columns, key_sizes, aggregation_state_cache); + bool need_finalization = false; + + /// NOTE: only row_end-row_start is required, but: + /// - this affects only optimize_aggregation_in_order, + /// - this is just a pointer, so it should not be significant, + /// - and plus this will require other changes in the interface. + std::unique_ptr places(new AggregateDataPtr[row_end]); + + /// For all rows. + for (size_t i = row_begin; i < row_end; ++i) + { + AggregateDataPtr aggregate_data = nullptr; + + auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); + + /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. + if (emplace_result.isInserted()) + { + /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. 
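The `setMapped(nullptr)` / `createAggregateStates` sequence that follows relies on the exception-safety idiom named in the comment above: the slot is published with a well-defined null value before state creation, so cleanup after a failed allocation never touches a half-initialized entry. A minimal stand-alone illustration of the idea, using a plain `std::map` and a toy `State` type rather than the real Arena/aggregate-function machinery:

// Toy model: a map from key to "aggregate state" pointer, where state creation may throw.
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

struct State { int value = 0; };

State * createStateMayThrow(bool fail)
{
    if (fail)
        throw std::runtime_error("allocation failed");
    return new State{};
}

int main()
{
    std::map<std::string, State *> table;

    try
    {
        auto & slot = table["key"];
        slot = nullptr;                       /// publish a well-defined value first
        slot = createStateMayThrow(/*fail*/ true);
    }
    catch (const std::exception & e)
    {
        std::cout << "creation failed: " << e.what() << '\n';
    }

    /// Cleanup can now safely skip entries whose state was never created.
    for (auto & [key, state] : table)
    {
        if (state)
            delete state;
        else
            std::cout << key << " has no state, nothing to destroy\n";
    }
}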
+ emplace_result.setMapped(nullptr); + + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + /// TODO: support use_compiled_functions + createAggregateStates(aggregate_data); + emplace_result.setMapped(aggregate_data); + } + else + { + aggregate_data = emplace_result.getMapped(); + + /// Save changed group with retracted state (used for emit changed group) + /// If there are aggregate data and no retracted data, copy aggregate data to retracted data before changed + if (!UpdatedDataEx::isEmpty(aggregate_data) && !RetractedDataEx::hasRetracted(aggregate_data)) + { + auto & retracted = RetractedDataEx::getRetracted(aggregate_data); + auto tmp_retracted = retracted_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(tmp_retracted); + retracted = tmp_retracted; + mergeAggregateStates(retracted, aggregate_data, retracted_pool, /*clear_states*/ false); + } + } + + assert(aggregate_data != nullptr); + places[i] = aggregate_data; + } + + /// Add values to the aggregate functions. + for (size_t i = 0; i < aggregate_functions.size(); ++i) + { + AggregateFunctionInstruction * inst = aggregate_instructions + i; + + if (inst->offsets) + inst->batch_that->addBatchArray(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); + else + inst->batch_that->addBatch(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool, -1, inst->delta_column); + + if (inst->batch_that->isUserDefined()) { AggregateDataPtr * places_ptr = places.get(); /// It is ok to re-flush if it is flush already, then we don't need maintain a map to check if it is ready flushed @@ -3984,6 +3811,9 @@ bool Aggregator::executeAndRetractImpl( } } + if (hasExpandedData()) + UpdatedDataEx::addBatch(row_begin, row_end, places.get(), aggregate_instructions ? aggregate_instructions->delta_column : nullptr); + return need_finalization; } @@ -3992,7 +3822,6 @@ std::pair Aggregator::executeAndRetractOnBlock( size_t row_begin, size_t row_end, AggregatedDataVariants & result, - AggregatedDataVariants & retracted_result, ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys) const @@ -4008,6 +3837,7 @@ std::pair Aggregator::executeAndRetractOnBlock( if (result.empty()) { initDataVariants(result, method_chosen, key_sizes, params); + initStatesForWithoutKeyOrOverflow(result); LOG_TRACE(log, "Aggregation method: {}", result.getMethodName()); } @@ -4020,324 +3850,389 @@ std::pair Aggregator::executeAndRetractOnBlock( prepareAggregateInstructions(columns, aggregate_columns, materialized_columns, aggregate_functions_instructions, nested_columns_holder); assert(!params.overflow_row && !no_more_keys); - - retracted_result.aggregator = this; + assert(expanded_data_type == ExpandedDataType::UpdatedWithRetracted); if (result.type == AggregatedDataVariants::Type::without_key) { /// Save last finalization state into `retracted_result` before processing new data. 
/// We shall clear and reset it after finalization - if (retracted_result.empty()) + if (!UpdatedDataEx::isEmpty(result.without_key) && !RetractedDataEx::hasRetracted(result.without_key)) { - initDataVariants(retracted_result, method_chosen, key_sizes, params); - - if (result.without_key) - { - initStatesForWithoutKeyOrOverflow(retracted_result); - mergeAggregateStates(retracted_result.without_key, result.without_key, retracted_result.aggregates_pool, false); - } + auto & retracted = RetractedDataEx::getRetracted(result.without_key); + auto tmp_retracted = result.retracted_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(tmp_retracted); + retracted = tmp_retracted; + mergeAggregateStates(retracted, result.without_key, result.retracted_pool.get(), /*clear_states*/ false); } - initStatesForWithoutKeyOrOverflow(result); - need_finalization = executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); + need_finalization = executeWithoutKeyImpl( + result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); } - else - { - if (retracted_result.empty()) - initDataVariants(retracted_result, method_chosen, key_sizes, params); - if (result.isTwoLevel() && !retracted_result.isTwoLevel()) - retracted_result.convertToTwoLevel(); - - #define M(NAME, IS_TWO_LEVEL) \ - else if (result.type == AggregatedDataVariants::Type::NAME) \ - need_finalization = executeAndRetractImpl(*result.NAME, result.aggregates_pool, *retracted_result.NAME, retracted_result.aggregates_pool, row_begin, row_end, key_columns, aggregate_functions_instructions.data()); +#define M(NAME, IS_TWO_LEVEL) \ + else if (result.type == AggregatedDataVariants::Type::NAME) \ + need_finalization = executeAndRetractImpl(*result.NAME, result.aggregates_pool, result.retracted_pool.get(), row_begin, row_end, key_columns, aggregate_functions_instructions.data()); - if (false) {} // NOLINT - APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) - #undef M - } + APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) +#undef M need_abort = checkAndProcessResult(result, no_more_keys); - /// it's possible for gloabl single level hash table was converted to two level table after `checkAndProcessResult`, - /// so we also convert retarcted data to two level. - if (result.isTwoLevel() && !retracted_result.isTwoLevel()) - retracted_result.convertToTwoLevel(); - return return_result; } -std::pair -Aggregator::mergeRetractedGroups(ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const +BlocksList Aggregator::convertUpdatedToBlocks(AggregatedDataVariants & data_variants) const { - auto prepared_data = prepareVariantsToMerge(aggregated_data, /*always_merge_into_empty*/ true); - if (prepared_data->empty()) - return {}; + LOG_DEBUG(log, "Converting updated aggregated data to blocks"); - auto first = prepared_data->at(0); + Stopwatch watch; - auto prepared_retracted_data = prepareVariantsToMerge(retracted_data, first->type != AggregatedDataVariants::Type::without_key); - assert(!prepared_retracted_data->empty()); + BlocksList blocks; - /// So far, only global aggregation support emit changelog, so time bucket two level is not possible + /// In what data structure is the data aggregated? + if (data_variants.empty()) + return blocks; -#define M(NAME, ...) 
\ - else if (first->type == AggregatedDataVariants::Type::NAME) \ - mergeRetractedGroupsImplNAME)::element_type>(*prepared_data, *prepared_retracted_data); + if (unlikely(params.overflow_row)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - if (first->type == AggregatedDataVariants::Type::without_key) - { - mergeWithoutKeyDataImpl(*prepared_retracted_data, true); - mergeWithoutKeyDataImpl(*prepared_data, false); - } - APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M) - APPLY_FOR_VARIANTS_STATIC_BUCKET_TWO_LEVEL(M) -#undef M + constexpr bool final = true; + constexpr bool clear_states = false; + if (data_variants.type == AggregatedDataVariants::Type::without_key) + blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, false, clear_states, AggregateStateType::OnlyUpdated)); + else if (!data_variants.isTwoLevel()) + blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states, AggregateStateType::OnlyUpdated)); else - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); + blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, /*max_threads*/ 1, AggregateStateType::OnlyUpdated)); - return {prepared_data->at(0), prepared_retracted_data->at(0)}; -} + size_t rows = 0; + size_t bytes = 0; -template -void Aggregator::mergeRetractedGroupsImpl( - ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const -{ - AggregatedDataVariantsPtr & res = aggregated_data[0]; - AggregatedDataVariantsPtr & retracted_res = retracted_data[0]; + for (const auto & block : blocks) + { + rows += block.rows(); + bytes += block.bytes(); + } - bool no_more_keys = false; + double elapsed_seconds = watch.elapsedSeconds(); + LOG_DEBUG(log, + "Converted updated aggregated data to blocks. {} rows, {} in {} sec. ({:.3f} rows/sec., {}/sec.)", + rows, ReadableSize(bytes), + elapsed_seconds, rows / elapsed_seconds, + ReadableSize(bytes / elapsed_seconds)); + + return blocks; +} - using Table = typename Method::Data; - Table & dst_table = getDataVariant(*res).data; - Table & dst_retracted_table = getDataVariant(*retracted_res).data; - /// First data variants always is empty. - assert(dst_table.empty() && dst_retracted_table.empty()); +template +void NO_INLINE Aggregator::mergeUpdatedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const +{ + AggregatedDataVariantsPtr & res = non_empty_data[0]; + auto & dst_table = getDataVariant(*res).data; + /// Always merge updated data into empty first. 
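`mergeUpdatedGroupsImpl`, whose body continues below, leans on the two-level table's per-bucket `updated` flags (via `forEachValueOfUpdatedBuckets`) so that only buckets touched since the last emit are scanned. A simplified, self-contained sketch of that bookkeeping; the `BucketedMap` class and its fixed bucket count are stand-ins, not the real `TwoLevelHashMap`:

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

/// Toy stand-in for a bucketed hash table that remembers which buckets were modified.
class BucketedMap
{
public:
    void insert(const std::string & key, int64_t value)
    {
        size_t bucket = std::hash<std::string>{}(key) % NUM_BUCKETS;
        impls[bucket][key] = value;
        updated[bucket] = true;              /// mark the bucket dirty on every write
    }

    /// Visit only the buckets that changed since the last emit, optionally clearing the flags.
    void forEachValueOfUpdatedBuckets(const std::function<void(const std::string &, int64_t)> & func, bool reset_updated = false)
    {
        for (size_t b = 0; b < NUM_BUCKETS; ++b)
        {
            if (!updated[b])
                continue;
            for (const auto & [k, v] : impls[b])
                func(k, v);
            if (reset_updated)
                updated[b] = false;
        }
    }

private:
    static constexpr size_t NUM_BUCKETS = 8;
    std::unordered_map<std::string, int64_t> impls[NUM_BUCKETS];
    bool updated[NUM_BUCKETS] = {};
};

int main()
{
    BucketedMap map;
    map.insert("device-1", 10);
    map.insert("device-2", 20);
    map.forEachValueOfUpdatedBuckets([](const auto & k, auto v) { std::cout << k << " = " << v << '\n'; }, /*reset_updated*/ true);
    map.forEachValueOfUpdatedBuckets([](const auto & k, auto v) { std::cout << "second pass: " << k << '\n'; }); /// prints nothing, flags were reset
}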
+    assert(dst_table.empty());
 
     /// For example:
     ///             thread-1        thread-2
-    /// group-1     changed         non-changed
-    /// group-2     non-changed     changed
-    /// group-3     non-changed     non-changed
-
-    /// Collect all changed groups, then merge retracted/updated data
-    /// 1) Collect changed groups:
-    /// `dst_retracted` <= (thread-1: group-1) + (thread-2: group-2)
-    for (size_t result_num = 1, size = retracted_data.size(); result_num < size; ++result_num)
+    /// group-1     updated         non-updated
+    /// group-2     non-updated     updated
+    /// group-3     non-updated     non-updated
+    ///
+    /// 1) Collect all updated groups
+    /// `dst` <= (group-1, group-2)
+    bool no_more_keys = false;
+    using Table = typename Method::Data;
+    for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num)
     {
-        if (!checkLimits(retracted_res->sizeWithoutOverflowRow(), no_more_keys))
+        if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys))
             break;
 
         assert(!no_more_keys);
 
-        auto & src_retracted_table = getDataVariant<Method>(*retracted_data[result_num]).data;
-        src_retracted_table.mergeToViaEmplace(dst_retracted_table, [&](AggregateDataPtr & __restrict dst, AggregateDataPtr & __restrict src, bool inserted) {
+        auto & src_table = getDataVariant<Method>(*non_empty_data[result_num]).data;
+        auto merge_updated_func = [&](const auto & key, auto & mapped) {
+            /// Skip groups that were not updated
+            if (!UpdatedDataEx::isUpdated(mapped))
+                return;
+
+            typename Table::LookupResult dst_it;
+            bool inserted;
+            /// For StringRef `key`, it is safe to store into `dst_table`
+            /// since the `dst_table` is temporary and the `src_table` will not be cleaned in the meantime
+            dst_table.emplace(key, dst_it, inserted);
             if (inserted)
-                dst = nullptr;
+            {
+                auto & dst = dst_it->getMapped();
+                dst = nullptr; /// exception-safety - if you can not allocate memory or create states, then destructors will not be called.
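The two passes sketched in the comments above (first collect the union of updated keys across threads, then merge every thread's partial state for exactly those keys and clear the flags) can be modelled with ordinary containers. This is a simplified illustration with made-up `Group`/`Table` types, not the real Method/Arena code:

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Group { long sum = 0; bool updated = false; };
using Table = std::map<std::string, Group>;

/// Pass 1: collect keys that any thread marked as updated; pass 2: merge all threads' values for those keys.
Table mergeUpdatedGroups(std::vector<Table> & per_thread)
{
    Table dst;
    for (auto & table : per_thread)
        for (auto & [key, group] : table)
            if (group.updated)
                dst[key];                       /// just reserve the key

    for (auto & table : per_thread)
        for (auto & [key, group] : dst)
        {
            auto it = table.find(key);
            if (it != table.end())
            {
                group.sum += it->second.sum;    /// merge this thread's partial state
                it->second.updated = false;     /// reset the flag once consumed
            }
        }
    return dst;
}

int main()
{
    std::vector<Table> threads(2);
    threads[0] = {{"group-1", {5, true}}, {"group-2", {7, false}}, {"group-3", {1, false}}};
    threads[1] = {{"group-1", {2, false}}, {"group-2", {3, true}}};

    for (const auto & [key, group] : mergeUpdatedGroups(threads))
        std::cout << key << " -> " << group.sum << '\n';   /// group-1 -> 7, group-2 -> 10; group-3 is skipped
}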
+ auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + dst = aggregate_data; + } + }; - mergeAggregateStates(dst, src, retracted_res->aggregates_pool, true); - }); + if constexpr (is_two_level) + src_table.forEachValueOfUpdatedBuckets(std::move(merge_updated_func), /*reset_updated*/ true); + else + src_table.forEachValue(std::move(merge_updated_func)); } - /// 2) Merge retracted groups non-changed thread parts (based on all changed groups) - /// `dst_retracted` <= (thread-1: group-2) + (thread-2: group-1) - for (size_t result_num = 1, size = retracted_data.size(); result_num < size; ++result_num) + /// 2) Merge all updated groups parts for each thread (based on `1)` ) + /// `dst` <= (thread-1: group-1 group-2) + (thread-2: group-1 group-2) + for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) { - if (!checkLimits(retracted_res->sizeWithoutOverflowRow(), no_more_keys)) - break; - - assert(!no_more_keys); - - auto & current_retracted = *retracted_data[result_num]; - Table & src_retracted_table = getDataVariant(current_retracted).data; - Table & src_aggregated_table = getDataVariant(*aggregated_data[result_num]).data; - dst_retracted_table.forEachValue([&](const auto & key, auto & mapped) { - /// Merge retracted groups non-changed thread parts - if (!src_retracted_table.find(key)) + auto & src_table = getDataVariant(*non_empty_data[result_num]).data; + dst_table.forEachValue([&](const auto & key, auto & mapped) { + if (auto find_it = src_table.find(key)) { - auto find_it = src_aggregated_table.find(key); - if (find_it) - mergeAggregateStates( - mapped, - find_it->getMapped(), - retracted_res->aggregates_pool, - /*clear_states*/ false); - }}); - - /// Reset retracted data after finalization - clearDataVariants(current_retracted); + mergeAggregateStates(mapped, find_it->getMapped(), arena, /*clear_states*/ false); + /// NOTE: We always reset the updated flag after merged + UpdatedDataEx::resetUpdated(find_it->getMapped()); + } + }); } +} - /// 3) Merge new/updated groups (based on all changed groups) - /// `dst` <= (thread-1: group-1 group-2) + (thread-2: group-1 group-2) - for (size_t result_num = 1, size = aggregated_data.size(); result_num < size; ++result_num) - { - if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys)) - break; +AggregatedDataVariantsPtr Aggregator::mergeUpdatedGroups(ManyAggregatedDataVariants & data_variants) const +{ + auto prepared_data_ptr = prepareVariantsToMerge(data_variants, /*always_merge_into_empty*/ true); + if (prepared_data_ptr->empty()) + return {}; - assert(!no_more_keys); - Table & src_aggregated_table = getDataVariant(*aggregated_data[result_num]).data; - dst_retracted_table.forEachValue([&](const auto & key, auto & mapped) { - /// Merge new/updated groups - typename Table::LookupResult dst_it; - bool inserted; + if (unlikely(params.overflow_row)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - /// NOTE: For StringRef `key`, its memory was allocated in `retracted_res->aggregates_pool`, - /// we shall save this key in itself pool (i.e. 
res->aggregates_pool) if inserted - using KeyType = std::decay_t; - if constexpr (std::is_same_v) - dst_table.emplace(ArenaKeyHolder{key, *res->aggregates_pool}, dst_it, inserted); - else - dst_table.emplace(key, dst_it, inserted); + BlocksList blocks; + auto & first = *prepared_data_ptr->at(0); + if (first.type == AggregatedDataVariants::Type::without_key) + { + if (std::ranges::none_of(*prepared_data_ptr, [](auto & variants) { + return variants->without_key && UpdatedDataEx::isUpdated(variants->without_key); + })) + return {}; - if (inserted) - dst_it->getMapped() = nullptr; - - auto find_it = src_aggregated_table.find(key); - if (find_it) - mergeAggregateStates( - dst_it->getMapped(), - find_it->getMapped(), - res->aggregates_pool, - /*clear_states*/ false); - }); + mergeWithoutKeyDataImpl(*prepared_data_ptr, /*clear_states*/ false); } + +#define M(NAME, IS_TWO_LEVEL) \ + else if (first.type == AggregatedDataVariants::Type::NAME) \ + mergeUpdatedGroupsImpl(*prepared_data_ptr, first.aggregates_pool); + + APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) +#undef M + else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); + + return prepared_data_ptr->at(0); } -void Aggregator::mergeAggregateStates(AggregateDataPtr & dst, AggregateDataPtr & src, Arena * arena, bool clear_states) const +BlocksList Aggregator::convertRetractedToBlocks(AggregatedDataVariants & data_variants) const { - if (!src) - return; + LOG_DEBUG(log, "Converting retracted aggregated data to blocks"); + + Stopwatch watch; + + BlocksList blocks; + + /// In what data structure is the data aggregated? + if (data_variants.empty()) + return blocks; + + if (unlikely(params.overflow_row)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); + + constexpr bool final = true; + constexpr bool clear_states = true; + if (data_variants.type == AggregatedDataVariants::Type::without_key) + blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, false, clear_states, AggregateStateType::OnlyRetracted)); + else if (!data_variants.isTwoLevel()) + blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states, AggregateStateType::OnlyRetracted)); + else + blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, /*max_threads*/ 1, AggregateStateType::OnlyRetracted)); + + size_t rows = 0; + size_t bytes = 0; - if (!dst) + for (const auto & block : blocks) { - dst = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(dst); + rows += block.rows(); + bytes += block.bytes(); } - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->merge(dst + offsets_of_aggregate_states[i], src + offsets_of_aggregate_states[i], arena); + double elapsed_seconds = watch.elapsedSeconds(); + LOG_DEBUG(log, + "Converted retracted aggregated data to blocks. {} rows, {} in {} sec. 
({:.3f} rows/sec., {}/sec.)",
+        rows, ReadableSize(bytes),
+        elapsed_seconds, rows / elapsed_seconds,
+        ReadableSize(bytes / elapsed_seconds));
 
-    if (clear_states)
-        destroyAggregateStates(src);
+    return blocks;
 }
 
-void Aggregator::destroyAggregateStates(AggregateDataPtr & place) const
+template <typename Method>
+void Aggregator::mergeRetractedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const
 {
-    if (place)
+    AggregatedDataVariantsPtr & res = non_empty_data[0];
+    auto & dst_table = getDataVariant<Method>(*res).data;
+    /// Always merge retracted data into empty first.
+    assert(dst_table.empty());
+
+    /// For example:
+    ///             thread-1        thread-2
+    /// group-1     retracted       non-retracted
+    /// group-2     non-retracted   retracted
+    /// group-3     non-retracted   non-retracted
+    ///
+    /// 1) Collect all retracted groups
+    /// `dst` <= (group-1, group-2)
+    bool no_more_keys = false;
+    using Table = typename Method::Data;
+    for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num)
     {
-        for (size_t i = 0; i < params.aggregates_size; ++i)
-            aggregate_functions[i]->destroy(place + offsets_of_aggregate_states[i]);
+        if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys))
+            break;
 
-        place = nullptr;
-    }
-}
+        assert(!no_more_keys);
 
-void Aggregator::serializeAggregateStates(const AggregateDataPtr & place, WriteBuffer & wb) const
-{
-    UInt8 has_states = place ? 1 : 0;
-    writeIntBinary(has_states, wb);
-    if (has_states)
-    {
-        for (size_t i = 0; i < params.aggregates_size; ++i)
-            aggregate_functions[i]->serialize(place + offsets_of_aggregate_states[i], wb);
+        auto & src_table = getDataVariant<Method>(*non_empty_data[result_num]).data;
+        src_table.forEachValue([&](const auto & key, auto & mapped) {
+            /// Skip groups that have no retracted state
+            if (!RetractedDataEx::hasRetracted(mapped))
+                return;
+
+            typename Table::LookupResult dst_it;
+            bool inserted;
+            /// For StringRef `key`, it is safe to store into `dst_table`
+            /// since the `dst_table` is temporary and the `src_table` will not be cleaned in the meantime
+            dst_table.emplace(key, dst_it, inserted);
+            if (inserted)
+            {
+                auto & dst = dst_it->getMapped();
+                dst = nullptr; /// exception-safety - if you can not allocate memory or create states, then destructors will not be called.
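The retracted state gathered here is the value a group had at the previous emit; downstream, it is paired with the freshly updated value to form the changelog (old value with delta -1, new value with delta +1). A toy end-to-end illustration of that snapshot-then-emit flow, using plain standard-library types rather than the real block/chunk machinery:

#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <tuple>
#include <vector>

struct Group
{
    long sum = 0;
    std::optional<long> retracted;   /// value at the previous emit, captured before applying new rows
};

int main()
{
    std::map<std::string, Group> groups = {{"device-1", {10, {}}}};

    /// New rows arrive for device-1: snapshot the old value once, then update in place.
    auto & group = groups["device-1"];
    if (!group.retracted)
        group.retracted = group.sum;
    group.sum += 5;

    /// Emit changelog: old value with delta -1, new value with delta +1.
    std::vector<std::tuple<std::string, long, int>> changelog;
    for (auto & [key, g] : groups)
    {
        if (!g.retracted)
            continue;                      /// unchanged groups are not emitted
        changelog.emplace_back(key, *g.retracted, -1);
        changelog.emplace_back(key, g.sum, +1);
        g.retracted.reset();               /// the snapshot is discarded after the emit
    }

    for (const auto & [key, value, delta] : changelog)
        std::cout << key << " sum=" << value << " delta=" << delta << '\n';
}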
+ auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + dst = aggregate_data; + } + }); } -} -void Aggregator::deserializeAggregateStates(AggregateDataPtr & place, ReadBuffer & rb, Arena * arena) const -{ - UInt8 has_states; - readIntBinary(has_states, rb); - if (has_states) + /// 2) Merge all retracted groups parts for each thread (based on `1)` ) + /// `dst` <= (thread-1: group-1 group-2) + (thread-2: group-1 group-2) + for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) { - if (!place) - { - /// Allocate states for all aggregate functions - AggregateDataPtr aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); - place = aggregate_data; - } + auto & current = *non_empty_data[result_num]; + auto & src_table = getDataVariant(current).data; + dst_table.forEachValue([&](const auto & key, auto & mapped) { + if (auto find_it = src_table.find(key)) + { + auto & src_mapped = find_it->getMapped(); + if (RetractedDataEx::hasRetracted(src_mapped)) + mergeAggregateStates(mapped, RetractedDataEx::getRetracted(src_mapped), arena, /*clear_states*/ true); + else + /// If retracted data not exist, assume it does't be changed, we should use original data + mergeAggregateStates(mapped, src_mapped, arena, /*clear_states*/ false); + } + }); - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->deserialize(place + offsets_of_aggregate_states[i], rb, std::nullopt, arena); + current.resetRetractedPool(); } } -void Aggregator::clearDataVariants(AggregatedDataVariants & data_variants) const +AggregatedDataVariantsPtr Aggregator::mergeRetractedGroups(ManyAggregatedDataVariants & data_variants) const { - /// Clear states - destroyAllAggregateStates(data_variants); + auto prepared_data_ptr = prepareVariantsToMerge(data_variants, /*always_merge_into_empty*/ true); + if (prepared_data_ptr->empty()) + return {}; - /// Clear hash map - switch (data_variants.type) + if (unlikely(params.overflow_row)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); + + auto & first = *prepared_data_ptr->at(0); + if (first.type == AggregatedDataVariants::Type::without_key) { - case AggregatedDataVariants::Type::EMPTY: break; - case AggregatedDataVariants::Type::without_key: break; + if (std::ranges::none_of(*prepared_data_ptr, [](auto & variants) { return RetractedDataEx::hasRetracted(variants->without_key); })) + return {}; /// Skip if no retracted - #define M(NAME, IS_TWO_LEVEL) \ - case AggregatedDataVariants::Type::NAME: data_variants.NAME.reset(); break; - APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) - #undef M + for (size_t result_num = 1, size = prepared_data_ptr->size(); result_num < size; ++result_num) + { + auto & src_without_key = (*prepared_data_ptr)[result_num]->without_key; + if (RetractedDataEx::hasRetracted(src_without_key)) + mergeAggregateStates(first.without_key, RetractedDataEx::getRetracted(src_without_key), first.aggregates_pool, /*clear_states*/ true); + else + /// If retracted data not exist, assume it does't be changed, we should use original data + mergeAggregateStates(first.without_key, src_without_key, first.aggregates_pool, /*clear_states*/ false); + } } - data_variants.invalidate(); - /// Reset pool - data_variants.aggregates_pools = Arenas(1, std::make_shared()); - data_variants.aggregates_pool = 
data_variants.aggregates_pools.back().get(); +#define M(NAME) \ + else if (first.type == AggregatedDataVariants::Type::NAME) \ + mergeRetractedGroupsImpl(*prepared_data_ptr, first.aggregates_pool); + + APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M) + APPLY_FOR_VARIANTS_ALL_TWO_LEVEL(M) +#undef M + else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); + + return prepared_data_ptr->at(0); } -bool Aggregator::checkAndProcessResult(AggregatedDataVariants & result, bool & no_more_keys) const +template +void Aggregator::mergeRetractedIntoImpl(Method & method, Method & retracted_method, Arena * arena) const { - size_t result_size = result.sizeWithoutOverflowRow(); - Int64 current_memory_usage = 0; - if (auto * memory_tracker_child = CurrentThread::getMemoryTracker()) - if (auto * memory_tracker = memory_tracker_child->getParent()) - current_memory_usage = memory_tracker->get(); + using Table = typename Method::Data; + Table & table = method.data; + Table & retracted_table = retracted_method.data; - /// Here all the results in the sum are taken into account, from different threads. - Int64 result_size_bytes = current_memory_usage - memory_usage_before_aggregation; + retracted_table.forEachValue([&](const auto & key, auto & retracted_mapped) { - bool worth_convert_to_two_level = worthConvertToTwoLevel( - params.group_by_two_level_threshold, result_size, params.group_by_two_level_threshold_bytes, result_size_bytes); + auto find_it = table.find(key); + assert(find_it); - /** Converting to a two-level data structure. - * It allows you to make, in the subsequent, an effective merge - either economical from memory or parallel. - */ - if (result.isConvertibleToTwoLevel() && worth_convert_to_two_level) - result.convertToTwoLevel(); + auto & mapped = find_it->getMapped(); + assert(!RetractedDataEx::hasRetracted(mapped)); + UpdatedDataEx::setUpdated(mapped); - /// Checking the constraints. - if (!checkLimits(result_size, no_more_keys)) - return true; - - /** Flush data to disk if too much RAM is consumed. - * Data can only be flushed to disk if a two-level aggregation structure is used. - */ - if (params.max_bytes_before_external_group_by - && result.isTwoLevel() - && current_memory_usage > static_cast(params.max_bytes_before_external_group_by) - && worth_convert_to_two_level) - { - size_t size = current_memory_usage + params.min_free_disk_space; + /// For old impl, no retracted data for new group + if (!retracted_mapped) + return; - std::string tmp_path = params.tmp_volume->getDisk()->getPath(); + auto & retracted = RetractedDataEx::getRetracted(mapped); + auto tmp_retracted = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(tmp_retracted); + retracted = tmp_retracted; + mergeAggregateStates(retracted, retracted_mapped, arena, /*clear_states*/ true); + }); +} - // enoughSpaceInDirectory() is not enough to make it right, since - // another process (or another thread of aggregator) can consume all - // space. - // - // But true reservation (IVolume::reserve()) cannot be used here since - // current_memory_usage does not take compression into account and - // will reserve way more that actually will be used. - // - // Hence, let's do a simple check. 
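Both the removed code above and the relocated `checkAndProcessResult` guard the external-aggregation spill with a best-effort free-space probe, for exactly the reasons spelled out in the comment. A self-contained sketch of such a check using `std::filesystem`; `enoughSpaceForSpill` is a hypothetical helper, not the project's `enoughSpaceInDirectory`:

#include <cstdint>
#include <filesystem>
#include <iostream>

/// Returns true if `path` appears to have at least `required_bytes` available.
/// This mirrors the "simple check" idea: it is only a snapshot and can still race
/// with other writers, which is exactly the caveat spelled out in the comment above.
bool enoughSpaceForSpill(const std::filesystem::path & path, std::uintmax_t required_bytes)
{
    std::error_code ec;
    auto info = std::filesystem::space(path, ec);
    if (ec)
        return false;   /// be conservative if the filesystem cannot be queried
    return info.available >= required_bytes;
}

int main()
{
    const std::filesystem::path tmp_path = "/tmp";            /// stand-in for the configured tmp volume
    const std::uintmax_t required = 512ull * 1024 * 1024;     /// e.g. current memory usage + minimum free disk space

    if (enoughSpaceForSpill(tmp_path, required))
        std::cout << "enough space, external aggregation could spill to " << tmp_path << '\n';
    else
        std::cout << "not enough space for external aggregation in " << tmp_path << '\n';
}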
- if (!enoughSpaceInDirectory(tmp_path, size)) - throw Exception("Not enough space for external aggregation in " + tmp_path, ErrorCodes::NOT_ENOUGH_SPACE); +void Aggregator::mergeRetractedInto(AggregatedDataVariants & result, AggregatedDataVariants && retracted_result) const +{ + assert(expanded_data_type == ExpandedDataType::UpdatedWithRetracted); + if (result.type != retracted_result.type) [[unlikely]] + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Don't merge retracted aggregation result, the current data variants type is {}, but retracted data variants type is {}", + magic_enum::enum_name(result.type), + magic_enum::enum_name(retracted_result.type)); - writeToTemporaryFile(result, tmp_path); + Arena * arena = result.retracted_pool.get(); + if (result.type == AggregatedDataVariants::Type::without_key) + { + if (retracted_result.without_key) + { + assert(!RetractedDataEx::hasRetracted(result.without_key)); + auto & retracted = RetractedDataEx::getRetracted(result.without_key); + auto tmp_retracted = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(tmp_retracted); + retracted = tmp_retracted; + mergeAggregateStates(retracted, retracted_result.without_key, arena, /*clear_states*/ true); + } } - return false; +#define M(NAME, IS_TWO_LEVEL) \ + else if (result.type == AggregatedDataVariants::Type::NAME) \ + mergeRetractedIntoImpl(*result.NAME, *retracted_result.NAME, arena); + + APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) +#undef M + + retracted_result.reset(); } void Aggregator::updateMetrics(const AggregatedDataVariants & variants, AggregatedDataMetrics & metrics) const diff --git a/src/Interpreters/Streaming/Aggregator.h b/src/Interpreters/Streaming/Aggregator.h index e90535dea75..945260170a0 100644 --- a/src/Interpreters/Streaming/Aggregator.h +++ b/src/Interpreters/Streaming/Aggregator.h @@ -37,11 +37,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include /// proton: ends @@ -74,15 +76,11 @@ namespace Streaming * best suited for different cases, and this approach is just one of them, chosen for a combination of reasons. */ -enum class ConvertAction : uint8_t +enum class AggregateStateType { - Unkonwn = 0, - DistributedMerge, - WriteToTmpFS, - Checkpoint, - StreamingEmit, - InternalMerge, - RetractedEmit + Normal, + OnlyUpdated, + OnlyRetracted, }; /// using TimeBucketAggregatedDataWithUInt16Key = TimeBucketHashMap>; @@ -103,7 +101,8 @@ using TimeBucketAggregatedDataWithKeys256TwoLevel = TimeBucketHashMap retracted_pool; /// Use an independent pool to manage retracted data, which will be cleared after each finalization /** Specialization for the case when there are no keys, and for keys not fitted into max_rows_to_group_by. */ @@ -371,6 +371,17 @@ struct AggregatedDataVariants : private boost::noncopyable /// proton: ends; } + void reset(); + + void resetAggregatesPool() + { + aggregates_pools = Arenas(1, std::make_shared()); + aggregates_pool = aggregates_pools.back().get(); + aggregates_pool->enableRecycle(true); + } + + void resetRetractedPool() { retracted_pool = std::make_unique(); } + /// Number of rows (different keys). 
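The `retracted_pool` introduced above keeps pre-change states in their own arena so they can be dropped wholesale by `resetRetractedPool()` after every changelog emit, instead of accumulating in the main aggregates pool. A toy illustration of that lifetime split; the vector-backed `Pool` is a stand-in for the real `Arena`:

#include <iostream>
#include <memory>
#include <string>
#include <vector>

/// Toy arena: owns a batch of allocations that all disappear together when the pool is reset.
struct Pool
{
    std::vector<std::unique_ptr<std::string>> allocations;

    std::string * alloc(std::string value)
    {
        allocations.push_back(std::make_unique<std::string>(std::move(value)));
        return allocations.back().get();
    }
};

struct Variants
{
    Pool aggregates_pool;                       /// long-lived states
    std::unique_ptr<Pool> retracted_pool = std::make_unique<Pool>();

    void resetRetractedPool() { retracted_pool = std::make_unique<Pool>(); }
};

int main()
{
    Variants variants;
    variants.aggregates_pool.alloc("current state of group-1");
    variants.retracted_pool->alloc("previous state of group-1");   /// only needed until the next emit

    std::cout << "before emit: " << variants.retracted_pool->allocations.size() << " retracted allocation(s)\n";

    /// After the changelog has been emitted, the pre-change states are no longer needed.
    variants.resetRetractedPool();

    std::cout << "after emit: " << variants.retracted_pool->allocations.size() << " retracted allocation(s), "
              << variants.aggregates_pool.allocations.size() << " live state(s)\n";
}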
size_t size() const { @@ -558,12 +569,17 @@ struct AggregatedDataVariants : private boost::noncopyable throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); } } + + void serialize(WriteBuffer & wb, const Aggregator & aggregator_) const; + void deserialize(ReadBuffer & rb, const Aggregator & aggregator_); }; using AggregatedDataVariantsPtr = std::shared_ptr; using ManyAggregatedDataVariants = std::vector; using ManyAggregatedDataVariantsPtr = std::shared_ptr; +struct OutputBlockColumns; + /** How are "total" values calculated with WITH TOTALS? * (For more details, see TotalsHavingTransform.) * @@ -650,6 +666,9 @@ class Aggregator final size_t window_keys_num; WindowParamsPtr window_params; + + bool tracking_changes = false; + bool tracking_updated = false; /// proton: ends /// proton: starts @@ -670,7 +689,9 @@ class Aggregator final GroupBy streaming_group_by_ = GroupBy::OTHER, ssize_t delta_col_pos_ = -1, size_t window_keys_num_ = 0, - WindowParamsPtr window_params_ = nullptr) + WindowParamsPtr window_params_ = nullptr, + bool tracking_changes_ = false, + bool tracking_updated_ = false) : src_header(src_header_), intermediate_header(intermediate_header_), keys(keys_), aggregates(aggregates_), keys_size(keys.size()), aggregates_size(aggregates.size()), @@ -687,7 +708,9 @@ class Aggregator final group_by(streaming_group_by_), delta_col_pos(delta_col_pos_), window_keys_num(window_keys_num_), - window_params(window_params_) + window_params(window_params_), + tracking_changes(tracking_changes_), + tracking_updated(tracking_updated_) { } /// proton: ends @@ -757,7 +780,6 @@ class Aggregator final size_t row_begin, size_t row_end, AggregatedDataVariants & result, - AggregatedDataVariants & retracted_result, ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, /// Passed to not create them anew for each block bool & no_more_keys) const; @@ -787,25 +809,39 @@ class Aggregator final * SELECT avg(i) AS i, sum(k) AS k FROM my_stream GROUP BY device_id <-- first level global aggr, don't prune states * ); */ - BlocksList convertToBlocks(AggregatedDataVariants & data_variants, bool final, ConvertAction action, size_t max_threads) const; - BlocksList mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, ConvertAction action, size_t max_threads) const; + BlocksList convertToBlocks(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const; + BlocksList mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const; - Block convertOneBucketToBlock(AggregatedDataVariants & data_variants, bool final, ConvertAction action, size_t bucket) const; - Block mergeAndConvertOneBucketToBlock(ManyAggregatedDataVariants & variants, bool final, ConvertAction action, size_t bucket) const; - - /// Used by hop window function, merge multiple gcd windows (buckets) to a hop window + /// For Tumble/Session window function, there is only one bucket + /// For Hop window function, merge multiple gcd windows (buckets) to a hop window /// For examples: /// gcd_bucket1 - [00:00, 00:02) /// => result block - [00:00, 00:04) /// gcd_bucket2 - [00:02, 00:04) Block spliceAndConvertBucketsToBlock( - AggregatedDataVariants & variants, bool final, ConvertAction action, const std::vector & gcd_buckets) const; + AggregatedDataVariants & variants, bool final, bool clear_states, const std::vector & gcd_buckets) const; Block mergeAndSpliceAndConvertBucketsToBlock( 
-        ManyAggregatedDataVariants & variants, bool final, ConvertAction action, const std::vector<Int64> & gcd_buckets) const;
+        ManyAggregatedDataVariants & variants, bool final, bool clear_states, const std::vector<Int64> & gcd_buckets) const;
+
+    /// Convert the `updated data` (different from the `normal data`)
+    BlocksList convertUpdatedToBlocks(AggregatedDataVariants & data_variants) const;
+
+    /// \return: merged updated data if it exists
+    /// NOTE: The merged data is `normal data` and should be converted with `convertToBlocks`
+    AggregatedDataVariantsPtr mergeUpdatedGroups(ManyAggregatedDataVariants & data_variants) const;
 
-    /// Used for merge changed groups and return the of changed groups
-    std::pair
-    mergeRetractedGroups(ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const;
+    /// Convert the `retracted data` (different from the `normal data`)
+    BlocksList convertRetractedToBlocks(AggregatedDataVariants & data_variants) const;
+
+    /// \return: merged retracted data if it exists
+    /// NOTE: The merged data is `normal data` and should be converted with `convertToBlocks`
+    AggregatedDataVariantsPtr mergeRetractedGroups(ManyAggregatedDataVariants & data_variants) const;
+
+    /// Used to merge legacy retracted data into the result
+    void mergeRetractedInto(AggregatedDataVariants & result, AggregatedDataVariants && retracted_result) const;
+
+    bool hasExpandedData() const { return expanded_data_type != ExpandedDataType::None; }
+    ExpandedDataType expandedDataType() const { return expanded_data_type; }
 
     std::vector<Int64> bucketsBefore(const AggregatedDataVariants & result, Int64 max_bucket) const;
     void removeBucketsBefore(AggregatedDataVariants & result, Int64 max_bucket) const;
@@ -821,7 +857,7 @@ class Aggregator final
     /// Precondition: for all blocks block.info.is_overflows flag must be the same.
     /// (either all blocks are from overflow data or none blocks are).
     /// The resulting block has the same value of is_overflows flag.
-    Block mergeBlocks(BlocksList & blocks, bool final, ConvertAction action);
+    Block mergeBlocks(BlocksList & blocks, bool final, bool clear_states, bool only_updated);
 
     /** Split block with partially-aggregated data to many blocks, as if two-level method of aggregation was used.
       * This is needed to simplify merging of that data with other results, that are already two-level.
@@ -904,6 +940,8 @@ class Aggregator final
 
     bool all_aggregates_has_trivial_destructor = false;
 
+    ExpandedDataType expanded_data_type = ExpandedDataType::None;
+
     /// How many RAM were used to process the query before processing the first block.
     Int64 memory_usage_before_aggregation = 0;
 
@@ -933,7 +971,7 @@ class Aggregator final
 
     /** Create states of aggregate functions for one key.
       */
-    template
+    template
     void createAggregateStates(AggregateDataPtr & aggregate_data) const;
 
     /** Call `destroy` methods for states of aggregate functions.
@@ -982,6 +1020,7 @@ class Aggregator final
         AggregateFunctionInstruction * aggregate_instructions,
         Arena * arena) const;
 
+#if 0 /// Unused for now
     static void executeOnIntervalWithoutKeyImpl(
         AggregatedDataWithoutKey & res,
         size_t row_begin,
@@ -989,6 +1028,7 @@ class Aggregator final
         AggregateFunctionInstruction * aggregate_instructions,
         Arena * arena,
         const IColumn * delta_col);
+#endif
 
     template
     void writeToTemporaryFileImpl(
@@ -1014,68 +1054,32 @@ class Aggregator final
         bool clear_states,
         KeyHandler && key_handler = nullptr) const;
 
-    /// Merge data from hash table `src` into `dst`, but only for keys that already exist in dst. 
In other cases, merge the data into `overflows`. - template - void mergeDataNoMoreKeysImpl( - Table & table_dst, - AggregatedDataWithoutKey & overflows, - Table & table_src, - Arena * arena, - bool clear_states) const; - - /// Same, but ignores the rest of the keys. - template - void mergeDataOnlyExistingKeysImpl( - Table & table_dst, - Table & table_src, - Arena * arena, - bool clear_states) const; - void mergeWithoutKeyDataImpl(ManyAggregatedDataVariants & non_empty_data, bool clear_states) const; template void mergeSingleLevelDataImpl(ManyAggregatedDataVariants & non_empty_data, bool clear_states) const; template - void convertToBlockImpl( - Method & method, - Table & data, - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - Arena * arena, - bool final, - bool clear_states) const; + Block convertToBlockImpl( + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows, bool clear_states, AggregateStateType type) const; template void insertAggregatesIntoColumns( Mapped & mapped, MutableColumns & final_aggregate_columns, - Arena * arena) const; - - template - void convertToBlockImplFinal( - Method & method, - Table & data, - std::vector key_columns, - MutableColumns & final_aggregate_columns, Arena * arena, bool clear_states) const; - template - void convertToBlockImplNotFinal( - Method & method, - Table & data, - std::vector key_columns, - AggregateColumnsData & aggregate_columns) const; + template + Block insertResultsIntoColumns( + PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena, bool clear_states) const; - template - Block prepareBlockAndFill( - AggregatedDataVariants & data_variants, - bool final, - bool clear_states, - size_t rows, - Filler && filler) const; + template + Block convertToBlockImplFinal( + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows, bool clear_states) const; + + template + Block convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t rows) const; template Block convertOneBucketToBlockImpl( @@ -1084,65 +1088,60 @@ class Aggregator final Arena * arena, bool final, bool clear_states, - size_t bucket) const; + Int64 bucket, + AggregateStateType type = AggregateStateType::Normal) const; /// proton: starts. 
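The `AggregateStateType` argument threaded through `convertToBlockImpl` and the other conversion helpers above selects which per-group value ends up in the produced block: `Normal` emits everything, `OnlyUpdated` only groups flagged as updated, and `OnlyRetracted` only the saved pre-change states (the changelog emit pairs the latter two as -1/+1 rows). A compact stand-alone model of that filtering with simplified types, not the real column code:

#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

enum class AggregateStateType { Normal, OnlyUpdated, OnlyRetracted };

struct Group
{
    long current = 0;
    bool updated = false;
    std::optional<long> retracted;   /// pre-change value, present only for changed groups
};

/// Select which value (if any) each group contributes to the output block.
std::vector<std::pair<std::string, long>> convertToRows(const std::map<std::string, Group> & data, AggregateStateType type)
{
    std::vector<std::pair<std::string, long>> rows;
    for (const auto & [key, group] : data)
    {
        if (type == AggregateStateType::Normal)
            rows.emplace_back(key, group.current);
        else if (type == AggregateStateType::OnlyUpdated && group.updated)
            rows.emplace_back(key, group.current);
        else if (type == AggregateStateType::OnlyRetracted && group.retracted)
            rows.emplace_back(key, *group.retracted);
    }
    return rows;
}

int main()
{
    std::map<std::string, Group> data = {
        {"group-1", {10, true, 7}},      /// changed in this round: old value 7, new value 10
        {"group-2", {5, false, {}}},     /// untouched
    };

    for (auto type : {AggregateStateType::Normal, AggregateStateType::OnlyUpdated, AggregateStateType::OnlyRetracted})
        for (const auto & [key, value] : convertToRows(data, type))
            std::cout << static_cast<int>(type) << ": " << key << " = " << value << '\n';
}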
template void spliceBucketsImpl( AggregatedDataVariants & data_dest, AggregatedDataVariants & data_src, - bool final, - bool clear_states, const std::vector & gcd_buckets, - Arena * arena) const; + Arena * arena, + bool clear_states) const; template BlocksList mergeAndConvertTwoLevelToBlocksImpl( - ManyAggregatedDataVariants & non_empty_data, bool final, size_t max_threads, bool clear_states) const; + ManyAggregatedDataVariants & non_empty_data, bool final, bool clear_states, ThreadPool * thread_pool) const; - Block mergeAndConvertWithoutKeyToBlock(ManyAggregatedDataVariants & non_empty_data, bool final, bool clear_states) const; - Block mergeAndConvertSingleLevelToBlock(ManyAggregatedDataVariants & non_empty_data, bool final, bool clear_states) const; - BlocksList - mergeAndConvertTwoLevelToBlocks(ManyAggregatedDataVariants & non_empty_data, bool final, size_t max_threads, bool clear_states) const; + void mergeAggregateStates(AggregateDataPtr & dst, AggregateDataPtr & src, Arena * arena, bool clear_states) const; + + void destroyAggregateStates(AggregateDataPtr & place) const; + + void serializeAggregateStates(const AggregateDataPtr & place, WriteBuffer & wb) const; + void deserializeAggregateStates(AggregateDataPtr & place, ReadBuffer & rb, Arena * arena) const; + + void clearDataVariants(AggregatedDataVariants & data_variants) const; + + /// @return does need abort ? + bool checkAndProcessResult(AggregatedDataVariants & result, bool & no_more_keys) const; template bool executeAndRetractImpl( Method & method, Arena * aggregates_pool, - Method & retracted_method, Arena * retracted_pool, size_t row_begin, size_t row_end, ColumnRawPtrs & key_columns, AggregateFunctionInstruction * aggregate_instructions) const; + template + void mergeUpdatedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const; template - void mergeRetractedGroupsImpl(ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const; + void mergeRetractedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const; - void mergeAggregateStates(AggregateDataPtr & dst, AggregateDataPtr & src, Arena * arena, bool clear_states) const; - - void destroyAggregateStates(AggregateDataPtr & place) const; - - void serializeAggregateStates(const AggregateDataPtr & place, WriteBuffer & wb) const; - void deserializeAggregateStates(AggregateDataPtr & place, ReadBuffer & rb, Arena * arena) const; - - void clearDataVariants(AggregatedDataVariants & data_variants) const; - - /// @return does need abort ? - bool checkAndProcessResult(AggregatedDataVariants & result, bool & no_more_keys) const; + template + void mergeRetractedIntoImpl(Method & method, Method & retracted_method, Arena * arena) const; /// proton: ends. 
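`serializeAggregateStates` / `deserializeAggregateStates`, declared above, and the retracted-state checkpointing both use the same presence-flag framing: a boolean first, then the payload only if the state exists. A minimal sketch of that framing over `std::stringstream` stand-ins instead of the real `WriteBuffer`/`ReadBuffer`:

#include <iostream>
#include <optional>
#include <sstream>
#include <string>

/// Write a presence flag first, then the payload only if the state exists.
void writeOptionalState(std::ostream & wb, const std::optional<long> & state)
{
    wb << state.has_value() << ' ';
    if (state)
        wb << *state << ' ';
}

std::optional<long> readOptionalState(std::istream & rb)
{
    bool has_state = false;
    rb >> has_state;
    if (!has_state)
        return std::nullopt;
    long value = 0;
    rb >> value;
    return value;
}

int main()
{
    std::stringstream buf;
    writeOptionalState(buf, 42);              /// group with a state
    writeOptionalState(buf, std::nullopt);    /// group without one (e.g. no retracted copy yet)

    auto first = readOptionalState(buf);
    auto second = readOptionalState(buf);
    std::cout << "first: " << (first ? std::to_string(*first) : "none")
              << ", second: " << (second ? std::to_string(*second) : "none") << '\n';
}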
- Block prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows, bool clear_states) const; - Block prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states) const; - BlocksList prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, size_t max_threads, bool clear_states) const; + Block prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows, bool clear_states, AggregateStateType type = AggregateStateType::Normal) const; + Block prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, AggregateStateType type = AggregateStateType::Normal) const; + BlocksList prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads, AggregateStateType type = AggregateStateType::Normal) const; template BlocksList prepareBlocksAndFillTwoLevelImpl( - AggregatedDataVariants & data_variants, - Method & method, - bool final, - bool clear_states, - ThreadPool * thread_pool) const; + AggregatedDataVariants & data_variants, Method & method, bool final, bool clear_states, ThreadPool * thread_pool, AggregateStateType type) const; template void mergeStreamsImplCase( @@ -1167,7 +1166,7 @@ class Aggregator final template void mergeBucketImpl( - ManyAggregatedDataVariants & data, bool final, bool clear_states, Int64 bucket, Arena * arena, std::atomic * is_cancelled = nullptr) const; + ManyAggregatedDataVariants & data, Int64 bucket, Arena * arena, bool clear_states, std::atomic * is_cancelled = nullptr) const; template void convertBlockToTwoLevelImpl( @@ -1207,30 +1206,31 @@ class Aggregator final const AggregatedDataVariants & data_variants, MutableColumns & aggregate_columns) const; - void createStatesAndFillKeyColumnsWithSingleKey( - AggregatedDataVariants & data_variants, - Columns & key_columns, size_t key_row, - MutableColumns & final_key_columns) const; - /// proton: starts void setupAggregatesPoolTimestamps(size_t row_begin, size_t row_end, const ColumnRawPtrs & key_columns, Arena * aggregates_pool) const; - inline bool shouldClearStates(ConvertAction action, bool final_) const; +public: + /// Existed versions: + /// STATE V1 - Legacy version (REVISION 1) + /// STATE V2 - REVISION 1 (Enable revision) + /// STATE V3 - REVISION 3 (Add expanded data) + static constexpr UInt64 STATE_V2_MIN_REVISION = 1; + static constexpr UInt64 STATE_V3_MIN_REVISION = 3; VersionType getVersionFromRevision(UInt64 revision) const; VersionType getVersion() const; -public: - /// Existed versions: - /// STATE VERSION 1 - Legacy version - /// STATE VERSION 2 - REVISION 1 (Enable revision) - static constexpr UInt64 STATE_V2_MIN_REVISION = 1; + void checkpoint(const AggregatedDataVariants & data_variants, WriteBuffer & wb) const; + void recover(AggregatedDataVariants & data_variants, ReadBuffer & rb) const; - void checkpoint(const AggregatedDataVariants & data_variants, WriteBuffer & wb); - void recover(AggregatedDataVariants & data_variants, ReadBuffer & rb); +private: + /// [Version-3] + void doCheckpointV3(const AggregatedDataVariants & data_variants, WriteBuffer & wb) const; + void doRecoverV3(AggregatedDataVariants & data_variants, ReadBuffer & rb) const; - void doCheckpoint(const AggregatedDataVariants & data_variants, WriteBuffer & wb); - void doRecover(AggregatedDataVariants & data_variants, ReadBuffer & rb); + /// [Version-2] + void doCheckpointV2(const AggregatedDataVariants 
& data_variants, WriteBuffer & wb) const; + void doRecoverV2(AggregatedDataVariants & data_variants, ReadBuffer & rb) const; /// [Legacy] void doCheckpointLegacy(const AggregatedDataVariants & data_variants, WriteBuffer & wb); diff --git a/src/Processors/Transforms/Streaming/AggregatingHelper.cpp b/src/Processors/Transforms/Streaming/AggregatingHelper.cpp index 6fa6139d38a..849b82b802c 100644 --- a/src/Processors/Transforms/Streaming/AggregatingHelper.cpp +++ b/src/Processors/Transforms/Streaming/AggregatingHelper.cpp @@ -30,12 +30,19 @@ Chunk mergeBlocksToChunk(BlocksList && blocks) return merged_chunk; } -Chunk convertToChunkImpl(AggregatedDataVariants & data, const AggregatingTransformParams & params, ConvertAction action) +Chunk convertToChunkImpl(AggregatedDataVariants & data, const AggregatingTransformParams & params, AggregateStateType type) { if (data.empty()) return {}; - auto blocks = params.aggregator.convertToBlocks(data, params.final, action, params.params.max_threads); + BlocksList blocks; + if (type == AggregateStateType::OnlyUpdated) + blocks = params.aggregator.convertUpdatedToBlocks(data); + else if (type == AggregateStateType::OnlyRetracted) + blocks = params.aggregator.convertRetractedToBlocks(data); + else + blocks = params.aggregator.convertToBlocks(data, params.final, !params.params.keep_state, params.params.max_threads); + /// FIXME: When global aggr states was converted two level hash table, the merged chunk may be too large return mergeBlocksToChunk(std::move(blocks)); } @@ -45,12 +52,12 @@ namespace AggregatingHelper { Chunk convertToChunk(AggregatedDataVariants & data, const AggregatingTransformParams & params) { - return convertToChunkImpl(data, params, ConvertAction::StreamingEmit); + return convertToChunkImpl(data, params, AggregateStateType::Normal); } Chunk mergeAndConvertToChunk(ManyAggregatedDataVariants & data, const AggregatingTransformParams & params) { - auto blocks = params.aggregator.mergeAndConvertToBlocks(data, params.final, ConvertAction::StreamingEmit, params.params.max_threads); + auto blocks = params.aggregator.mergeAndConvertToBlocks(data, params.final, !params.params.keep_state, params.params.max_threads); /// FIXME: When global aggr states was converted two level hash table, the merged chunk may be too large return mergeBlocksToChunk(std::move(blocks)); } @@ -58,32 +65,21 @@ Chunk mergeAndConvertToChunk(ManyAggregatedDataVariants & data, const Aggregatin Chunk spliceAndConvertBucketsToChunk( AggregatedDataVariants & data, const AggregatingTransformParams & params, const std::vector & buckets) { - if (buckets.size() == 1) - return convertToChunk(params.aggregator.convertOneBucketToBlock(data, params.final, ConvertAction::StreamingEmit, buckets[0])); - else - return convertToChunk(params.aggregator.spliceAndConvertBucketsToBlock(data, params.final, ConvertAction::InternalMerge, buckets)); + return convertToChunk(params.aggregator.spliceAndConvertBucketsToBlock(data, params.final, /*clear_states*/ false, buckets)); } Chunk mergeAndSpliceAndConvertBucketsToChunk( ManyAggregatedDataVariants & data, const AggregatingTransformParams & params, const std::vector & buckets) { - if (buckets.size() == 1) - return convertToChunk( - params.aggregator.mergeAndConvertOneBucketToBlock(data, params.final, ConvertAction::StreamingEmit, buckets[0])); - else - return convertToChunk( - params.aggregator.mergeAndSpliceAndConvertBucketsToBlock(data, params.final, ConvertAction::InternalMerge, buckets)); + return 
convertToChunk(params.aggregator.mergeAndSpliceAndConvertBucketsToBlock(data, params.final, /*clear_states*/ false, buckets)); } -ChunkPair -convertToChangelogChunk(AggregatedDataVariants & data, RetractedDataVariants & retracted_data, const AggregatingTransformParams & params) +ChunkPair convertToChangelogChunk(AggregatedDataVariants & data, const AggregatingTransformParams & params) { if (data.empty()) return {}; - assert(!retracted_data.empty()); - - auto retracted_chunk = convertToChunkImpl(retracted_data, params, ConvertAction::RetractedEmit); + auto retracted_chunk = convertToChunkImpl(data, params, AggregateStateType::OnlyRetracted); if (retracted_chunk) { auto retracted_delta_col = ColumnInt8::create(retracted_chunk.rows(), Int8(-1)); @@ -91,25 +87,46 @@ convertToChangelogChunk(AggregatedDataVariants & data, RetractedDataVariants & r retracted_chunk.setConsecutiveDataFlag(); } - auto chunk = convertToChunkImpl(data, params, ConvertAction::StreamingEmit); + auto chunk = convertToChunkImpl(data, params, AggregateStateType::OnlyUpdated); if (chunk) { auto delta_col = ColumnInt8::create(chunk.rows(), Int8(1)); chunk.addColumn(std::move(delta_col)); } - return {std::move(retracted_chunk), std::move(chunk)}; } -ChunkPair mergeAndConvertToChangelogChunk( - ManyAggregatedDataVariants & data, ManyRetractedDataVariants & retracted_data, const AggregatingTransformParams & params) +ChunkPair mergeAndConvertToChangelogChunk(ManyAggregatedDataVariants & data, const AggregatingTransformParams & params) { - auto [merged_data, merged_retracted_data] = params.aggregator.mergeRetractedGroups(data, retracted_data); - if (!merged_data) - return {}; + if (data.size() == 1) + return convertToChangelogChunk(*data[0], params); - assert(merged_retracted_data); - return convertToChangelogChunk(*merged_data, *merged_retracted_data, params); + ChunkPair results; + auto & [retracted_chunk, chunk] = results; + + auto merged_retracted_data = params.aggregator.mergeRetractedGroups(data); + if (merged_retracted_data) + { + retracted_chunk = convertToChunk(*merged_retracted_data, params); + if (retracted_chunk) + { + auto retracted_delta_col = ColumnInt8::create(retracted_chunk.rows(), Int8(-1)); + retracted_chunk.addColumn(std::move(retracted_delta_col)); + retracted_chunk.setConsecutiveDataFlag(); + } + } + + auto merged_updated_data = params.aggregator.mergeUpdatedGroups(data); + if (merged_updated_data) + { + chunk = convertToChunk(*merged_updated_data, params); + if (chunk) + { + auto delta_col = ColumnInt8::create(chunk.rows(), Int8(1)); + chunk.addColumn(std::move(delta_col)); + } + } + return results; } } } diff --git a/src/Processors/Transforms/Streaming/AggregatingHelper.h b/src/Processors/Transforms/Streaming/AggregatingHelper.h index 5ca32f6fc00..85b177b5b51 100644 --- a/src/Processors/Transforms/Streaming/AggregatingHelper.h +++ b/src/Processors/Transforms/Streaming/AggregatingHelper.h @@ -38,16 +38,13 @@ Chunk mergeAndSpliceAndConvertBucketsToChunk( ManyAggregatedDataVariants & data, const AggregatingTransformParams & params, const std::vector & buckets); /// Only used for emit changelog -/// @brief Based on new/updated groups @p retracted_data , only convert the state of changed groups (retracted: last state, aggregated: current state) -/// \data: current aggregated state of all groups -/// \retracted_data: only have last state of changed groups (i.e. 
new/updated/deleted)
+/// @brief Only convert the state of changed groups (retracted: last state, aggregated: current state)
+/// \data: current aggregated state of all groups (contains both retracted and updated states)
 /// @returns
 /// retracted_chunk: just contains retracted data of changed groups
 /// aggregated_chunk: just contains aggregated data of changed groups
-ChunkPair
-convertToChangelogChunk(AggregatedDataVariants & data, RetractedDataVariants & retracted_data, const AggregatingTransformParams & params);
-ChunkPair mergeAndConvertToChangelogChunk(
-    ManyAggregatedDataVariants & data, ManyRetractedDataVariants & retracted_data, const AggregatingTransformParams & params);
+ChunkPair convertToChangelogChunk(AggregatedDataVariants & data, const AggregatingTransformParams & params);
+ChunkPair mergeAndConvertToChangelogChunk(ManyAggregatedDataVariants & data, const AggregatingTransformParams & params);
 }
 }
diff --git a/src/Processors/Transforms/Streaming/AggregatingTransform.cpp b/src/Processors/Transforms/Streaming/AggregatingTransform.cpp
index 629a047c1f3..b9fa8205e75 100644
--- a/src/Processors/Transforms/Streaming/AggregatingTransform.cpp
+++ b/src/Processors/Transforms/Streaming/AggregatingTransform.cpp
@@ -498,7 +498,7 @@ void AggregatingTransform::checkpoint(CheckpointContextPtr ckpt_ctx)
         }
 
         /// Serializing no shared data
-        params->aggregator.checkpoint(variants, wb);
+        DB::serialize(variants, wb, params->aggregator);
 
         DB::writeIntBinary(watermark, wb);
 
@@ -554,7 +554,7 @@ void AggregatingTransform::recover(CheckpointContextPtr ckpt_ctx)
         }
 
         /// Serializing local or stable data during checkpointing
-        params->aggregator.recover(variants, rb);
+        DB::deserialize(variants, rb, params->aggregator);
 
         DB::readIntBinary(watermark, rb);
 
diff --git a/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp b/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp
index 918d1337658..fd1cda27554 100644
--- a/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp
+++ b/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp
@@ -270,7 +270,7 @@ void AggregatingTransformWithSubstream::checkpoint(CheckpointContextPtr ckpt_ctx
         for (const auto & [id, substream_ctx] : substream_contexts)
         {
             assert(id == substream_ctx->id);
-            substream_ctx->serialize(wb, getVersion());
+            serialize(*substream_ctx, wb, getVersion());
         }
     });
 }
@@ -284,7 +284,7 @@ void AggregatingTransformWithSubstream::recover(CheckpointContextPtr ckpt_ctx)
         for (size_t i = 0; i < num_substreams; ++i)
         {
             auto substream_ctx = std::make_shared<SubstreamContext>(this);
-            substream_ctx->deserialize(rb, version_);
+            deserialize(*substream_ctx, rb, version_);
             substream_contexts.emplace(substream_ctx->id, std::move(substream_ctx));
         }
     });
@@ -294,7 +294,7 @@ void SubstreamContext::serialize(WriteBuffer & wb, VersionType version) const
 {
     DB::Streaming::serialize(id, wb);
 
-    aggregating_transform->params->aggregator.checkpoint(variants, wb);
+    DB::serialize(variants, wb, aggregating_transform->params->aggregator);
 
     DB::writeIntBinary(finalized_watermark, wb);
 
@@ -312,7 +312,7 @@ void SubstreamContext::deserialize(ReadBuffer & rb, VersionType version)
 {
     DB::Streaming::deserialize(id, rb);
 
-    aggregating_transform->params->aggregator.recover(variants, rb);
+    DB::deserialize(variants, rb, aggregating_transform->params->aggregator);
 
     DB::readIntBinary(finalized_watermark, rb);
 
diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp
index 6d19e51fcc0..3049e4bebce 100644
--- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp
+++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp
@@ -9,6 +9,7 @@ namespace ErrorCodes
 {
 extern const int NOT_IMPLEMENTED;
 extern const int UNSUPPORTED;
+extern const int RECOVER_CHECKPOINT_FAILED;
 }
 
 namespace Streaming
@@ -40,35 +41,58 @@ GlobalAggregatingTransform::GlobalAggregatingTransform(
     if (unlikely(params->params.overflow_row))
         throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in global aggregation");
 
-    /// Need extra retracted data
     if (params->emit_changelog)
     {
         if (params->emit_version)
             throw Exception(ErrorCodes::UNSUPPORTED, "'emit_version()' is not supported in global aggregation emit changelog");
 
-        ManyRetractedDataVariants retracted_data(many_data->variants.size());
-        for (auto & elem : retracted_data)
-            elem = std::make_shared<RetractedDataVariants>();
-
+        bool retract_enabled = false;
         many_data->setField(
-            {std::move(retracted_data),
+            {retract_enabled,
             /// Field serializer
-            [this](const std::any & field, WriteBuffer & wb, VersionType) {
-                const auto & data = std::any_cast<const ManyRetractedDataVariants &>(field);
-                DB::writeIntBinary(data.size(), wb);
-                for (const auto & elem : data)
-                    params->aggregator.checkpoint(*elem, wb);
+            [](const std::any & field, WriteBuffer & wb, [[maybe_unused]] VersionType version) {
+                assert(version >= IMPL_V2_MIN_VERSION);
+                DB::writeBoolText(std::any_cast<bool>(field), wb);
             },
             /// Field deserializer
-            [this](std::any & field, ReadBuffer & rb, VersionType) {
-                auto & data = std::any_cast<ManyRetractedDataVariants &>(field);
-                size_t num;
-                DB::readIntBinary(num, rb);
-                data.resize(num);
-                for (auto & elem : data)
+            [this](std::any & field, ReadBuffer & rb, VersionType version) {
+                if (version >= IMPL_V2_MIN_VERSION)
+                {
+                    DB::readBoolText(std::any_cast<bool &>(field), rb);
+                }
+                else
                 {
-                    elem = std::make_shared<RetractedDataVariants>();
-                    params->aggregator.recover(*elem, rb);
+                    /// Convert old impl to new impl V2
+                    if (params->aggregator.expandedDataType() != ExpandedDataType::UpdatedWithRetracted)
+                        throw Exception(
+                            ErrorCodes::RECOVER_CHECKPOINT_FAILED,
+                            "Failed to recover aggregation checkpoint. The old version '{}' checkpoint contains retracted data, "
+                            "but the current aggregator does not expect it",
+                            version);
+
+                    size_t retracted_num;
+                    DB::readIntBinary(retracted_num, rb);
+                    if (retracted_num != many_data->variants.size())
+                        throw Exception(
+                            ErrorCodes::RECOVER_CHECKPOINT_FAILED,
+                            "Failed to recover aggregation checkpoint. 
Recover old version '{}' checkpoint but the scale of the pipeline " + "is " + "inconsistent, checkpointed={}, current={}", + version, + retracted_num, + many_data->variants.size()); + + bool has_retracted = false; + for (auto & current : many_data->variants) + { + AggregatedDataVariants retracted; + DB::deserialize(retracted, rb, params->aggregator); + has_retracted |= retracted.size() > 0; + params->aggregator.mergeRetractedInto(*current, std::move(retracted)); + } + + std::any_cast(field) = many_data->emited_version > 0 || has_retracted; /// retracted enabled } }}); } @@ -103,14 +127,17 @@ std::pair GlobalAggregatingTransform::executeOrMergeColumns(Chunk & if (params->emit_changelog) { assert(!params->only_merge); - - auto & retracted_variants = many_data->getField()[current_variant]; - auto & aggregated_variants = many_data->variants[current_variant]; - /// Blocking finalization during execution on current variant std::lock_guard lock(variants_mutex); - return params->aggregator.executeAndRetractOnBlock( - chunk.detachColumns(), 0, num_rows, *aggregated_variants, *retracted_variants, key_columns, aggregate_columns, no_more_keys); + + /// Enable retract after first finalization + auto retract_enabled = many_data->getField(); + if (retract_enabled) [[likely]] + return params->aggregator.executeAndRetractOnBlock( + chunk.detachColumns(), 0, num_rows, variants, key_columns, aggregate_columns, no_more_keys); + else + return params->aggregator.executeOnBlock( + chunk.detachColumns(), 0, num_rows, variants, key_columns, aggregate_columns, no_more_keys); } else return AggregatingTransform::executeOrMergeColumns(chunk, num_rows); @@ -127,8 +154,9 @@ void GlobalAggregatingTransform::finalize(const ChunkContextPtr & chunk_ctx) if (params->emit_changelog) { - auto [retracted_chunk, chunk] = AggregatingHelper::mergeAndConvertToChangelogChunk( - many_data->variants, many_data->getField(), *params); + auto [retracted_chunk, chunk] = AggregatingHelper::mergeAndConvertToChangelogChunk(many_data->variants, *params); + /// Enable retract after first finalization + many_data->getField() |= chunk.rows(); chunk.setChunkContext(chunk_ctx); setCurrentChunk(std::move(chunk), std::move(retracted_chunk)); diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h index 474824e1977..975fe4e115f 100644 --- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h +++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h @@ -28,6 +28,8 @@ class GlobalAggregatingTransform final : public AggregatingTransform bool prepareFinalization(Int64 min_watermark) override; void finalize(const ChunkContextPtr & chunk_ctx) override; + + static constexpr VersionType IMPL_V2_MIN_VERSION = 3; }; } diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp index e223ee5b623..d59f40c2199 100644 --- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp +++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp @@ -9,6 +9,7 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int UNSUPPORTED; +extern const int RECOVER_CHECKPOINT_FAILED; } namespace Streaming @@ -28,20 +29,42 @@ GlobalAggregatingTransformWithSubstream::GlobalAggregatingTransformWithSubstream SubstreamContextPtr 
GlobalAggregatingTransformWithSubstream::getOrCreateSubstreamContext(const SubstreamID & id) { auto substream_ctx = AggregatingTransformWithSubstream::getOrCreateSubstreamContext(id); + /// Need extra retracted data for old version impl if (params->emit_changelog && !substream_ctx->hasField()) { + bool retract_enabled = false; substream_ctx->setField( - {std::make_shared(), - /// Field serializer - [this](const std::any & field, WriteBuffer & wb, VersionType) { - const auto & data = std::any_cast(field); - params->aggregator.checkpoint(*data, wb); - }, - /// Field deserializer - [this](std::any & field, ReadBuffer & rb, VersionType) { - auto & data = std::any_cast(field); - params->aggregator.recover(*data, rb); - }}); + {retract_enabled, + /// Field serializer + [](const std::any & field, WriteBuffer & wb, VersionType version) { + assert(version >= IMPL_V2_MIN_VERSION); + DB::writeBoolText(std::any_cast(field), wb); + }, + /// Field deserializer + [substream_ctx, this](std::any & field, ReadBuffer & rb, VersionType version) { + if (version >= IMPL_V2_MIN_VERSION) + { + DB::readBoolText(std::any_cast(field), rb); + } + else + { + /// Convert old impl to new impl V2 + if (params->aggregator.expandedDataType() != ExpandedDataType::UpdatedWithRetracted) + throw Exception( + ErrorCodes::RECOVER_CHECKPOINT_FAILED, + "Failed to recover aggregation checkpoint. Recover old version '{}' checkpoint, checkpointed need retracted, " + "but " + "current not need", + version); + + AggregatedDataVariants retracted; + DB::deserialize(retracted, rb, params->aggregator); + bool has_retracted = retracted.size() > 0; + params->aggregator.mergeRetractedInto(substream_ctx->variants, std::move(retracted)); + + std::any_cast(field) = substream_ctx->emited_version > 0 || has_retracted; /// retracted enabled + } + }}); } return substream_ctx; } @@ -52,13 +75,14 @@ GlobalAggregatingTransformWithSubstream::executeOrMergeColumns(Chunk & chunk, co if (params->emit_changelog) { assert(!params->only_merge); - auto num_rows = chunk.getNumRows(); - auto & retracted_variants = substream_ctx->getField(); - auto & aggregated_variants = substream_ctx->variants; - - return params->aggregator.executeAndRetractOnBlock( - chunk.detachColumns(), 0, num_rows, aggregated_variants, *retracted_variants, key_columns, aggregate_columns, no_more_keys); + auto retract_enabled = substream_ctx->getField(); + if (retract_enabled) [[likely]] + return params->aggregator.executeAndRetractOnBlock( + chunk.detachColumns(), 0, num_rows, substream_ctx->variants, key_columns, aggregate_columns, no_more_keys); + else + return params->aggregator.executeOnBlock( + chunk.detachColumns(), 0, num_rows, substream_ctx->variants, key_columns, aggregate_columns, no_more_keys); } else return AggregatingTransformWithSubstream::executeOrMergeColumns(chunk, substream_ctx); @@ -87,8 +111,10 @@ void GlobalAggregatingTransformWithSubstream::finalize(const SubstreamContextPtr auto start = MonotonicMilliseconds::now(); if (params->emit_changelog) { - auto [retracted_chunk, chunk] - = AggregatingHelper::convertToChangelogChunk(variants, *substream_ctx->getField(), *params); + auto [retracted_chunk, chunk] = AggregatingHelper::convertToChangelogChunk(variants, *params); + /// Enable retract after first finalization + substream_ctx->getField() |= chunk.rows(); + chunk.setChunkContext(chunk_ctx); setCurrentChunk(std::move(chunk), std::move(retracted_chunk)); } diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h 
b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h index 27c69ba6ac5..72bc161bf7c 100644 --- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h +++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h @@ -21,6 +21,8 @@ class GlobalAggregatingTransformWithSubstream final : public AggregatingTransfor private: void finalize(const SubstreamContextPtr & substream_ctx, const ChunkContextPtr & chunk_ctx) override; + + static constexpr VersionType IMPL_V2_MIN_VERSION = 3; }; } From 9a93a4a02d94b772116a00a9be65ec813ccddddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lisen=20=E6=9D=A8?= Date: Tue, 30 Jan 2024 14:45:21 +0800 Subject: [PATCH 3/5] fix unstable smoke test --- .../test_stream_smoke/0001_view_case.json | 7 +- .../0013_changelog_stream13.yaml | 4 +- .../0013_changelog_stream14.yaml | 24 ++-- .../0013_changelog_stream2.json | 114 ++++++++---------- .../0018_query_state7_view.json | 6 +- .../0030_two_level_global_aggr.yaml | 4 +- .../test_stream_smoke/0099_fixed_issues.json | 2 +- 7 files changed, 75 insertions(+), 86 deletions(-) diff --git a/tests/stream/test_stream_smoke/0001_view_case.json b/tests/stream/test_stream_smoke/0001_view_case.json index cc73303442b..b5eae39feb0 100644 --- a/tests/stream/test_stream_smoke/0001_view_case.json +++ b/tests/stream/test_stream_smoke/0001_view_case.json @@ -482,8 +482,8 @@ "steps":[ {"statements": [ {"client":"python", "query_type": "table", "query":"drop view if exists test1_mv_union"}, - {"client":"python","query_id":"300", "wait":3, "query_type": "table", "query":"create materialized view test1_mv_union as (select id, sum(value) as sum_value from test1_mv group by id limit 4 union select id, sum(value) as sum_value from test1_mv group by id limit 4)"}, - {"client":"python", "query_id":"301", "query_type": "stream","depends_on_stream":"test1_mv_union","wait":2,"query_end_timer":5,"drop_view":"test1_mv_union", "drop_view_wait":1, "query":"select id, sum_value from test1_mv_union settings seek_to='earliest'"} + {"client":"python", "wait":3, "query_type": "table", "query":"create materialized view test1_mv_union as (select id, sum(value) as sum_value from test1_mv group by id union select id, sum(value) as sum_value from test1_mv group by id)"}, + {"client":"python", "query_id":"301", "query_type": "stream","depends_on_stream":"test1_mv_union","wait":2, "query":"select id, sum_value from test1_mv_union settings seek_to='earliest'"} ]}, {"inputs": [ @@ -498,7 +498,8 @@ ["dev2", "ca", 76, "\"create_time\":\"2021-11-02 20:00:10\"", "2020-02-02 20:01:05"], ["dev2", "ca", 80, "\"create_time\":\"2021-11-02 20:00:01\"", "2020-02-02 20:01:03"], ["dev8", "ca", 67, "\"create_time\":\"2021-11-02 20:00:01\"", "2020-02-02 20:01:02"], - ["dev8", "ca", 77, "\"create_time\":\"2021-11-02 20:00:10\"", "2020-02-02 20:01:08"]]} + ["dev8", "ca", 77, "\"create_time\":\"2021-11-02 20:00:10\"", "2020-02-02 20:01:08"]], + "kill":301, "kill_wait":3, "drop_view":"test1_mv_union", "drop_view_wait":2} ]} ], diff --git a/tests/stream/test_stream_smoke/0013_changelog_stream13.yaml b/tests/stream/test_stream_smoke/0013_changelog_stream13.yaml index 12c2345964a..73af7a1dce0 100644 --- a/tests/stream/test_stream_smoke/0013_changelog_stream13.yaml +++ b/tests/stream/test_stream_smoke/0013_changelog_stream13.yaml @@ -368,12 +368,12 @@ tests: wait: 2 depends_on_stream: changelog_kv_13 query_id: '13108' - query: select count_distinct(val), sum_distinct(val) from changelog_kv_13; + query: select 
count_distinct(val), sum_distinct(val) from changelog_kv_13 emit periodic 1s; - client: python query_type: table depends_on: '13108' - wait: 3 + wait: 2 kill: '13108' kill_wait: 2 query: insert into changelog_kv_13(id, val, _tp_delta) values(2, 1, 1)(2, 1, -1)(3, 2, 1)(3, 2, -1); diff --git a/tests/stream/test_stream_smoke/0013_changelog_stream14.yaml b/tests/stream/test_stream_smoke/0013_changelog_stream14.yaml index 09912690e77..056513b5aea 100644 --- a/tests/stream/test_stream_smoke/0013_changelog_stream14.yaml +++ b/tests/stream/test_stream_smoke/0013_changelog_stream14.yaml @@ -207,34 +207,34 @@ tests: wait: 2 depends_on_stream: test_changelog_14 query_id: '15112' - query: select group_uniq_array(val), _tp_delta from test_changelog_14 emit changelog; + query: select group_uniq_array(val), _tp_delta from test_changelog_14 emit changelog periodic 1s; - client: python query_type: table depends_on: '15112' - wait: 3 + wait: 2 query: insert into test_changelog_14(id, val) values(1, 1); - client: python query_type: table - wait: 2 + wait: 1 query: insert into test_changelog_14(id, val) values(1, 2); - client: python query_type: table - wait: 2 + wait: 1 query: insert into test_changelog_14(id, val) values(2, 3); - client: python query_type: table - wait: 2 + wait: 1 query: insert into test_changelog_14(id, val) values(3, 3); - client: python query_type: table kill : '15112' kill_wait: 2 - wait: 2 + wait: 1 query: insert into test_changelog_14(id, val) values(3, 4); expected_results: - query_id: '15112' @@ -278,27 +278,27 @@ tests: - client: python query_type: table depends_on: '15113' - wait: 3 + wait: 2 query: insert into test_changelog_14(id, val) values(1, 1), (2,2); - client: python query_type: table kill: '15113' - kill_wait: 2 - wait: 3 + kill_wait: 3 + wait: 2 query: insert into test_changelog_14(id, val, _tp_delta) values(3, 3, +1), (2, 2, -1); - statements: - client: python query_type: stream query_id: 15113-1 - wait: 2 + wait: 1 terminate: manual query: recover from '15113' - client: python query_type: table - depends_on_stream: test_changelog_14 + depends_on: '15113' wait: 2 query: insert into test_changelog_14(id, val) values(4, 4), (5,5); @@ -314,7 +314,7 @@ tests: - client: python query_type: table - wait: 2 + wait: 3 query: kill query where query_id='15113-1' sync - client: python diff --git a/tests/stream/test_stream_smoke/0013_changelog_stream2.json b/tests/stream/test_stream_smoke/0013_changelog_stream2.json index 42b8e4d7a38..42c6b1ff542 100644 --- a/tests/stream/test_stream_smoke/0013_changelog_stream2.json +++ b/tests/stream/test_stream_smoke/0013_changelog_stream2.json @@ -191,7 +191,7 @@ {"client":"python", "query_type": "table", "exist":"test14_append_stream1_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream1_2 (i int, k1 int, k2 string)"}, {"client":"python", "query_type": "table", "exist":"test14_append_stream2_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream2_2 (j int, kk1 int, kk2 string) primary key(kk1, kk2) settings mode='versioned_kv'"}, {"client":"python", "query_type": "table", "exist":"test14_append_stream3_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream3_2 (k int, kkk1 int, kkk2 string) primary key (kkk1, kkk2) settings mode='versioned_kv'"}, - {"client":"python", "query_type": "stream", "query_id":"1444", "wait":1, "terminate":"manual", "query":"select a.i, a.k1, a.k2, b.j, b.kk1, b.kk2, c.k, c.kkk1, c.kkk2, _tp_delta from 
test14_append_stream1_2 as a inner all join test14_append_stream2_2 as b on a.i = b.j inner all join test14_append_stream3_2 as c on b.kk2 = c.kkk2"}, + {"client":"python", "query_type": "stream", "depends_on_stream":"test14_append_stream3_2", "query_id":"1444", "wait":1, "terminate":"manual", "query":"select a.i, a.k1, a.k2, b.j, b.kk1, b.kk2, c.k, c.kkk1, c.kkk2, _tp_delta from test14_append_stream1_2 as a inner all join test14_append_stream2_2 as b on a.i = b.j inner all join test14_append_stream3_2 as c on b.kk2 = c.kkk2"}, {"client":"python", "query_type": "table", "depends_on":"1444", "wait":1, "query": "insert into test14_append_stream3_2(k, kkk1, kkk2) values (3, 2, 'k2')"}, {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream2_2(j, kk1, kk2) values (1, 1, 'k2')"}, {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream1_2(i, k1, k2) values (1, 1, 'k')"}, @@ -221,13 +221,10 @@ "statements": [ {"client":"python", "query_type":"table", "query":"drop stream if exists test14_append_stream_2"}, {"client":"python", "query_type": "table", "exist":"test14_append_stream_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream_2 (id int, name string, val float, ts datetime)"}, - {"client":"python", "query_type": "stream", "query_id":"1445", "wait":1, "terminate":"manual", "query":"select count(), min(val), max(val), sum(val), avg(val) from changelog(test14_append_stream_2, id, name)"}, - {"client":"python", "query_type": "table", "depends_on":"1445", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'a', 22.2 ,'2020-02-02 20:00:01')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 33.3 ,'2020-02-02 20:00:02')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'a', 44.4 ,'2020-02-02 20:00:03')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:04')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'b', 44.4 ,'2020-02-02 20:00:05')"}, + {"client":"python", "query_type": "stream", "depends_on_stream":"test14_append_stream_2", "query_id":"1445", "wait":1, "terminate":"manual", "query":"select count(), min(val), max(val), sum(val), avg(val) from changelog(test14_append_stream_2, id, name) emit periodic 1s"}, + {"client":"python", "query_type": "table", "depends_on":"1445", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00') (2, 'a', 22.2 ,'2020-02-02 20:00:01')"}, + {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 33.3 ,'2020-02-02 20:00:02') (2, 'a', 44.4 ,'2020-02-02 20:00:03')"}, + {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:04') (2, 'b', 44.4 ,'2020-02-02 20:00:05')"}, {"client":"python", "query_type": "table", "kill":"1445", "kill_wait":3, "wait":1, 
"query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 33.3 ,'2020-02-02 20:00:05')"} ] } @@ -236,10 +233,10 @@ { "query_id":"1445", "expected_results":[ - [2, 11.100000381469727, 22.200000762939453, 33.30000114440918, 16.65000057220459], - [2, 33.29999923706055, 44.400001525878906, 77.70000076293945, 38.85000038146973], - [4, 22.200000762939453, 44.400001525878906, 144.3000030517578, 36.07500076293945], - [4, 33.29999923706055, 44.400001525878906, 155.4000015258789, 38.85000038146973] + [2, 11.1, 22.2, 33.3, 16.65], + [2, 33.3, 44.4, 77.7, 38.85], + [4, 22.2, 44.4, 144.3, 36.075], + [4, 33.3, 44.4, 155.4, 38.85] ] } ] @@ -254,15 +251,11 @@ "statements": [ {"client":"python", "query_type":"table", "query":"drop stream if exists test14_append_stream_2"}, {"client":"python", "query_type": "table", "exist":"test14_append_stream_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream_2 (id int, name string, val float, ts datetime)"}, - {"client":"python", "query_type": "stream", "query_id":"1446", "wait":1, "terminate":"manual", "query":"select count(), min(val), max(val), sum(val), avg(val) from changelog(test14_append_stream_2, id, name) group by id"}, - {"client":"python", "query_type": "table", "depends_on":"1446", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'a', 33.3 ,'2020-02-02 20:00:01')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:02')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'b', 22.2 ,'2020-02-02 20:00:03')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'c', 33.3 ,'2020-02-02 20:00:04')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'c', 11.1 ,'2020-02-02 20:00:05')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 44.4 ,'2020-02-02 20:00:06')"}, - {"client":"python", "query_type": "table", "kill":"1446", "kill_wait":3, "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'b', 44.4 ,'2020-02-02 20:00:07')"} + {"client":"python", "query_type": "stream", "depends_on_stream":"test14_append_stream_2", "query_id":"1446", "wait":1, "terminate":"manual", "query":"select count(), min(val), max(val), sum(val), avg(val) from changelog(test14_append_stream_2, id, name) group by id emit periodic 1s"}, + {"client":"python", "query_type": "table", "depends_on":"1446", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00') (2, 'a', 33.3 ,'2020-02-02 20:00:01')"}, + {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:02') (2, 'b', 22.2 ,'2020-02-02 20:00:03')"}, + {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'c', 33.3 ,'2020-02-02 20:00:04') (2, 'c', 11.1 ,'2020-02-02 20:00:05')"}, + 
{"client":"python", "query_type": "table", "kill":"1446", "kill_wait":3, "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 44.4 ,'2020-02-02 20:00:06') (2, 'b', 44.4 ,'2020-02-02 20:00:07')"} ] } ], @@ -270,14 +263,14 @@ { "query_id":"1446", "expected_results":[ - [1, 33.29999923706055, 33.29999923706055, 33.29999923706055, 33.29999923706055], - [1, 11.100000381469727, 11.100000381469727, 11.100000381469727, 11.100000381469727], - [2, 22.200000762939453, 33.29999923706055, 55.5, 27.75], - [2, 11.100000381469727, 22.200000762939453, 33.30000114440918, 16.65000057220459], - [3, 11.100000381469727, 33.29999923706055, 66.60000038146973, 22.200000127156574], - [3, 11.100000381469727, 33.29999923706055, 66.60000038146973, 22.200000127156574], - [3, 11.100000381469727, 44.400001525878906, 88.80000114440918, 29.600000381469727], - [3, 11.100000381469727, 44.400001525878906, 88.80000114440918, 29.600000381469727] + [1, 33.3, 33.3, 33.3, 33.3], + [1, 11.1, 11.1, 11.1, 11.1], + [2, 22.2, 33.3, 55.5, 27.75], + [2, 11.1, 22.2, 33.3, 16.65], + [3, 11.1, 33.3, 66.6, 22.2], + [3, 11.1, 33.3, 66.6, 22.2], + [3, 11.1, 44.4, 88.8, 29.6], + [3, 11.1, 44.4, 88.8, 29.6] ] } ] @@ -292,15 +285,11 @@ "statements": [ {"client":"python", "query_type":"table", "query":"drop stream if exists test14_append_stream_2"}, {"client":"python", "query_type": "table", "exist":"test14_append_stream_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream_2 (id int, name string, val float, ts datetime)"}, - {"client":"python", "query_type": "stream", "query_id":"1447", "wait":1, "terminate":"manual", "query":"select count(), min(val), max(val), sum(val), avg(val) from changelog(test14_append_stream_2, id, name) group by id"}, - {"client":"python", "query_type": "table", "depends_on":"1447", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'a', 33.3 ,'2020-02-02 20:00:01')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:02')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'b', 22.2 ,'2020-02-02 20:00:03')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'c', 33.3 ,'2020-02-02 20:00:04')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'c', 11.1 ,'2020-02-02 20:00:05')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 44.4 ,'2020-02-02 20:00:06')"}, - {"client":"python", "query_type": "table", "kill":"1447", "kill_wait":3, "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'b', 44.4 ,'2020-02-02 20:00:07')"} + {"client":"python", "query_type": "stream", "query_id":"1447", "wait":1, "terminate":"manual", "query":"select count(), min(val), max(val), sum(val), avg(val) from changelog(test14_append_stream_2, id, name) group by id emit periodic 1s"}, + {"client":"python", "query_type": "table", "depends_on":"1447", "wait":1, "query": "insert into test14_append_stream_2 (id, 
name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00') (2, 'a', 33.3 ,'2020-02-02 20:00:01')"}, + {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:02') (2, 'b', 22.2 ,'2020-02-02 20:00:03')"}, + {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'c', 33.3 ,'2020-02-02 20:00:04') (2, 'c', 11.1 ,'2020-02-02 20:00:05')"}, + {"client":"python", "query_type": "table", "kill":"1447", "kill_wait":3, "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 44.4 ,'2020-02-02 20:00:06') (2, 'b', 44.4 ,'2020-02-02 20:00:07')"} ] } ], @@ -308,14 +297,14 @@ { "query_id":"1447", "expected_results":[ - [1, 33.29999923706055, 33.29999923706055, 33.29999923706055, 33.29999923706055], - [1, 11.100000381469727, 11.100000381469727, 11.100000381469727, 11.100000381469727], - [2, 22.200000762939453, 33.29999923706055, 55.5, 27.75], - [2, 11.100000381469727, 22.200000762939453, 33.30000114440918, 16.65000057220459], - [3, 11.100000381469727, 33.29999923706055, 66.60000038146973, 22.200000127156574], - [3, 11.100000381469727, 33.29999923706055, 66.60000038146973, 22.200000127156574], - [3, 11.100000381469727, 44.400001525878906, 88.80000114440918, 29.600000381469727], - [3, 11.100000381469727, 44.400001525878906, 88.80000114440918, 29.600000381469727] + [1, 33.3, 33.3, 33.3, 33.3], + [1, 11.1, 11.1, 11.1, 11.1], + [2, 22.2, 33.3, 55.5, 27.75], + [2, 11.1, 22.2, 33.3, 16.65], + [3, 11.1, 33.3, 66.6, 22.2], + [3, 11.1, 33.3, 66.6, 22.2], + [3, 11.1, 44.4, 88.8, 29.6], + [3, 11.1, 44.4, 88.8, 29.6] ] } ] @@ -344,8 +333,8 @@ { "query_id":"1448", "expected_results":[ - [1, 3, 11.100000381469727, 44.400001525878906, 88.80000114440918, 29.600000381469727, "2020-02-02 20:00:00", "2020-02-02 20:00:05"], - [2, 2, 22.200000762939453, 44.400001525878906, 66.60000228881836, 33.30000114440918, "2020-02-02 20:00:00", "2020-02-02 20:00:05"] + [1, 3, 11.1, 44.4, 88.8, 29.6, "2020-02-02 20:00:00", "2020-02-02 20:00:05"], + [2, 2, 22.2, 44.4, 66.6, 33.3, "2020-02-02 20:00:00", "2020-02-02 20:00:05"] ] } ] @@ -360,7 +349,7 @@ "statements": [ {"client":"python", "query_type":"table", "query":"drop stream if exists test14_append_stream_2"}, {"client":"python", "query_type": "table", "exist":"test14_append_stream_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream_2 (id int, name string, val float, ts datetime)"}, - {"client":"python", "query_type": "stream", "query_id":"1449", "wait":1, "terminate":"manual", "query":"with subquery as (select * from changelog(test14_append_stream_2, id, name))select id, count(*), min(val), max(val) from subquery group by id"}, + {"client":"python", "query_type": "stream", "query_id":"1449", "wait":1, "terminate":"manual", "query":"with subquery as (select * from changelog(test14_append_stream_2, id, name))select id, count(*), min(val), max(val) from subquery group by id emit periodic 1s"}, {"client":"python", "query_type": "table", "depends_on":"1449", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00')"}, {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:02')"}, {"client":"python", "query_type": "table", "kill":"1449", "kill_wait":3, "wait":1, "query": "insert into 
test14_append_stream_2 (id, name, val, ts) values (1, 'a', 33.3 ,'2020-02-02 20:00:04')"} @@ -371,8 +360,9 @@ { "query_id":"1449", "expected_results":[ - [1, 2, 11.100000381469727, 22.200000762939453], - [1, 2, 22.200000762939453, 33.29999923706055] + [1, 1, 11.1, 11.1], + [1, 2, 11.1, 22.2], + [1, 2, 22.2, 33.3] ] } ] @@ -387,15 +377,11 @@ "statements": [ {"client":"python", "query_type":"table", "query":"drop stream if exists test14_append_stream_2"}, {"client":"python", "query_type": "table", "exist":"test14_append_stream_2", "exist_wait":2, "wait":1, "query":"create stream if not exists test14_append_stream_2 (id int, name string, val float, ts datetime)"}, - {"client":"python", "query_type": "stream", "query_id":"1450", "wait":1, "terminate":"manual", "query":"with subquery as (select id, count() as cnt, min(val) as min_val, max(val) as max_val from changelog(test14_append_stream_2, id, name) group by id)select count(*), sum(cnt), min(min_val), max(max_val) from subquery"}, - {"client":"python", "query_type": "table", "depends_on":"1450", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'a', 33.3 ,'2020-02-02 20:00:01')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:02')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'b', 22.2 ,'2020-02-02 20:00:03')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'c', 33.3 ,'2020-02-02 20:00:04')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'c', 11.1 ,'2020-02-02 20:00:05')"}, - {"client":"python", "query_type": "table", "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 44.4 ,'2020-02-02 20:00:06')"}, - {"client":"python", "query_type": "table", "kill":"1450", "kill_wait":3, "wait":1, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (2, 'b', 44.4 ,'2020-02-02 20:00:07')"} + {"client":"python", "query_type": "stream", "depends_on_stream":"test14_append_stream_2", "query_id":"1450", "wait":1, "terminate":"manual", "query":"with subquery as (select id, count() as cnt, min(val) as min_val, max(val) as max_val from changelog(test14_append_stream_2, id, name) group by id)select count(*), sum(cnt), min(min_val), max(max_val) from subquery"}, + {"client":"python", "query_type": "table", "depends_on":"1450", "wait":2, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'a', 11.1 ,'2020-02-02 20:00:00') (2, 'a', 33.3 ,'2020-02-02 20:00:01')"}, + {"client":"python", "query_type": "table", "wait":2, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'b', 22.2 ,'2020-02-02 20:00:02') (2, 'b', 22.2 ,'2020-02-02 20:00:03')"}, + {"client":"python", "query_type": "table", "wait":2, "query": "insert into test14_append_stream_2 (id, name, val, ts) values (1, 'c', 33.3 ,'2020-02-02 20:00:04') (2, 'c', 11.1 ,'2020-02-02 20:00:05')"}, + {"client":"python", "query_type": "table", "kill":"1450", "kill_wait":3, "wait":2, "query": "insert into test14_append_stream_2 (id, name, val, ts) values 
(1, 'b', 44.4 ,'2020-02-02 20:00:06') (2, 'b', 44.4 ,'2020-02-02 20:00:07')"} ] } ], @@ -403,10 +389,10 @@ { "query_id":"1450", "expected_results":[ - [2, 2, 11.100000381469727, 33.29999923706055], - [2, 4, 11.100000381469727, 33.29999923706055], - [2, 6, 11.100000381469727, 33.29999923706055], - [2, 6, 11.100000381469727, 44.400001525878906] + [2, 2, 11.1, 33.3], + [2, 4, 11.1, 33.3], + [2, 6, 11.1, 33.3], + [2, 6, 11.1, 44.4] ] } ] diff --git a/tests/stream/test_stream_smoke/0018_query_state7_view.json b/tests/stream/test_stream_smoke/0018_query_state7_view.json index 2803f5c8d77..79a53ddd044 100644 --- a/tests/stream/test_stream_smoke/0018_query_state7_view.json +++ b/tests/stream/test_stream_smoke/0018_query_state7_view.json @@ -548,7 +548,7 @@ {"client":"python", "query_type": "table", "wait":1, "query":"drop view if exists test19_state_mv7"}, {"client":"python", "query_type": "table", "wait":1, "query":"drop stream if exists test19_state_stream7"}, {"client":"python", "query_type": "table", "exist":"test19_state_stream7", "exist_wait":2, "wait":1, "query":"create stream test19_state_stream7 (id string, location string, value float, timestamp datetime64(3) default now64(3))"}, - {"client":"python", "query_type": "table", "exist":"test19_state_mv7", "exist_wait":2, "wait":1, "query":"create materialized view test19_state_mv7 as (select id, sum(value) as sum_value from test19_state_stream7 group by id limit 4 union select id, sum(value) as sum_value from test19_state_stream7 group by id limit 4)"}, + {"client":"python", "query_type": "table", "exist":"test19_state_mv7", "exist_wait":2, "wait":1, "query":"create materialized view test19_state_mv7 as (select id, sum(value) as sum_value from test19_state_stream7 group by id union select id, sum(value) as sum_value from test19_state_stream7 group by id)"}, {"client":"python", "query_type": "stream", "query_id":"19177", "wait":1, "terminate":"manual", "query":"subscribe to select id, sum_value from test19_state_mv7 settings checkpoint_interval=1"}, {"client":"python", "query_type": "table", "depends_on":"19177", "kill":"19177", "kill_wait":3, "wait":1, "query": "insert into test19_state_stream7(id, location, value, timestamp) values ('dev1', 'ca', 57.3, '2020-02-02 20:00:00')('dev2', 'ca', 58.3, '2020-02-02 20:00:03')"} ] @@ -579,8 +579,10 @@ "expected_results":[ ["dev1", "57.3"], ["dev2", "127.3"], + ["dev4", "67"], ["dev1", "57.3"], - ["dev2", "127.3"] + ["dev2", "127.3"], + ["dev4", "67"] ] } ] diff --git a/tests/stream/test_stream_smoke/0030_two_level_global_aggr.yaml b/tests/stream/test_stream_smoke/0030_two_level_global_aggr.yaml index 09a1c4d9240..9c5657d1111 100644 --- a/tests/stream/test_stream_smoke/0030_two_level_global_aggr.yaml +++ b/tests/stream/test_stream_smoke/0030_two_level_global_aggr.yaml @@ -131,7 +131,7 @@ tests: query_id: 3100 depends_on_stream: test_31_multishards_stream query: | - subscribe to with cte as (select i as key, count() from test_31_multishards_stream where _tp_time > earliest_ts() group by key settings group_by_two_level_threshold=50) select count() from cte settings checkpoint_interval=2, emit_during_backfill=false; + subscribe to with cte as (select i as key, count() from test_31_multishards_stream where _tp_time > earliest_ts() group by key settings group_by_two_level_threshold=50) select count() from cte settings checkpoint_interval=2; - client: python query_type: table @@ -207,7 +207,7 @@ tests: depends_on_stream: test_31_multishards_stream wait: 1 query: | - subscribe to with cte as (select i as 
key, count() from changelog(test_31_multishards_stream, i) where _tp_time > earliest_ts() group by key emit changelog settings group_by_two_level_threshold=50) select count() from cte settings checkpoint_interval=2, emit_during_backfill=false; + subscribe to with cte as (select i as key, count() from changelog(test_31_multishards_stream, i) where _tp_time > earliest_ts() group by key emit changelog settings group_by_two_level_threshold=50) select count() from cte settings checkpoint_interval=2; - client: python query_type: table diff --git a/tests/stream/test_stream_smoke/0099_fixed_issues.json b/tests/stream/test_stream_smoke/0099_fixed_issues.json index 1422c6d81a7..3f298ec5413 100644 --- a/tests/stream/test_stream_smoke/0099_fixed_issues.json +++ b/tests/stream/test_stream_smoke/0099_fixed_issues.json @@ -640,7 +640,7 @@ {"client":"python", "query_type": "table", "query": "drop stream if exists v_12183487"}, {"client":"python", "query_type": "table", "wait":2, "query": "create stream v_12183487(id int, val int) primary key id settings shards=3;"}, {"client":"python", "query_type": "stream", "wait":2, "depends_on_stream":"v_12183487", "query_id":"12183487213", "query":"subscribe to select sum_distinct_streaming(val), sum(val), count_distinct(val), count(val) from v_12183487 settings checkpoint_interval=1;"}, - {"client":"python", "query_type": "table", "depends_on_stream": "v_12183487", "kill":"12183487213", "kill_wait":2, "wait": 3, "query": "insert into v_12183487(id, val) values(3, 30);"} + {"client":"python", "query_type": "table", "depends_on": "12183487213", "kill":"12183487213", "kill_wait":3, "wait": 2, "query": "insert into v_12183487(id, val) values(3, 30);"} ] }, { From 22a14c5608330ef0bcb02e9b058328216f3b1815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lisen=20=E6=9D=A8?= Date: Sat, 3 Feb 2024 01:58:12 +0800 Subject: [PATCH 4/5] fix comments * remove compile aggregate functions for streaming query now * remove no_more_keys * remove overflow_rows * move out refactor code for retract impl --- cmake/autogenerated_versions.txt | 2 +- src/Common/HashMapsTemplate.h | 4 +- src/Common/HashTable/TimeBucketHashMap.h | 4 +- src/Common/HashTable/TimeBucketHashTable.h | 42 +- src/Common/HashTable/TwoLevelHashMap.h | 4 +- src/Common/HashTable/TwoLevelHashTable.h | 37 +- src/Common/HashTable/TwoLevelStringHashMap.h | 4 +- .../HashTable/TwoLevelStringHashTable.h | 50 +- src/Interpreters/InterpreterSelectQuery.cpp | 3 +- src/Interpreters/Streaming/AggregateDataEx.h | 124 -- .../Streaming/AggregationUtils.cpp | 10 +- src/Interpreters/Streaming/Aggregator.cpp | 1848 +++++------------ src/Interpreters/Streaming/Aggregator.h | 202 +- .../Streaming/UpdatesTrackingData.h | 105 + .../Streaming/AggregatingHelper.cpp | 107 +- .../Transforms/Streaming/AggregatingHelper.h | 11 +- .../Streaming/AggregatingTransform.cpp | 21 +- .../AggregatingTransformWithSubstream.cpp | 12 +- .../Streaming/GlobalAggregatingTransform.cpp | 84 +- .../Streaming/GlobalAggregatingTransform.h | 2 - ...lobalAggregatingTransformWithSubstream.cpp | 58 +- .../GlobalAggregatingTransformWithSubstream.h | 2 - 22 files changed, 917 insertions(+), 1819 deletions(-) delete mode 100644 src/Interpreters/Streaming/AggregateDataEx.h create mode 100644 src/Interpreters/Streaming/UpdatesTrackingData.h diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 2f61abb85dc..29ccf0cc41c 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,7 +2,7 @@ # NOTE: has nothing common with 
DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 3) +SET(VERSION_REVISION 2) SET(VERSION_MAJOR 1) SET(VERSION_MINOR 4) SET(VERSION_PATCH 1) diff --git a/src/Common/HashMapsTemplate.h b/src/Common/HashMapsTemplate.h index 53df5ecd69f..09e1a031935 100644 --- a/src/Common/HashMapsTemplate.h +++ b/src/Common/HashMapsTemplate.h @@ -70,14 +70,14 @@ template void serializeTwoLevelHashMap(const Map & map, MappedSerializer && mapped_serializer, WriteBuffer & wb) { serializeHashMap(map, std::move(mapped_serializer), wb); - map.writeBucketUpdatedFlags(wb); + map.writeUpdatedBuckets(wb); } template void deserializeTwoLevelHashMap(Map & map, MappedDeserializer && mapped_deserializer, Arena & pool, ReadBuffer & rb) { deserializeHashMap(map, std::move(mapped_deserializer), pool, rb); - map.readBucketUpdatedFlags(rb); /// recover buckets updated status + map.readUpdatedBuckets(rb); /// recover buckets updated status } /// HashMapsTemplate is a taken from HashJoin class and make it standalone diff --git a/src/Common/HashTable/TimeBucketHashMap.h b/src/Common/HashTable/TimeBucketHashMap.h index 685ede30af4..827c396f8ef 100644 --- a/src/Common/HashTable/TimeBucketHashMap.h +++ b/src/Common/HashTable/TimeBucketHashMap.h @@ -38,11 +38,11 @@ class TimeBucketHashMapTable { for (auto & p : this->impls) { - if (this->isUpdatedBucket(p.first)) + if (this->isBucketUpdated(p.first)) { p.second.forEachValue(func); if (reset_updated) - this->resetUpdated(p.first); + this->resetUpdatedBucket(p.first); } } } diff --git a/src/Common/HashTable/TimeBucketHashTable.h b/src/Common/HashTable/TimeBucketHashTable.h index 9bff2271aa3..023a10ba9de 100644 --- a/src/Common/HashTable/TimeBucketHashTable.h +++ b/src/Common/HashTable/TimeBucketHashTable.h @@ -110,7 +110,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty /// FIXME, choose a better perf data structure /// Usually we don't have too many time buckets std::map impls; - std::unordered_map bucket_updated_flags; + std::unordered_map updated_buckets; Impl sentinel; TimeBucketHashTable() { } @@ -265,7 +265,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty { auto window = windowKey(key_holder); impls[window].emplace(key_holder, it, inserted, hash_value); - bucket_updated_flags[window] = true; /// updated + updated_buckets[window] = true; /// updated } LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) @@ -292,7 +292,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty { DB::writeIntBinary(p.first); p.second.write(wb); - DB::writeBoolText(bucket_updated_flags[p.first], wb); + DB::writeBinary(updated_buckets[p.first], wb); } } @@ -317,7 +317,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty DB::writeChar('<', wb); p.second.writeText(wb); DB::writeChar(',', wb); - DB::writeBoolText(bucket_updated_flags[p.first], wb); + DB::writeBoolText(updated_buckets[p.first], wb); DB::writeChar('>', wb); } DB::writeChar(END_BUCKET_MARKER, wb); @@ -336,7 +336,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty assert(key != 0); assert(!impls.contains(key)); impls[key].read(rb); - DB::readBoolText(bucket_updated_flags[key], rb); + DB::readBinary(updated_buckets[key], rb); } } @@ -363,7 +363,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty DB::assertChar('<', rb); impls[key].readText(rb); DB::assertChar(',', 
rb); - DB::readBoolText(bucket_updated_flags[key], rb); + DB::readBoolText(updated_buckets[key], rb); DB::assertChar('>', rb); } DB::assertChar(END_BUCKET_MARKER, rb); @@ -417,7 +417,7 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty last_removed_watermark = it->first; ++removed; - bucket_updated_flags.erase(it->first); + updated_buckets.erase(it->first); it = impls.erase(it); } else @@ -455,44 +455,44 @@ class TimeBucketHashTable : private boost::noncopyable, protected Hash /// empty return buckets; } - bool isUpdatedBucket(Int64 bucket_) const + bool isBucketUpdated(Int64 bucket_) const { - auto it = bucket_updated_flags.find(bucket_); - if (it != bucket_updated_flags.end()) + auto it = updated_buckets.find(bucket_); + if (it != updated_buckets.end()) return it->second; return false; } - void resetUpdated(Int64 bucket_) + void resetUpdatedBucket(Int64 bucket_) { - auto it = bucket_updated_flags.find(bucket_); - if (it != bucket_updated_flags.end()) + auto it = updated_buckets.find(bucket_); + if (it != updated_buckets.end()) it->second = false; } - void writeBucketUpdatedFlags(DB::WriteBuffer & wb) const + void writeUpdatedBuckets(DB::WriteBuffer & wb) const { - DB::writeVarUInt(bucket_updated_flags.size(), wb); - for (const auto & [bucket, updated] : bucket_updated_flags) + DB::writeVarUInt(updated_buckets.size(), wb); + for (const auto & [bucket, updated] : updated_buckets) { DB::writeIntBinary(bucket, wb); - DB::writeBoolText(updated, wb); + DB::writeBinary(updated, wb); } } - void readBucketUpdatedFlags(DB::ReadBuffer & rb) + void readUpdatedBuckets(DB::ReadBuffer & rb) { size_t size = 0; DB::readVarUInt(size, rb); - bucket_updated_flags.clear(); + updated_buckets.clear(); Int64 bucket = 0; bool updated = false; for (size_t i = 0; i < size; ++i) { DB::readIntBinary(bucket, rb); - DB::readBoolText(updated, rb); - bucket_updated_flags.emplace(bucket, updated); + DB::readBinary(updated, rb); + updated_buckets.emplace(bucket, updated); } } }; diff --git a/src/Common/HashTable/TwoLevelHashMap.h b/src/Common/HashTable/TwoLevelHashMap.h index 5c87d5e6eb0..26008468974 100644 --- a/src/Common/HashTable/TwoLevelHashMap.h +++ b/src/Common/HashTable/TwoLevelHashMap.h @@ -43,11 +43,11 @@ class TwoLevelHashMapTable : public TwoLevelHashTableNUM_BUCKETS; ++i) { - if (this->isUpdatedBucket(i)) + if (this->isBucketUpdated(i)) { this->impls[i].forEachValue(func); if (reset_updated) - this->resetUpdated(i); + this->resetUpdatedBucket(i); } } } diff --git a/src/Common/HashTable/TwoLevelHashTable.h b/src/Common/HashTable/TwoLevelHashTable.h index 4dd13e6e7e4..46d9e3ad637 100644 --- a/src/Common/HashTable/TwoLevelHashTable.h +++ b/src/Common/HashTable/TwoLevelHashTable.h @@ -90,7 +90,7 @@ class TwoLevelHashTable : using ConstLookupResult = typename Impl::ConstLookupResult; Impl impls[NUM_BUCKETS]; - bool bucket_updated_flags[NUM_BUCKETS] = {false}; + bool updated_buckets[NUM_BUCKETS] = {false}; TwoLevelHashTable() = default; @@ -120,7 +120,7 @@ class TwoLevelHashTable : size_t hash_value = cell->getHash(src); size_t buck = getBucketFromHash(hash_value); impls[buck].insertUniqueNonZero(cell, hash_value); - bucket_updated_flags[buck] = true; + updated_buckets[buck] = true; } } @@ -273,7 +273,7 @@ class TwoLevelHashTable : { size_t buck = getBucketFromHash(hash_value); impls[buck].emplace(key_holder, it, inserted, hash_value); - bucket_updated_flags[buck] = true; + updated_buckets[buck] = true; } LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) @@ -297,7 +297,7 @@ 
class TwoLevelHashTable : for (UInt32 i = 0; i < NUM_BUCKETS; ++i) { impls[i].write(wb); - DB::writeBoolText(bucket_updated_flags[i], wb); + DB::writeBinary(updated_buckets[i], wb); } } @@ -307,11 +307,12 @@ class TwoLevelHashTable : { if (i != 0) DB::writeChar(',', wb); + /// DB::writeChar('<', wb); impls[i].writeText(wb); DB::writeChar(',', wb); - DB::writeBoolText(bucket_updated_flags[i], wb); + DB::writeBoolText(updated_buckets[i], wb); DB::writeChar('>', wb); } } @@ -321,7 +322,7 @@ class TwoLevelHashTable : for (UInt32 i = 0; i < NUM_BUCKETS; ++i) { impls[i].read(rb); - DB::readBoolText(bucket_updated_flags[i], rb); + DB::readBinary(updated_buckets[i], rb); } } @@ -331,12 +332,12 @@ class TwoLevelHashTable : { if (i != 0) DB::assertChar(',', rb); - + /// DB::assertChar('<', rb); impls[i].readText(rb); DB::assertChar(',', rb); - DB::readBoolText(bucket_updated_flags[i], rb); + DB::readBoolText(updated_buckets[i], rb); DB::assertChar('>', rb); } } @@ -386,30 +387,30 @@ class TwoLevelHashTable : return bucket_ids; } - bool isUpdatedBucket(Int64 bucket_) const + bool isBucketUpdated(Int64 bucket_) const { - return bucket_updated_flags[bucket_]; + return updated_buckets[bucket_]; } - void resetUpdated(Int64 bucket_) + void resetUpdatedBucket(Int64 bucket_) { - bucket_updated_flags[bucket_] = false; + updated_buckets[bucket_] = false; } - void writeBucketUpdatedFlags(DB::WriteBuffer & wb) const + void writeUpdatedBuckets(DB::WriteBuffer & wb) const { DB::writeVarUInt(NUM_BUCKETS, wb); - for (const auto & elem : bucket_updated_flags) - DB::writeBoolText(elem, wb); + for (const auto & elem : updated_buckets) + DB::writeBinary(elem, wb); } - void readBucketUpdatedFlags(DB::ReadBuffer & rb) + void readUpdatedBuckets(DB::ReadBuffer & rb) { size_t size = 0; DB::readVarUInt(size, rb); assert(size == NUM_BUCKETS); - for (auto & elem : bucket_updated_flags) - DB::readBoolText(elem, rb); + for (auto & elem : updated_buckets) + DB::readBinary(elem, rb); } /// proton : ends }; diff --git a/src/Common/HashTable/TwoLevelStringHashMap.h b/src/Common/HashTable/TwoLevelStringHashMap.h index 9f2c5ba00d3..3501861a3ee 100644 --- a/src/Common/HashTable/TwoLevelStringHashMap.h +++ b/src/Common/HashTable/TwoLevelStringHashMap.h @@ -34,11 +34,11 @@ class TwoLevelStringHashMap : public TwoLevelStringHashTableNUM_BUCKETS; ++i) { - if (this->isUpdatedBucket(i)) + if (this->isBucketUpdated(i)) { this->impls[i].forEachValue(func); if (reset_updated) - this->resetUpdated(i); + this->resetUpdatedBucket(i); } } } diff --git a/src/Common/HashTable/TwoLevelStringHashTable.h b/src/Common/HashTable/TwoLevelStringHashTable.h index e74ae676143..e1a3910ecf4 100644 --- a/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/src/Common/HashTable/TwoLevelStringHashTable.h @@ -39,7 +39,7 @@ class TwoLevelStringHashTable : private boost::noncopyable using ConstLookupResult = typename Impl::ConstLookupResult; Impl impls[NUM_BUCKETS]; - bool bucket_updated_flags[NUM_BUCKETS] = {false}; + bool updated_buckets[NUM_BUCKETS] = {false}; TwoLevelStringHashTable() {} @@ -54,28 +54,28 @@ class TwoLevelStringHashTable : private boost::noncopyable size_t hash_value = v.getHash(src.m1); size_t buck = getBucketFromHash(hash_value); impls[buck].m1.insertUniqueNonZero(&v, hash_value); - bucket_updated_flags[buck] = true; + updated_buckets[buck] = true; } for (auto & v : src.m2) { size_t hash_value = v.getHash(src.m2); size_t buck = getBucketFromHash(hash_value); impls[buck].m2.insertUniqueNonZero(&v, hash_value); - bucket_updated_flags[buck] = true; 
+ updated_buckets[buck] = true; } for (auto & v : src.m3) { size_t hash_value = v.getHash(src.m3); size_t buck = getBucketFromHash(hash_value); impls[buck].m3.insertUniqueNonZero(&v, hash_value); - bucket_updated_flags[buck] = true; + updated_buckets[buck] = true; } for (auto & v : src.ms) { size_t hash_value = v.getHash(src.ms); size_t buck = getBucketFromHash(hash_value); impls[buck].ms.insertUniqueNonZero(&v, hash_value); - bucket_updated_flags[buck] = true; + updated_buckets[buck] = true; } } @@ -90,7 +90,7 @@ class TwoLevelStringHashTable : private boost::noncopyable if (sz == 0) { if constexpr (std::is_same_v) - self.bucket_updated_flags[0] = true; + self.updated_buckets[0] = true; keyHolderDiscardKey(key_holder); return func(self.impls[0].m0, VoidKey{}, 0); @@ -103,7 +103,7 @@ class TwoLevelStringHashTable : private boost::noncopyable auto res = hash(x); auto buck = getBucketFromHash(res); if constexpr (std::is_same_v) - self.bucket_updated_flags[buck] = true; + self.updated_buckets[buck] = true; return func(self.impls[buck].ms, std::forward(key_holder), res); @@ -138,7 +138,7 @@ class TwoLevelStringHashTable : private boost::noncopyable auto res = hash(k8); auto buck = getBucketFromHash(res); if constexpr (std::is_same_v) - self.bucket_updated_flags[buck] = true; + self.updated_buckets[buck] = true; keyHolderDiscardKey(key_holder); return func(self.impls[buck].m1, k8, res); @@ -152,7 +152,7 @@ class TwoLevelStringHashTable : private boost::noncopyable auto res = hash(k16); auto buck = getBucketFromHash(res); if constexpr (std::is_same_v) - self.bucket_updated_flags[buck] = true; + self.updated_buckets[buck] = true; keyHolderDiscardKey(key_holder); return func(self.impls[buck].m2, k16, res); @@ -166,7 +166,7 @@ class TwoLevelStringHashTable : private boost::noncopyable auto res = hash(k24); auto buck = getBucketFromHash(res); if constexpr (std::is_same_v) - self.bucket_updated_flags[buck] = true; + self.updated_buckets[buck] = true; keyHolderDiscardKey(key_holder); return func(self.impls[buck].m3, k24, res); @@ -176,7 +176,7 @@ class TwoLevelStringHashTable : private boost::noncopyable auto res = hash(x); auto buck = getBucketFromHash(res); if constexpr (std::is_same_v) - self.bucket_updated_flags[buck] = true; + self.updated_buckets[buck] = true; return func(self.impls[buck].ms, std::forward(key_holder), res); } @@ -204,7 +204,7 @@ class TwoLevelStringHashTable : private boost::noncopyable for (UInt32 i = 0; i < NUM_BUCKETS; ++i) { impls[i].write(wb); - DB::writeBoolText(bucket_updated_flags[i], wb); + DB::writeBinary(updated_buckets[i], wb); } } @@ -218,7 +218,7 @@ class TwoLevelStringHashTable : private boost::noncopyable DB::writeChar('<', wb); impls[i].writeText(wb); DB::writeChar(',', wb); - DB::writeBoolText(bucket_updated_flags[i], wb); + DB::writeBoolText(updated_buckets[i], wb); DB::writeChar('>', wb); } } @@ -228,7 +228,7 @@ class TwoLevelStringHashTable : private boost::noncopyable for (UInt32 i = 0; i < NUM_BUCKETS; ++i) { impls[i].read(rb); - DB::readBoolText(bucket_updated_flags[i], rb); + DB::readBinary(updated_buckets[i], rb); } } @@ -243,7 +243,7 @@ class TwoLevelStringHashTable : private boost::noncopyable DB::assertChar('<', rb); impls[i].readText(rb); DB::assertChar(',', rb); - DB::readBoolText(bucket_updated_flags[i], rb); + DB::readBoolText(updated_buckets[i], rb); DB::assertChar('>', rb); } } @@ -293,29 +293,29 @@ class TwoLevelStringHashTable : private boost::noncopyable return bucket_ids; } - bool isUpdatedBucket(Int64 bucket_) const + bool 
isBucketUpdated(Int64 bucket_) const { - return bucket_updated_flags[bucket_]; + return updated_buckets[bucket_]; } - void resetUpdated(Int64 bucket_) + void resetUpdatedBucket(Int64 bucket_) { - bucket_updated_flags[bucket_] = false; + updated_buckets[bucket_] = false; } - void writeBucketUpdatedFlags(DB::WriteBuffer & wb) const + void writeUpdatedBuckets(DB::WriteBuffer & wb) const { DB::writeVarUInt(NUM_BUCKETS, wb); - for (const auto & elem : bucket_updated_flags) - DB::writeBoolText(elem, wb); + for (const auto & elem : updated_buckets) + DB::writeBinary(elem, wb); } - void readBucketUpdatedFlags(DB::ReadBuffer & rb) + void readUpdatedBuckets(DB::ReadBuffer & rb) { size_t size = 0; DB::readVarUInt(size, rb); assert(size == NUM_BUCKETS); - for (auto & elem : bucket_updated_flags) - DB::readBoolText(elem, rb); + for (auto & elem : updated_buckets) + DB::readBinary(elem, rb); } }; diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 66d99a2c0fb..ae40014e4ba 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -3261,8 +3261,7 @@ void InterpreterSelectQuery::executeStreamingAggregation( streaming_group_by, delta_col_pos, window_keys_num, - query_info.streaming_window_params, - data_stream_semantic_pair.isChangelogOutput()); + query_info.streaming_window_params); auto merge_threads = max_streams; auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads diff --git a/src/Interpreters/Streaming/AggregateDataEx.h b/src/Interpreters/Streaming/AggregateDataEx.h deleted file mode 100644 index 2b969018a7d..00000000000 --- a/src/Interpreters/Streaming/AggregateDataEx.h +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ -using AggregateDataPtr = char *; -using ConstAggregateDataPtr = const char *; - -namespace Streaming -{ -SERDE struct UpdatedDataEx -{ - static ALWAYS_INLINE UpdatedDataEx & data(AggregateDataPtr __restrict place) { return *reinterpret_cast(place); } - static ALWAYS_INLINE const UpdatedDataEx & data(ConstAggregateDataPtr __restrict place) { return *reinterpret_cast(place); } - - static ALWAYS_INLINE bool isEmpty(ConstAggregateDataPtr __restrict place) { return data(place).final_count == 0; } - static ALWAYS_INLINE bool isUpdated(ConstAggregateDataPtr __restrict place) { return data(place).updated_since_last_finalization; } - static ALWAYS_INLINE void setUpdated(AggregateDataPtr __restrict place) { data(place).updated_since_last_finalization = true; } - static ALWAYS_INLINE void resetUpdated(AggregateDataPtr __restrict place) { data(place).updated_since_last_finalization = false; } - - static void addBatch(size_t row_begin, size_t row_end, AggregateDataPtr * places, const IColumn * delta_col) - { - if (delta_col == nullptr) - { - for (size_t i = row_begin; i < row_end; ++i) - if (places[i]) - data(places[i]).add(); - } - else - { - const auto & delta_flags = assert_cast(*delta_col).getData(); - for (size_t i = row_begin; i < row_end; ++i) - { - if (places[i]) - { - if (delta_flags[i] >= 0) - data(places[i]).add(); - else - data(places[i]).negate(); - } - } - } - } - - static void addBatchSinglePlace(size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn * delta_col) - { - if (!place) - return; - - auto & metadata = data(place); - if (delta_col == nullptr) - metadata.final_count += row_end - row_begin; - else - { - const auto & delta_flags = 
assert_cast(*delta_col).getData(); - metadata.final_count = std::accumulate(delta_flags.begin(), delta_flags.end(), metadata.final_count); - } - - metadata.updated_since_last_finalization = true; - } - - static void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & wb) - { - const auto & data_ex = data(place); - writeIntBinary(data_ex.final_count, wb); - writeBoolText(data_ex.updated_since_last_finalization, wb); - } - - static void deserialize(AggregateDataPtr __restrict place, ReadBuffer & rb) - { - auto & data_ex = data(place); - readIntBinary(data_ex.final_count, rb); - readBoolText(data_ex.updated_since_last_finalization, rb); - } - - ALWAYS_INLINE void add() - { - ++final_count; - updated_since_last_finalization = true; - } - - ALWAYS_INLINE void negate() - { - --final_count; - updated_since_last_finalization = true; - } - - /// Used for tracking the group is empty or not - UInt32 final_count = 0; - - /// Used for tracking the group is updated or not - bool updated_since_last_finalization = true; -}; - -SERDE struct RetractedDataEx : UpdatedDataEx -{ - static ALWAYS_INLINE AggregateDataPtr & getRetracted(AggregateDataPtr & place) { return reinterpret_cast(place)->retracted_data; } - static ALWAYS_INLINE bool hasRetracted(ConstAggregateDataPtr __restrict place) { return reinterpret_cast(place)->retracted_data; } - - template - static ALWAYS_INLINE AggregateDataPtr & getData(AggregateDataPtr & place) - { - if constexpr (use_retracted_data) - return getRetracted(place); - else - return place; - } - - /// Used for tracking group changes - AggregateDataPtr retracted_data = nullptr; -}; - -enum class ExpandedDataType : uint8_t -{ - None = 0, - Updated = 1, /// Allow tracking group is empty or updated - UpdatedWithRetracted = 2, /// Allow tracking group is empty or updated and changes -}; - -} -} diff --git a/src/Interpreters/Streaming/AggregationUtils.cpp b/src/Interpreters/Streaming/AggregationUtils.cpp index b40851b65e6..6f79b641e01 100644 --- a/src/Interpreters/Streaming/AggregationUtils.cpp +++ b/src/Interpreters/Streaming/AggregationUtils.cpp @@ -39,6 +39,12 @@ OutputBlockColumns prepareOutputBlockColumns( /// The ColumnAggregateFunction column captures the shared ownership of the arena with the aggregate function states. ColumnAggregateFunction & column_aggregate_func = assert_cast(*aggregate_columns[i]); + /// proton: starts + column_aggregate_func.setKeepState(params.keep_state); + /// proton: ends + + /// Add arenas to ColumnAggregateFunction, which can result in moving ownership to it if reference count + /// get dropped in other places for (auto & pool : aggregates_pools) column_aggregate_func.addArena(pool); @@ -52,10 +58,10 @@ OutputBlockColumns prepareOutputBlockColumns( if (aggregate_functions[i]->isState()) { - auto callback = [&](IColumn & subcolumn) + auto callback = [&](IColumn & column) { /// The ColumnAggregateFunction column captures the shared ownership of the arena with aggregate function states. 
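The UpdatedDataEx/RetractedDataEx structs deleted above are superseded elsewhere in this patch by TrackingUpdates, whose definition is not shown in these hunks. As a rough, illustrative sketch only (names and field widths are made up), the idea is a small header placed in front of the aggregate function states: a live-row counter that a negative delta column can decrement, plus a flag recording whether the group changed since it was last finalized.

#include <cstdint>

/// Hypothetical, simplified stand-in for the per-group tracking header that is
/// prefixed before the aggregate function states; the real type is TrackingUpdates.
struct TrackingHeaderSketch
{
    /// Number of live rows in the group; a retraction (delta < 0) decrements it.
    std::uint32_t count = 0;
    /// Set on every add/negate, cleared when the group is finalized.
    bool updated = true;

    void add()    { ++count; updated = true; }
    void negate() { --count; updated = true; }
    bool empty() const { return count == 0; }
};

With such a header, "is the group empty" and "did the group change since the last emit" can be answered without touching any aggregate function state.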
- if (auto * column_aggregate_func = typeid_cast(&subcolumn)) + if (auto * column_aggregate_func = typeid_cast(&column)) for (auto & pool : aggregates_pools) column_aggregate_func->addArena(pool); }; diff --git a/src/Interpreters/Streaming/Aggregator.cpp b/src/Interpreters/Streaming/Aggregator.cpp index f1937f482d7..d82dc8b1f8e 100644 --- a/src/Interpreters/Streaming/Aggregator.cpp +++ b/src/Interpreters/Streaming/Aggregator.cpp @@ -80,9 +80,6 @@ inline void initDataVariants( result.keys_size = params.keys_size; result.key_sizes = key_sizes; result.init(method_chosen); - - if (params.tracking_changes) - result.resetRetractedPool(); } Columns materializeKeyColumns(Columns & columns, ColumnRawPtrs & key_columns, const Aggregator::Params & params, bool is_low_cardinality) @@ -108,27 +105,11 @@ Columns materializeKeyColumns(Columns & columns, ColumnRawPtrs & key_columns, co return materialized_columns; } -Arena * getArena(AggregatedDataVariants & variants, AggregateStateType type) -{ - if (type == AggregateStateType::OnlyRetracted) - return variants.retracted_pool.get(); - else - return variants.aggregates_pool; -} - template -BlocksList concurrentBucketConvert(ThreadPool * thread_pool, const std::vector & buckets, Arena * arena, Arenas & pools, BucketConverter && bucket_converter) +BlocksList convertBucketsInParallel(ThreadPool * thread_pool, const std::vector & buckets, BucketConverter && bucket_converter) { std::atomic next_bucket_idx_to_merge = 0; - auto converter = [&](Arena * pool, ThreadGroupStatusPtr thread_group, const std::atomic_flag * cancelled) { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); - - if (thread_group) - CurrentThread::attachToIfDetached(thread_group); - + auto converter = [&](Arena * pool, const std::atomic_flag * cancelled) { BlocksList blocks; while (true) { @@ -147,9 +128,14 @@ BlocksList concurrentBucketConvert(ThreadPool * thread_pool, const std::vectorgetMaxThreads(), buckets.size()) : 1; if (num_threads <= 1) - return converter(arena, nullptr, nullptr); + { + auto arena = std::make_shared(); + return converter(arena.get(), nullptr); + } /// Process in parallel + Arenas pools; + pools.reserve(num_threads); for (size_t i = pools.size(); i < num_threads; ++i) pools.push_back(std::make_shared()); @@ -161,9 +147,13 @@ BlocksList concurrentBucketConvert(ThreadPool * thread_pool, const std::vectorscheduleOrThrowOnError([&pools, thread_id, group = CurrentThread::getGroup(), results, &converter, &cancelled] { - (*results)[thread_id] = converter(pools[thread_id].get(), group, &cancelled); + CurrentThread::attachToIfDetached(group); + SCOPE_EXIT_SAFE( CurrentThread::detachQueryIfNotDetached() ); + (*results)[thread_id] = converter(pools[thread_id].get(), &cancelled); }); + } thread_pool->wait(); } @@ -212,8 +202,7 @@ void AggregatedDataVariants::reset() invalidate(); /// Reset pool - resetAggregatesPool(); - retracted_pool.reset(); + resetAndCreateAggregatesPools(); } void AggregatedDataVariants::convertToTwoLevel() @@ -241,12 +230,12 @@ void AggregatedDataVariants::convertToTwoLevel() void AggregatedDataVariants::serialize(WriteBuffer & wb, const Aggregator & aggregator_) const { + /// We cannot use itself `aggregator` since if there is no data, it is nullptr. 
aggregator_.checkpoint(*this, wb); } void AggregatedDataVariants::deserialize(ReadBuffer & rb, const Aggregator & aggregator_) { - aggregator = &aggregator_; aggregator_.recover(*this, rb); } @@ -375,6 +364,9 @@ void Aggregator::Params::explain(JSONBuilder::JSONMap & map) const Aggregator::Aggregator(const Params & params_) : params(params_), log(&Poco::Logger::get("StreamingAggregator")) { + if (params.overflow_row) [[unlikely]] + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); + /// Use query-level memory tracker if (auto * memory_tracker_child = CurrentThread::getMemoryTracker()) if (auto * memory_tracker = memory_tracker_child->getParent()) @@ -389,21 +381,14 @@ Aggregator::Aggregator(const Params & params_) : params(params_), log(&Poco::Lo total_size_of_aggregate_states = 0; all_aggregates_has_trivial_destructor = true; - if (params.tracking_changes) - { - total_size_of_aggregate_states = sizeof(RetractedDataEx); - align_aggregate_states = alignof(RetractedDataEx); - expanded_data_type = ExpandedDataType::UpdatedWithRetracted; - } - else if (params.tracking_updated) + if (trackingUpdatesType() == TrackingUpdatesType::Updates) { - total_size_of_aggregate_states = sizeof(UpdatedDataEx); - align_aggregate_states = alignof(UpdatedDataEx); - expanded_data_type = ExpandedDataType::Updated; + total_size_of_aggregate_states = sizeof(TrackingUpdates); + align_aggregate_states = alignof(TrackingUpdates); } // aggregate_states will be aligned as below: - // |<-- [ExpandedDataEx] -->||<-- state_1 -->|<-- pad_1 -->|<-- state_2 -->|<-- pad_2 -->| ..... + // |<-- [UpdatesTrackingData] -->||<-- state_1 -->|<-- pad_1 -->|<-- state_2 -->|<-- pad_2 -->| ..... // // pad_N will be used to match alignment requirement for each next state. 
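// A worked example of this layout with made-up sizes (the real values come from each
// function's sizeOfData()/alignOfData()): a tracking header of 8 bytes, state_1 with
// sizeOfData() = 12 and alignOfData() = 8, and state_2 with sizeOfData() = 4 and
// alignOfData() = 4 give
//   offset(state_1) = 8                        (right after the header, already 8-aligned)
//   offset(state_2) = align(8 + 12, 4) = 20    (pad_1 = 0)
//   total_size_of_aggregate_states = 20 + 4 = 24
// where align(x, a) = (x + a - 1) / a * a rounds x up to the next multiple of a.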
// The address of state_1 is aligned based on maximum alignment requirements in states @@ -438,12 +423,13 @@ Aggregator::Aggregator(const Params & params_) : params(params_), log(&Poco::Lo aggregation_state_cache = AggregatedDataVariants::createCache(method_chosen, cache_settings); #if USE_EMBEDDED_COMPILER - compileAggregateFunctionsIfNeeded(); + /// TODO: Support compile aggregate functions + // compileAggregateFunctionsIfNeeded(); #endif } #if USE_EMBEDDED_COMPILER - +/* void Aggregator::compileAggregateFunctionsIfNeeded() { static std::unordered_map aggregate_functions_description_to_count; @@ -518,7 +504,7 @@ void Aggregator::compileAggregateFunctionsIfNeeded() } } } - +*/ #endif AggregatedDataVariants::Type Aggregator::chooseAggregationMethod() @@ -770,46 +756,18 @@ AggregatedDataVariants::Type Aggregator::chooseAggregationMethodTimeBucketTwoLev } /// proton: ends -template -void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const +void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data, bool prefix_with_updates_tracking_state) const { - /// Initialize reserved UpdatedDataEx + /// Initialize reserved TrackingUpdates assert(aggregate_data); - if constexpr (!skip_expanded_data) - { - if (expanded_data_type == ExpandedDataType::UpdatedWithRetracted) - new (aggregate_data) RetractedDataEx(); - else if (expanded_data_type == ExpandedDataType::Updated) - new (aggregate_data) UpdatedDataEx(); - } - - if constexpr (use_compiled_functions) + if (prefix_with_updates_tracking_state) { - assert(compiled_aggregate_functions_holder); - const auto & compiled_aggregate_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; - compiled_aggregate_functions.create_aggregate_states_function(aggregate_data); - -#if defined(MEMORY_SANITIZER) - - /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place. - for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index) - { - if (!is_aggregate_function_compiled[aggregate_function_index]) - continue; - - auto aggregate_data_with_offset = aggregate_data + offsets_of_aggregate_states[aggregate_function_index]; - auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData(); - __msan_unpoison(aggregate_data_with_offset, data_size); - } -#endif + if (trackingUpdatesType() == TrackingUpdatesType::Updates) + new (aggregate_data) TrackingUpdates(); } for (size_t j = 0; j < params.aggregates_size; ++j) { - if constexpr (use_compiled_functions) - if (is_aggregate_function_compiled[j]) - continue; - try { /** An exception may occur if there is a shortage of memory. @@ -821,13 +779,7 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const catch (...) 
{ for (size_t rollback_j = 0; rollback_j < j; ++rollback_j) - { - if constexpr (use_compiled_functions) - if (is_aggregate_function_compiled[j]) - continue; - aggregate_functions[rollback_j]->destroy(aggregate_data + offsets_of_aggregate_states[rollback_j]); - } throw; } @@ -839,13 +791,11 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const size_t row_begin, size_t row_end, ColumnRawPtrs & key_columns, - AggregateFunctionInstruction * aggregate_instructions, - bool no_more_keys, - AggregateDataPtr overflow_row) const + AggregateFunctionInstruction * aggregate_instructions) const { #define M(NAME, IS_TWO_LEVEL) \ else if (result.type == AggregatedDataVariants::Type::NAME) \ - return executeImpl(*result.NAME, result.aggregates_pool, row_begin, row_end, key_columns, aggregate_instructions, no_more_keys, overflow_row); + return executeImplBatch(*result.NAME, result.aggregates_pool, row_begin, row_end, key_columns, aggregate_instructions); if (false) {} // NOLINT APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) @@ -859,48 +809,19 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const * Inline does not make sense, since the inner loop is entirely inside this function. */ template -[[nodiscard]] bool NO_INLINE Aggregator::executeImpl( +[[nodiscard]] bool NO_INLINE Aggregator::executeImplBatch( Method & method, Arena * aggregates_pool, size_t row_begin, size_t row_end, ColumnRawPtrs & key_columns, - AggregateFunctionInstruction * aggregate_instructions, - bool no_more_keys, - AggregateDataPtr overflow_row) const + AggregateFunctionInstruction * aggregate_instructions) const { typename Method::State state(key_columns, key_sizes, aggregation_state_cache); - assert(!no_more_keys); - -#if USE_EMBEDDED_COMPILER - /// TODO: So far not support compiled functions with expanded data - if (compiled_aggregate_functions_holder && !hasExpandedData()) - { - return executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row); - } - else -#endif - { - return executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, overflow_row); - } -} -template -[[nodiscard]] bool NO_INLINE Aggregator::executeImplBatch( - Method & method, - typename Method::State & state, - Arena * aggregates_pool, - size_t row_begin, - size_t row_end, - AggregateFunctionInstruction * aggregate_instructions, - AggregateDataPtr overflow_row) const -{ /// Optimization for special case when there are no aggregate functions. - if (params.aggregates_size == 0 && !hasExpandedData()) + if (params.aggregates_size == 0 && !needTrackUpdates()) { - if constexpr (no_more_keys) - return false; - /// For all rows. AggregateDataPtr place = aggregates_pool->alloc(0); for (size_t i = row_begin; i < row_end; ++i) @@ -911,7 +832,7 @@ template bool need_finalization = false; /// Optimization for special case when aggregating by 8bit key. - if constexpr (!no_more_keys && std::is_same_v) + if constexpr (std::is_same_v) { /// We use another method if there are aggregate functions with -Array combinator. 
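The generic path of executeImplBatch that follows resolves one aggregate state per input row: the key is emplaced into the hash table, states are created lazily the first time a key is seen, and the resulting place is remembered in `places` so each aggregate function can then be applied batch-wise. A minimal sketch of that emplace-or-reuse pattern, with the hash table and arena simplified to a std::unordered_map and plain new (stand-ins only, not the real types):

#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

using AggregateDataPtr = char *;

/// Simplified sketch of the per-row state resolution in executeImplBatch.
std::vector<AggregateDataPtr> resolvePlaces(
    const std::vector<std::string> & keys,
    std::unordered_map<std::string, AggregateDataPtr> & table,
    std::size_t state_size)
{
    std::vector<AggregateDataPtr> places;
    places.reserve(keys.size());

    for (const auto & key : keys)
    {
        auto [it, inserted] = table.try_emplace(key, nullptr);
        if (inserted)
            it->second = new char[state_size]();   /// createAggregateStates() on an arena in the real code

        places.push_back(it->second);              /// one place per row, reused for repeated keys
    }
    return places;
}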
bool has_arrays = false; @@ -924,7 +845,7 @@ template } } - if (!has_arrays && !hasExpandedData()) + if (!has_arrays && !needTrackUpdates()) { for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) { @@ -936,7 +857,7 @@ template [&](AggregateDataPtr & aggregate_data) { auto data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(data); + createAggregateStates(data, /*prefix_with_updates_tracking_state*/ false); aggregate_data = data; }, state.getKeyData(), @@ -968,7 +889,6 @@ template { AggregateDataPtr aggregate_data = nullptr; - assert(!no_more_keys); auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. @@ -978,7 +898,7 @@ template emplace_result.setMapped(nullptr); aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); + createAggregateStates(aggregate_data); emplace_result.setMapped(aggregate_data); } @@ -989,37 +909,9 @@ template places[i] = aggregate_data; } -#if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - { - std::vector columns_data; - - for (size_t i = 0; i < aggregate_functions.size(); ++i) - { - if (!is_aggregate_function_compiled[i]) - continue; - - AggregateFunctionInstruction * inst = aggregate_instructions + i; - size_t arguments_size = inst->that->getArgumentTypes().size(); - - for (size_t argument_index = 0; argument_index < arguments_size; ++argument_index) - columns_data.emplace_back(getColumnData(inst->batch_arguments[argument_index])); - } - - auto add_into_aggregate_states_function = compiled_aggregate_functions_holder->compiled_aggregate_functions.add_into_aggregate_states_function; - add_into_aggregate_states_function(row_begin, row_end, columns_data.data(), places.get()); - } -#endif - /// Add values to the aggregate functions. for (size_t i = 0; i < aggregate_functions.size(); ++i) { -#if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - if (is_aggregate_function_compiled[i]) - continue; -#endif - AggregateFunctionInstruction * inst = aggregate_instructions + i; if (inst->offsets) @@ -1043,13 +935,12 @@ template } } - if (hasExpandedData()) - UpdatedDataEx::addBatch(row_begin, row_end, places.get(), aggregate_instructions ? aggregate_instructions->delta_column : nullptr); + if (needTrackUpdates()) + TrackingUpdates::addBatch(row_begin, row_end, places.get(), aggregate_instructions ? 
aggregate_instructions->delta_column : nullptr); return need_finalization; } -template [[nodiscard]] bool NO_INLINE Aggregator::executeWithoutKeyImpl( AggregatedDataWithoutKey & res, size_t row_begin, @@ -1057,55 +948,12 @@ template AggregateFunctionInstruction * aggregate_instructions, Arena * arena) const { -#if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - { - std::vector columns_data; - - for (size_t i = 0; i < aggregate_functions.size(); ++i) - { - if (!is_aggregate_function_compiled[i]) - continue; - - AggregateFunctionInstruction * inst = aggregate_instructions + i; - size_t arguments_size = inst->that->getArgumentTypes().size(); - - for (size_t argument_index = 0; argument_index < arguments_size; ++argument_index) - { - columns_data.emplace_back(getColumnData(inst->batch_arguments[argument_index])); - } - } - - auto add_into_aggregate_states_function_single_place = compiled_aggregate_functions_holder->compiled_aggregate_functions.add_into_aggregate_states_function_single_place; - add_into_aggregate_states_function_single_place(row_begin, row_end, columns_data.data(), res); - -#if defined(MEMORY_SANITIZER) - - /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place. - for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index) - { - if (!is_aggregate_function_compiled[aggregate_function_index]) - continue; - - auto aggregate_data_with_offset = res + offsets_of_aggregate_states[aggregate_function_index]; - auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData(); - __msan_unpoison(aggregate_data_with_offset, data_size); - } -#endif - } -#endif - /// Adding values bool should_finalize = false; for (size_t i = 0; i < aggregate_functions.size(); ++i) { AggregateFunctionInstruction * inst = aggregate_instructions + i; -#if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - if (is_aggregate_function_compiled[i]) - continue; -#endif if (inst->offsets) inst->batch_that->addBatchSinglePlace( inst->offsets[static_cast(row_begin) - 1], @@ -1134,8 +982,8 @@ template } } - if (hasExpandedData()) - UpdatedDataEx::addBatchSinglePlace(row_begin, row_end, res, aggregate_instructions ? aggregate_instructions->delta_column : nullptr); + if (needTrackUpdates()) + TrackingUpdates::addBatchSinglePlace(row_begin, row_end, res, aggregate_instructions ? 
aggregate_instructions->delta_column : nullptr); return should_finalize; } @@ -1201,8 +1049,7 @@ std::pair Aggregator::executeOnBlock( const Block & block, AggregatedDataVariants & result, ColumnRawPtrs & key_columns, - AggregateColumns & aggregate_columns, - bool & no_more_keys) const + AggregateColumns & aggregate_columns) const { return executeOnBlock( block.getColumns(), @@ -1210,8 +1057,7 @@ std::pair Aggregator::executeOnBlock( block.rows(), result, key_columns, - aggregate_columns, - no_more_keys); + aggregate_columns); } /// return {should_abort, need_finalization} @@ -1221,8 +1067,7 @@ std::pair Aggregator::executeOnBlock( size_t row_end, AggregatedDataVariants & result, ColumnRawPtrs & key_columns, - AggregateColumns & aggregate_columns, - bool & no_more_keys) const + AggregateColumns & aggregate_columns) const { std::pair return_result = {false, false}; auto & need_abort = return_result.first; @@ -1255,33 +1100,17 @@ std::pair Aggregator::executeOnBlock( AggregateFunctionInstructions aggregate_functions_instructions; prepareAggregateInstructions(columns, aggregate_columns, materialized_columns, aggregate_functions_instructions, nested_columns_holder); - initStatesForWithoutKeyOrOverflow(result); + initStatesForWithoutKey(result); /// We select one of the aggregation methods and call it. /// For the case when there are no keys (all aggregate into one row). if (result.type == AggregatedDataVariants::Type::without_key) - { - /// TODO: So far not support compiled functions with expanded data -#if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder && !hasExpandedData()) - { - need_finalization = executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); - } - else -#endif - { - need_finalization = executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); - } - } + need_finalization = executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); else - { - /// This is where data is written that does not fit in `max_rows_to_group_by` with `group_by_overflow_mode = any`. - AggregateDataPtr overflow_row_ptr = params.overflow_row ? 
result.without_key : nullptr; - need_finalization = executeImpl(result, row_begin, row_end, key_columns, aggregate_functions_instructions.data(), no_more_keys, overflow_row_ptr); - } + need_finalization = executeImpl(result, row_begin, row_end, key_columns, aggregate_functions_instructions.data()); - need_abort = checkAndProcessResult(result, no_more_keys); + need_abort = checkAndProcessResult(result); return return_result; } @@ -1315,7 +1144,7 @@ void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants, co data_variants.init(data_variants.type); data_variants.aggregates_pools = Arenas(1, std::make_shared()); data_variants.aggregates_pool = data_variants.aggregates_pools.back().get(); - initStatesForWithoutKeyOrOverflow(data_variants); + initStatesForWithoutKey(data_variants); block_out.flush(); compressed_buf.next(); @@ -1367,11 +1196,11 @@ Block Aggregator::convertOneBucketToBlockImpl( bool final, bool clear_states, Int64 bucket, - AggregateStateType type) const + ConvertType type) const { Block block = convertToBlockImpl(method, method.data.impls[bucket], arena, data_variants.aggregates_pools, final, method.data.impls[bucket].size(), clear_states, type); block.info.bucket_num = static_cast(bucket); - method.data.resetUpdated(bucket); /// finalized + method.data.resetUpdatedBucket(bucket); /// finalized return block; } @@ -1402,13 +1231,6 @@ void Aggregator::writeToTemporaryFileImpl( update_max_sizes(block); } - if (params.overflow_row) - { - Block block = prepareBlockAndFillWithoutKey(data_variants, false, true, false); - out.write(block); - update_max_sizes(block); - } - /// Pass ownership of the aggregate functions states: /// `data_variants` will not destroy them in the destructor, they are now owned by ColumnAggregateFunction objects. 
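convertOneBucketToBlockImpl above clears a bucket's updated flag once the bucket has been finalized; together with ConvertType::OnlyUpdates further down, this is what lets a periodic emit touch only buckets that actually received new data. A minimal sketch of that emit loop, assuming a simplified table that exposes buckets(), isBucketUpdated() and resetUpdatedBucket() like the two-level tables changed by this patch:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

/// Minimal per-bucket interface assumed from the two-level hash tables in this patch.
struct BucketTableSketch
{
    std::vector<std::int64_t> bucket_ids;
    std::vector<bool> updated;

    const std::vector<std::int64_t> & buckets() const { return bucket_ids; }
    bool isBucketUpdated(std::int64_t b) const { return updated[static_cast<std::size_t>(b)]; }
    void resetUpdatedBucket(std::int64_t b) { updated[static_cast<std::size_t>(b)] = false; }
};

/// Emit only buckets that changed since the previous emit, then clear their flags.
void emitUpdatedBuckets(BucketTableSketch & table, const std::function<void(std::int64_t)> & emit_bucket)
{
    for (auto bucket : table.buckets())
    {
        if (!table.isBucketUpdated(bucket))
            continue;                      /// unchanged bucket: nothing new to emit

        emit_bucket(bucket);               /// convertOneBucketToBlockImpl(...) in the real code
        table.resetUpdatedBucket(bucket);  /// finalized; start tracking the next round
    }
}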
data_variants.aggregator = nullptr; @@ -1417,9 +1239,9 @@ void Aggregator::writeToTemporaryFileImpl( } -bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const +bool Aggregator::checkLimits(size_t result_size) const { - if (!no_more_keys && params.max_rows_to_group_by && result_size > params.max_rows_to_group_by) + if (params.max_rows_to_group_by && result_size > params.max_rows_to_group_by) { switch (params.group_by_overflow_mode) { @@ -1432,8 +1254,7 @@ bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const return false; case OverflowMode::ANY: - no_more_keys = true; - break; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Streaming aggregation doesn't support 'OverflowMode::ANY'"); } } @@ -1446,7 +1267,7 @@ bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const template Block Aggregator::convertToBlockImpl( - Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows, bool clear_states, AggregateStateType type) const + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows, bool clear_states, ConvertType type) const { if (data.empty()) { @@ -1458,34 +1279,17 @@ Block Aggregator::convertToBlockImpl( if (final) { -#if USE_EMBEDDED_COMPILER - /// TODO: So far not support compiled functions with expanded data - if (compiled_aggregate_functions_holder && !hasExpandedData()) - { - static constexpr bool use_compiled_functions = !Method::low_cardinality_optimization; - assert(type == AggregateStateType::Normal); - res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); - } - else -#endif - { - if (type == AggregateStateType::OnlyUpdated) - res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); - else if (type == AggregateStateType::OnlyRetracted) - res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); - else - res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states); - } + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows, clear_states, type); } else { - assert(type == AggregateStateType::Normal); + assert(type == ConvertType::Normal); res = convertToBlockImplNotFinal(method, data, aggregates_pools, rows); } /// In order to release memory early. /// proton: starts. For streaming aggr, we hold on to the states - if (clear_states && type == AggregateStateType::Normal) + if (clear_states) data.clearAndShrink(); /// proton: ends @@ -1570,7 +1374,6 @@ inline void Aggregator::insertAggregatesIntoColumns( std::rethrow_exception(exception); } -template Block Aggregator::insertResultsIntoColumns(PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena, bool clear_states) const { std::exception_ptr exception; @@ -1578,40 +1381,8 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray & pl try { - if constexpr (use_compiled_functions) - { - /** For JIT compiled functions we need to resize columns before pass them into compiled code. - * insert_aggregates_into_columns_function function does not throw exception. 
- */ - std::vector columns_data; - - auto compiled_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; - - for (size_t i = 0; i < params.aggregates_size; ++i) - { - if (!is_aggregate_function_compiled[i]) - continue; - - auto & final_aggregate_column = out_cols.final_aggregate_columns[i]; - final_aggregate_column = final_aggregate_column->cloneResized(places.size()); - columns_data.emplace_back(getColumnData(final_aggregate_column.get())); - } - - auto insert_aggregates_into_columns_function = compiled_functions.insert_aggregates_into_columns_function; - insert_aggregates_into_columns_function(0, places.size(), columns_data.data(), places.data()); - } - for (; aggregate_functions_destroy_index < params.aggregates_size;) { - if constexpr (use_compiled_functions) - { - if (is_aggregate_function_compiled[aggregate_functions_destroy_index]) - { - ++aggregate_functions_destroy_index; - continue; - } - } - auto & final_aggregate_column = out_cols.final_aggregate_columns[aggregate_functions_destroy_index]; size_t offset = offsets_of_aggregate_states[aggregate_functions_destroy_index]; @@ -1659,15 +1430,6 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray & pl for (; aggregate_functions_destroy_index < params.aggregates_size; ++aggregate_functions_destroy_index) { - if constexpr (use_compiled_functions) - { - if (is_aggregate_function_compiled[aggregate_functions_destroy_index]) - { - ++aggregate_functions_destroy_index; - continue; - } - } - bool is_state = aggregate_functions[aggregate_functions_destroy_index]->isState(); bool destroy_place_after_insert = !is_state && clear_states; if (destroy_place_after_insert) @@ -1698,9 +1460,9 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray & pl return finalizeBlock(params, getHeader(/* final */ true), std::move(out_cols), /* final */ true, places.size()); } -template +template Block NO_INLINE Aggregator::convertToBlockImplFinal( - Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows, bool clear_states) const + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows, bool clear_states, ConvertType type) const { constexpr bool final = true; auto out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, rows); @@ -1709,7 +1471,7 @@ Block NO_INLINE Aggregator::convertToBlockImplFinal( { if (data.hasNullKeyData()) { - assert(type == AggregateStateType::Normal); + assert(type == ConvertType::Normal); out_cols.key_columns[0]->insertDefault(); insertAggregatesIntoColumns(data.getNullKeyData(), out_cols.final_aggregate_columns, arena, clear_states); } @@ -1721,27 +1483,25 @@ Block NO_INLINE Aggregator::convertToBlockImplFinal( PaddedPODArray places; places.reserve(rows); - constexpr bool only_updated = (type == AggregateStateType::OnlyUpdated); - constexpr bool only_retracted = (type == AggregateStateType::OnlyRetracted); + bool only_updates = (type == ConvertType::OnlyUpdates); data.forEachValue([&](const auto & key, auto & mapped) { - if constexpr (only_updated) + /// Ingore invalid mapped, there are two cases: + /// 1) mapped was destroyed (it's a bug) + /// 2) no mapped states for retracted data (means it's an new group key, but no retracted data) + if (!mapped) + return; + + if (only_updates) { - if (!UpdatedDataEx::isUpdated(mapped)) + if (!TrackingUpdates::updated(mapped)) return; /// Finalized it for current coverting - UpdatedDataEx::resetUpdated(mapped); - } - else if constexpr (only_retracted) - 
{ - if (!RetractedDataEx::hasRetracted(mapped)) - return; + TrackingUpdates::resetUpdated(mapped); } - auto & place = RetractedDataEx::getData(mapped); - /// For UDA with own emit strategy, there are two special cases to be handled: /// 1. not all groups need to be emitted. therefore proton needs to pick groups /// that should emits, and only emit those groups while keep other groups unchanged. @@ -1754,7 +1514,7 @@ Block NO_INLINE Aggregator::convertToBlockImplFinal( if (params.group_by == Params::GroupBy::USER_DEFINED) { assert(aggregate_functions.size() == 1); - emit_times = aggregate_functions[0]->getEmitTimes(place + offsets_of_aggregate_states[0]); + emit_times = aggregate_functions[0]->getEmitTimes(mapped + offsets_of_aggregate_states[0]); } if (emit_times > 0) @@ -1763,17 +1523,17 @@ Block NO_INLINE Aggregator::convertToBlockImplFinal( for (size_t i = 0; i < emit_times; i++) method.insertKeyIntoColumns(key, out_cols.raw_key_columns, key_sizes_ref); - places.emplace_back(place); + places.emplace_back(mapped); /// Mark the cell as destroyed so it will not be destroyed in destructor. /// proton: starts. Here we push the `place` to `places`, for streaming /// case, we don't want aggregate function to destroy the places if (clear_states) - place = nullptr; + mapped = nullptr; } }); - return insertResultsIntoColumns(places, std::move(out_cols), arena, clear_states); + return insertResultsIntoColumns(places, std::move(out_cols), arena, clear_states); } template @@ -1840,30 +1600,32 @@ void Aggregator::addArenasToAggregateColumns( } } -Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows, bool clear_states, AggregateStateType type) const +Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool clear_states, ConvertType type) const { + /// proton: starts. + if (!data_variants.without_key) + { + data_variants.invalidate(); + return {}; + } + /// proton: ends. + auto res_header = getHeader(final); size_t rows = 1; auto && out_cols = prepareOutputBlockColumns(params, aggregate_functions, res_header, data_variants.aggregates_pools, final, rows); auto && [key_columns, raw_key_columns, aggregate_columns, final_aggregate_columns, aggregate_columns_data] = out_cols; - /// TODO: support overflow row ? - assert(!is_overflows); - assert(!params.overflow_row); assert(data_variants.type == AggregatedDataVariants::Type::without_key); - if ((type == AggregateStateType::OnlyUpdated && !UpdatedDataEx::isUpdated(data_variants.without_key)) - || (type == AggregateStateType::OnlyRetracted && !RetractedDataEx::hasRetracted(data_variants.without_key))) + if (type == ConvertType::OnlyUpdates && !TrackingUpdates::updated(data_variants.without_key)) return res_header.cloneEmpty(); AggregatedDataWithoutKey & data = [&]() -> AggregateDataPtr & { - if (type == AggregateStateType::OnlyUpdated) + if (type == ConvertType::OnlyUpdates) { - UpdatedDataEx::resetUpdated( data_variants.without_key); + TrackingUpdates::resetUpdated(data_variants.without_key); return data_variants.without_key; } - else if (type == AggregateStateType::OnlyRetracted) - return RetractedDataEx::getRetracted(data_variants.without_key); else return data_variants.without_key; }(); @@ -1880,23 +1642,18 @@ Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_va else { /// Always single-thread. It's safe to pass current arena from 'aggregates_pool'. 
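The same filtering happens per group during final conversion: with ConvertType::OnlyUpdates, groups whose tracking state reports no change since the last finalization are skipped, and every emitted group has its flag reset (the TrackingUpdates::updated / resetUpdated calls above). A condensed sketch, with a hypothetical one-flag stand-in for the tracking header:

#include <vector>

using AggregateDataPtr = char *;

/// Hypothetical stand-in for the tracking header consulted by TrackingUpdates::updated().
struct TrackedStateSketch
{
    bool updated = true;
};

/// Collect the places to finalize; with only_updates set, untouched groups are skipped
/// and emitted groups have their flag reset, mirroring convertToBlockImplFinal.
std::vector<AggregateDataPtr> collectPlaces(std::vector<TrackedStateSketch> & groups, bool only_updates)
{
    std::vector<AggregateDataPtr> places;
    for (auto & group : groups)
    {
        if (only_updates)
        {
            if (!group.updated)
                continue;           /// no change since the last emit
            group.updated = false;  /// finalized by this conversion
        }
        places.push_back(reinterpret_cast<AggregateDataPtr>(&group));
    }
    return places;
}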
- insertAggregatesIntoColumns(data, final_aggregate_columns, getArena(data_variants, type), clear_states); + insertAggregatesIntoColumns(data, final_aggregate_columns, data_variants.aggregates_pool, clear_states); } - Block block = finalizeBlock(params, res_header, std::move(out_cols), final, rows); - - if (is_overflows) - block.info.is_overflows = true; - - return block; + return finalizeBlock(params, res_header, std::move(out_cols), final, rows); } -Block Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, AggregateStateType type) const +Block Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, ConvertType type) const { const size_t rows = data_variants.sizeWithoutOverflowRow(); #define M(NAME) \ else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ - return convertToBlockImpl(*data_variants.NAME, data_variants.NAME->data, getArena(data_variants, type), data_variants.aggregates_pools, final, rows, clear_states, type); + return convertToBlockImpl(*data_variants.NAME, data_variants.NAME->data, data_variants.aggregates_pool, data_variants.aggregates_pools, final, rows, clear_states, type); if (false) {} // NOLINT APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) @@ -1904,13 +1661,13 @@ Block Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_v else throw Exception(ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT, "Unknown aggregated data variant."); } -BlocksList Aggregator::prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads, AggregateStateType type) const +BlocksList Aggregator::prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads, ConvertType type) const { /// TODO Make a custom threshold. /// TODO Use the shared thread pool with the `merge` function. 
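This conversion is only parallelized for large final results; the fan-out itself is done by convertBucketsInParallel introduced earlier in this file, where workers pull the next bucket index from a shared atomic counter and each worker converts into its own arena. A stripped-down sketch of that scheme, using std::thread in place of the ThreadPool and omitting the per-worker arenas:

#include <atomic>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

/// Stripped-down sketch of convertBucketsInParallel: workers keep claiming the next
/// bucket index from a shared atomic counter until all buckets are converted.
void convertBucketsSketch(std::size_t num_buckets, std::size_t num_threads, const std::function<void(std::size_t)> & convert_one)
{
    std::atomic<std::size_t> next_bucket{0};

    auto worker = [&]
    {
        while (true)
        {
            std::size_t bucket = next_bucket.fetch_add(1);
            if (bucket >= num_buckets)
                return;
            convert_one(bucket);   /// e.g. convertOneBucketToBlockImpl with a per-worker arena
        }
    };

    std::vector<std::thread> threads;
    for (std::size_t i = 0; i < num_threads; ++i)
        threads.emplace_back(worker);
    for (auto & t : threads)
        t.join();
}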
std::unique_ptr thread_pool; if (max_threads > 1 && data_variants.sizeWithoutOverflowRow() > 100000 - && final && type == AggregateStateType::Normal) /// use single thread for non-final or retracted data or updated data + && final && type == ConvertType::Normal) /// use single thread for non-final or retracted data or updated data thread_pool = std::make_unique(max_threads); if (false) {} // NOLINT @@ -1931,23 +1688,18 @@ BlocksList Aggregator::prepareBlocksAndFillTwoLevelImpl( bool final, bool clear_states, ThreadPool * thread_pool, - AggregateStateType type) const + ConvertType type) const { - return concurrentBucketConvert( - thread_pool, - method.data.buckets(), - getArena(data_variants, type), - data_variants.aggregates_pools, - [&](Int64 bucket, Arena * arena) -> BlocksList { - /// Skip no changed bucket if only updated is requested - if (type == AggregateStateType::OnlyUpdated && !method.data.isUpdatedBucket(bucket)) - return {}; - - return {convertOneBucketToBlockImpl(data_variants, method, arena, final, clear_states, bucket, type)}; - }); + return convertBucketsInParallel(thread_pool, method.data.buckets(), [&](Int64 bucket, Arena * arena) -> BlocksList { + /// Skip no changed bucket if only updated is requested + if (type == ConvertType::OnlyUpdates && !method.data.isBucketUpdated(bucket)) + return {}; + + return {convertOneBucketToBlockImpl(data_variants, method, arena, final, clear_states, bucket, type)}; + }); } -BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const +BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, bool final, size_t max_threads) const { LOG_DEBUG(log, "Converting aggregated data to blocks"); @@ -1959,11 +1711,10 @@ BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, b if (data_variants.empty()) return blocks; - if (unlikely(params.overflow_row)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); + bool clear_states = final && !params.keep_state; if (data_variants.type == AggregatedDataVariants::Type::without_key) - blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, false, clear_states)); + blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, clear_states)); else if (!data_variants.isTwoLevel()) blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states)); else @@ -2035,7 +1786,7 @@ void NO_INLINE Aggregator::mergeDataNullKey( } -template +template void NO_INLINE Aggregator::mergeDataImpl( Table & table_dst, Table & table_src, @@ -2056,7 +1807,7 @@ void NO_INLINE Aggregator::mergeDataImpl( /// that from the 'src' to store the final aggregated result, it will cause the data from other AggregatedDataVariant will be merged multiple times and /// generate incorrect aggregated result. auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); + createAggregateStates(aggregate_data); dst = aggregate_data; } @@ -2094,35 +1845,20 @@ template void NO_INLINE Aggregator::mergeSingleLevelDataImpl(ManyAggregatedDataVariants & non_empty_data, bool clear_states) const { AggregatedDataVariantsPtr & res = non_empty_data[0]; - bool no_more_keys = false; /// We merge all aggregation results to the first. 
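The loop that follows folds every non-empty variant into the first one; for each key, mergeDataImpl (above) creates a fresh destination state when the key is new, rather than adopting the source state, for the reason given in its comment, and then merges the source state into it. A reduced sketch of that per-key merge, with the hash table replaced by a std::unordered_map and callbacks standing in for createAggregateStates and the per-aggregate-function merge loop:

#include <functional>
#include <string>
#include <unordered_map>

using AggregateDataPtr = char *;

/// Reduced sketch of the mergeDataImpl pattern: fold table_src into table_dst.
void mergeTables(
    std::unordered_map<std::string, AggregateDataPtr> & table_dst,
    std::unordered_map<std::string, AggregateDataPtr> & table_src,
    const std::function<AggregateDataPtr()> & create_state,
    const std::function<void(AggregateDataPtr, AggregateDataPtr)> & merge_state)
{
    for (auto & [key, src_state] : table_src)
    {
        auto [it, inserted] = table_dst.try_emplace(key, nullptr);
        if (inserted)
            it->second = create_state();    /// fresh destination state for a key only seen in src

        merge_state(it->second, src_state); /// combine src into dst for every aggregate function
    }
}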
for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) { - if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys)) + if (!checkLimits(res->sizeWithoutOverflowRow())) break; AggregatedDataVariants & current = *non_empty_data[result_num]; - assert(!no_more_keys); -#if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - mergeDataImpl( - getDataVariant(*res).data, - getDataVariant(current).data, - res->aggregates_pool, - clear_states); - } - else - #endif - { - mergeDataImpl( - getDataVariant(*res).data, - getDataVariant(current).data, - res->aggregates_pool, - clear_states); - } + mergeDataImpl( + getDataVariant(*res).data, + getDataVariant(current).data, + res->aggregates_pool, + clear_states); /// In order to release memory early. if (clear_states) @@ -2138,21 +1874,19 @@ void NO_INLINE Aggregator::mergeSingleLevelDataImpl(ManyAggregatedDataVariants & BlocksList -Aggregator::mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const +Aggregator::mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, size_t max_threads) const { auto prepared_data_ptr = prepareVariantsToMerge(data_variants); if (prepared_data_ptr->empty()) return {}; - if (unlikely(params.overflow_row)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - + bool clear_states = final && !params.keep_state; BlocksList blocks; auto & first = *prepared_data_ptr->at(0); if (first.type == AggregatedDataVariants::Type::without_key) { mergeWithoutKeyDataImpl(*prepared_data_ptr, clear_states); - blocks.emplace_back(prepareBlockAndFillWithoutKey(first, final, false, clear_states)); + blocks.emplace_back(prepareBlockAndFillWithoutKey(first, final, clear_states)); } else if (!first.isTwoLevel()) { @@ -2205,7 +1939,9 @@ BlocksList Aggregator::mergeAndConvertTwoLevelToBlocksImpl( std::vector buckets; if (first.isStaticBucketTwoLevel()) + { buckets = getDataVariant(first).data.buckets(); + } else { assert(first.isTimeBucketTwoLevel()); @@ -2218,11 +1954,10 @@ BlocksList Aggregator::mergeAndConvertTwoLevelToBlocksImpl( buckets.assign(buckets_set.begin(), buckets_set.end()); } - return concurrentBucketConvert( - thread_pool, buckets, first.aggregates_pool, first.aggregates_pools, [&](Int64 bucket, Arena * arena) -> BlocksList { - mergeBucketImpl(non_empty_data, bucket, arena, clear_states); - return {convertOneBucketToBlockImpl(first, getDataVariant(first), arena, final, clear_states, bucket)}; - }); + return convertBucketsInParallel(thread_pool, buckets, [&](Int64 bucket, Arena * arena) -> BlocksList { + mergeBucketImpl(non_empty_data, bucket, arena, clear_states); + return {convertOneBucketToBlockImpl(first, getDataVariant(first), arena, final, clear_states, bucket)}; + }); } template @@ -2237,27 +1972,14 @@ void NO_INLINE Aggregator::mergeBucketImpl( return; AggregatedDataVariants & current = *data[result_num]; -#if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - mergeDataImpl( - getDataVariant(*res).data.impls[bucket], - getDataVariant(current).data.impls[bucket], - arena, - clear_states); - } - else -#endif - { - mergeDataImpl( - getDataVariant(*res).data.impls[bucket], - getDataVariant(current).data.impls[bucket], - arena, - clear_states); - } + mergeDataImpl( + getDataVariant(*res).data.impls[bucket], + getDataVariant(current).data.impls[bucket], + arena, + clear_states); /// Assume the current 
bucket has been finalized. - getDataVariant(current).data.resetUpdated(bucket); + getDataVariant(current).data.resetUpdatedBucket(bucket); } } @@ -2286,7 +2008,7 @@ ManyAggregatedDataVariantsPtr Aggregator::prepareVariantsToMerge(ManyAggregatedD auto result_variants = std::make_shared(false); result_variants->aggregator = this; initDataVariants(*result_variants, method_chosen, key_sizes, params); - initStatesForWithoutKeyOrOverflow(*result_variants); + initStatesForWithoutKey(*result_variants); non_empty_data->insert(non_empty_data->begin(), result_variants); } @@ -2335,427 +2057,66 @@ ManyAggregatedDataVariantsPtr Aggregator::prepareVariantsToMerge(ManyAggregatedD return non_empty_data; } -template -void NO_INLINE Aggregator::mergeStreamsImplCase( - Block & block, - Arena * aggregates_pool, - Method & method [[maybe_unused]], - Table & data, - AggregateDataPtr overflow_row) const +template +void NO_INLINE Aggregator::convertBlockToTwoLevelImpl( + Method & method, + Arena * pool, + ColumnRawPtrs & key_columns, + const Block & source, + std::vector & destinations) const { - ColumnRawPtrs key_columns(params.keys_size); - AggregateColumnsConstData aggregate_columns(params.aggregates_size); - - /// Remember the columns we will work with - for (size_t i = 0; i < params.keys_size; ++i) - key_columns[i] = block.safeGetByPosition(i).column.get(); - - for (size_t i = 0; i < params.aggregates_size; ++i) - { - const auto & aggregate_column_name = params.aggregates[i].column_name; - aggregate_columns[i] = &typeid_cast(*block.getByName(aggregate_column_name).column).getData(); - } - typename Method::State state(key_columns, key_sizes, aggregation_state_cache); - /// For all rows. - size_t rows = block.rows(); - std::unique_ptr places(new AggregateDataPtr[rows]); + size_t rows = source.rows(); + size_t columns = source.columns(); + + /// Create a 'selector' that will contain bucket index for every row. It will be used to scatter rows to buckets. + IColumn::Selector selector(rows); + /// For every row. for (size_t i = 0; i < rows; ++i) { - AggregateDataPtr aggregate_data = nullptr; - - if (!no_more_keys) + if constexpr (Method::low_cardinality_optimization) { - auto emplace_result = state.emplaceKey(data, i, *aggregates_pool); - if (emplace_result.isInserted()) + if (state.isNullAt(i)) { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. - emplace_result.setMapped(nullptr); - - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); - - emplace_result.setMapped(aggregate_data); + selector[i] = 0; + continue; } - else - aggregate_data = emplace_result.getMapped(); - } - else - { - auto find_result = state.findKey(data, i, *aggregates_pool); - if (find_result.isFound()) - aggregate_data = find_result.getMapped(); } - /// aggregate_date == nullptr means that the new key did not fit in the hash table because of no_more_keys. + /// Calculate bucket number from row hash. + auto hash = state.getHash(method.data, i, *pool); + auto bucket = method.data.getBucketFromHash(hash); - AggregateDataPtr value = aggregate_data ? aggregate_data : overflow_row; - places[i] = value; + selector[i] = bucket; } - for (size_t j = 0; j < params.aggregates_size; ++j) + size_t num_buckets = destinations.size(); + + for (size_t column_idx = 0; column_idx < columns; ++column_idx) { - /// Merge state of aggregate functions. 
- aggregate_functions[j]->mergeBatch( - 0, rows, - places.get(), offsets_of_aggregate_states[j], - aggregate_columns[j]->data(), - aggregates_pool); - } + const ColumnWithTypeAndName & src_col = source.getByPosition(column_idx); + MutableColumns scattered_columns = src_col.column->scatter(num_buckets, selector); - /// Early release memory. - block.clear(); -} + for (size_t bucket = 0, size = num_buckets; bucket < size; ++bucket) + { + if (!scattered_columns[bucket]->empty()) + { + Block & dst = destinations[bucket]; + dst.info.bucket_num = static_cast(bucket); + dst.insert({std::move(scattered_columns[bucket]), src_col.type, src_col.name}); + } -template -void NO_INLINE Aggregator::mergeStreamsImpl( - Block & block, - Arena * aggregates_pool, - Method & method, - Table & data, - AggregateDataPtr overflow_row, - bool no_more_keys) const -{ - if (!no_more_keys) - mergeStreamsImplCase(block, aggregates_pool, method, data, overflow_row); - else - mergeStreamsImplCase(block, aggregates_pool, method, data, overflow_row); + /** Inserted columns of type ColumnAggregateFunction will own states of aggregate functions + * by holding shared_ptr to source column. See ColumnAggregateFunction.h + */ + } + } } -void NO_INLINE Aggregator::mergeWithoutKeyStreamsImpl( - Block & block, - AggregatedDataVariants & result) const -{ - AggregateColumnsConstData aggregate_columns(params.aggregates_size); - - /// Remember the columns we will work with - for (size_t i = 0; i < params.aggregates_size; ++i) - { - const auto & aggregate_column_name = params.aggregates[i].column_name; - aggregate_columns[i] = &typeid_cast(*block.getByName(aggregate_column_name).column).getData(); - } - - AggregatedDataWithoutKey & res = result.without_key; - if (!res) - { - AggregateDataPtr place = result.aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(place); - res = place; - } - - for (size_t row = 0, rows = block.rows(); row < rows; ++row) - { - /// Adding Values - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->merge(res + offsets_of_aggregate_states[i], (*aggregate_columns[i])[row], result.aggregates_pool); - } - - /// Early release memory. - block.clear(); -} - -bool Aggregator::mergeOnBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) const -{ - /// `result` will destroy the states of aggregate functions in the destructor - result.aggregator = this; - - /// How to perform the aggregation? 
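convertBlockToTwoLevelImpl, relocated above, splits a single-level block into one block per bucket: it derives a bucket index for every row from the key hash and then lets each column scatter itself according to that selector. A self-contained sketch of the same selector/scatter idea on plain vectors, with std::hash standing in for the real key hashing and getBucketFromHash:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

/// Sketch of the selector/scatter split: assign each row a bucket from its key hash,
/// then scatter rows into per-bucket vectors (the real code scatters whole IColumns).
std::vector<std::vector<std::uint64_t>> scatterToBuckets(const std::vector<std::uint64_t> & keys, std::size_t num_buckets)
{
    std::vector<std::size_t> selector(keys.size());
    for (std::size_t i = 0; i < keys.size(); ++i)
        selector[i] = std::hash<std::uint64_t>{}(keys[i]) % num_buckets;  /// getBucketFromHash in the real code

    std::vector<std::vector<std::uint64_t>> buckets(num_buckets);
    for (std::size_t i = 0; i < keys.size(); ++i)
        buckets[selector[i]].push_back(keys[i]);

    return buckets;
}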
- if (result.empty()) - { - result.init(method_chosen); - result.keys_size = params.keys_size; - result.key_sizes = key_sizes; - LOG_TRACE(log, "Aggregation method: {}", result.getMethodName()); - } - - if (result.type == AggregatedDataVariants::Type::without_key || block.info.is_overflows) - mergeWithoutKeyStreamsImpl(block, result); - -#define M(NAME, IS_TWO_LEVEL) \ - else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys); - - APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) -#undef M - else if (result.type != AggregatedDataVariants::Type::without_key) - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - - return checkAndProcessResult(result, no_more_keys); -} - - -void Aggregator::mergeBlocks(BucketToBlocks bucket_to_blocks, AggregatedDataVariants & result, size_t max_threads) -{ - if (bucket_to_blocks.empty()) - return; - - UInt64 total_input_rows = 0; - for (auto & bucket : bucket_to_blocks) - for (auto & block : bucket.second) - total_input_rows += block.rows(); - - /** `minus one` means the absence of information about the bucket - * - in the case of single-level aggregation, as well as for blocks with "overflowing" values. - * If there is at least one block with a bucket number greater or equal than zero, then there was a two-level aggregation. - */ - auto max_bucket = bucket_to_blocks.rbegin()->first; - bool has_two_level = max_bucket >= 0; - - if (has_two_level) - { - #define M(NAME) \ - if (method_chosen == AggregatedDataVariants::Type::NAME) \ - method_chosen = AggregatedDataVariants::Type::NAME ## _two_level; - - APPLY_FOR_VARIANTS_CONVERTIBLE_TO_STATIC_BUCKET_TWO_LEVEL(M) - - #undef M - } - - /// result will destroy the states of aggregate functions in the destructor - result.aggregator = this; - - result.init(method_chosen); - result.keys_size = params.keys_size; - result.key_sizes = key_sizes; - - bool has_blocks_with_unknown_bucket = bucket_to_blocks.contains(-1); - - /// First, parallel the merge for the individual buckets. Then we continue merge the data not allocated to the buckets. - if (has_two_level) - { - /** In this case, no_more_keys is not supported due to the fact that - * from different threads it is difficult to update the general state for "other" keys (overflows). - * That is, the keys in the end can be significantly larger than max_rows_to_group_by. - */ - - LOG_TRACE(log, "Merging partially aggregated two-level data."); - - auto merge_bucket = [&bucket_to_blocks, &result, this](size_t bucket, Arena * aggregates_pool, ThreadGroupStatusPtr thread_group) - { - SCOPE_EXIT_SAFE( - if (thread_group) - CurrentThread::detachQueryIfNotDetached(); - ); - if (thread_group) - CurrentThread::attachToIfDetached(thread_group); - - for (Block & block : bucket_to_blocks[static_cast(bucket)]) - { - #define M(NAME) \ - else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, aggregates_pool, *result.NAME, result.NAME->data.impls[bucket], nullptr, false); - - if (false) {} // NOLINT - APPLY_FOR_VARIANTS_ALL_TWO_LEVEL(M) - #undef M - else - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - } - }; - - std::unique_ptr thread_pool; - if (max_threads > 1 && total_input_rows > 100000) /// TODO Make a custom threshold. 
- thread_pool = std::make_unique(max_threads); - - for (const auto & bucket_blocks : bucket_to_blocks) - { - const auto bucket = bucket_blocks.first; - - if (bucket == -1) - continue; - - result.aggregates_pools.push_back(std::make_shared()); - Arena * aggregates_pool = result.aggregates_pools.back().get(); - - auto task = [group = CurrentThread::getGroup(), bucket, &merge_bucket, aggregates_pool]{ return merge_bucket(bucket, aggregates_pool, group); }; - - if (thread_pool) - thread_pool->scheduleOrThrowOnError(task); - else - task(); - } - - if (thread_pool) - thread_pool->wait(); - - LOG_TRACE(log, "Merged partially aggregated two-level data."); - } - - if (has_blocks_with_unknown_bucket) - { - LOG_TRACE(log, "Merging partially aggregated single-level data."); - - bool no_more_keys = false; - - BlocksList & blocks = bucket_to_blocks[-1]; - for (Block & block : blocks) - { - if (!checkLimits(result.sizeWithoutOverflowRow(), no_more_keys)) - break; - - if (result.type == AggregatedDataVariants::Type::without_key || block.info.is_overflows) - mergeWithoutKeyStreamsImpl(block, result); - - #define M(NAME, IS_TWO_LEVEL) \ - else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys); - - APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) - #undef M - else if (result.type != AggregatedDataVariants::Type::without_key) - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - } - - LOG_TRACE(log, "Merged partially aggregated single-level data."); - } -} - - -Block Aggregator::mergeBlocks(BlocksList & blocks, bool final, bool clear_states, bool only_updated) -{ - if (blocks.empty()) - return {}; - - auto bucket_num = blocks.front().info.bucket_num; - bool is_overflows = blocks.front().info.is_overflows; - - LOG_TRACE(log, "Merging partially aggregated blocks (bucket = {}).", bucket_num); - Stopwatch watch; - - /** If possible, change 'method' to some_hash64. Otherwise, leave as is. - * Better hash function is needed because during external aggregation, - * we may merge partitions of data with total number of keys far greater than 4 billion. - */ - auto merge_method = method_chosen; - -#define APPLY_FOR_VARIANTS_THAT_MAY_USE_BETTER_HASH_FUNCTION(M) \ - M(key64) \ - M(key_string) \ - M(key_fixed_string) \ - M(keys128) \ - M(keys256) \ - M(serialized) \ - -#define M(NAME) \ - if (merge_method == AggregatedDataVariants::Type::NAME) \ - merge_method = AggregatedDataVariants::Type::NAME ## _hash64; \ - - APPLY_FOR_VARIANTS_THAT_MAY_USE_BETTER_HASH_FUNCTION(M) -#undef M - -#undef APPLY_FOR_VARIANTS_THAT_MAY_USE_BETTER_HASH_FUNCTION - - /// Temporary data for aggregation. 
- AggregatedDataVariants result; - - /// result will destroy the states of aggregate functions in the destructor - result.aggregator = this; - - /// proton: starts - initDataVariants(result, method_chosen, key_sizes, params); - /// proton: ends - - for (Block & block : blocks) - { - if (bucket_num >= 0 && block.info.bucket_num != bucket_num) - bucket_num = -1; - - if (result.type == AggregatedDataVariants::Type::without_key || is_overflows) - mergeWithoutKeyStreamsImpl(block, result); - - #define M(NAME, IS_TWO_LEVEL) \ - else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, nullptr, false); - - APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) - #undef M - else if (result.type != AggregatedDataVariants::Type::without_key) - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - } - - Block block; - if (result.type == AggregatedDataVariants::Type::without_key || is_overflows) - block = prepareBlockAndFillWithoutKey(result, final, is_overflows, clear_states); - else - block = prepareBlockAndFillSingleLevel(result, final, clear_states); - /// NOTE: two-level data is not possible here - chooseAggregationMethod chooses only among single-level methods. - - size_t rows = block.rows(); - size_t bytes = block.bytes(); - double elapsed_seconds = watch.elapsedSeconds(); - LOG_DEBUG(log, "Merged partially aggregated blocks. {} rows, {}. in {} sec. ({:.3f} rows/sec., {}/sec.)", - rows, ReadableSize(bytes), - elapsed_seconds, rows / elapsed_seconds, - ReadableSize(bytes / elapsed_seconds)); - - block.info.bucket_num = bucket_num; - return block; -} - -template -void NO_INLINE Aggregator::convertBlockToTwoLevelImpl( - Method & method, - Arena * pool, - ColumnRawPtrs & key_columns, - const Block & source, - std::vector & destinations) const -{ - typename Method::State state(key_columns, key_sizes, aggregation_state_cache); - - size_t rows = source.rows(); - size_t columns = source.columns(); - - /// Create a 'selector' that will contain bucket index for every row. It will be used to scatter rows to buckets. - IColumn::Selector selector(rows); - - /// For every row. - for (size_t i = 0; i < rows; ++i) - { - if constexpr (Method::low_cardinality_optimization) - { - if (state.isNullAt(i)) - { - selector[i] = 0; - continue; - } - } - - /// Calculate bucket number from row hash. - auto hash = state.getHash(method.data, i, *pool); - auto bucket = method.data.getBucketFromHash(hash); - - selector[i] = bucket; - } - - size_t num_buckets = destinations.size(); - - for (size_t column_idx = 0; column_idx < columns; ++column_idx) - { - const ColumnWithTypeAndName & src_col = source.getByPosition(column_idx); - MutableColumns scattered_columns = src_col.column->scatter(num_buckets, selector); - - for (size_t bucket = 0, size = num_buckets; bucket < size; ++bucket) - { - if (!scattered_columns[bucket]->empty()) - { - Block & dst = destinations[bucket]; - dst.info.bucket_num = static_cast(bucket); - dst.insert({std::move(scattered_columns[bucket]), src_col.type, src_col.name}); - } - - /** Inserted columns of type ColumnAggregateFunction will own states of aggregate functions - * by holding shared_ptr to source column. 
See ColumnAggregateFunction.h - */ - } - } -} - - -std::vector Aggregator::convertBlockToTwoLevel(const Block & block) const +std::vector Aggregator::convertBlockToTwoLevel(const Block & block) const { if (!block) return {}; @@ -2842,7 +2203,7 @@ void Aggregator::destroyAllAggregateStates(AggregatedDataVariants & result) cons LOG_TRACE(log, "Destroying aggregate states"); /// In what data structure is the data aggregated? - if (result.type == AggregatedDataVariants::Type::without_key || params.overflow_row) + if (result.type == AggregatedDataVariants::Type::without_key) destroyWithoutKey(result); #define M(NAME, IS_TWO_LEVEL) \ @@ -2857,9 +2218,9 @@ void Aggregator::destroyAllAggregateStates(AggregatedDataVariants & result) cons } /// proton: starts. for streaming processing -void Aggregator::initStatesForWithoutKeyOrOverflow(AggregatedDataVariants & data_variants) const +void Aggregator::initStatesForWithoutKey(AggregatedDataVariants & data_variants) const { - if (!data_variants.without_key && (params.overflow_row || data_variants.type == AggregatedDataVariants::Type::without_key)) + if (!data_variants.without_key && data_variants.type == AggregatedDataVariants::Type::without_key) { AggregateDataPtr place = data_variants.aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); createAggregateStates(place); @@ -3017,7 +2378,7 @@ void Aggregator::doCheckpointLegacy(const AggregatedDataVariants & data_variants /// FIXME, set a good max_threads /// For ConvertAction::Checkpoint, don't clear state `data_variants` - auto blocks = convertToBlocks(const_cast(data_variants), false, false, 8); + auto blocks = convertToBlocks(const_cast(data_variants), false, 8); /// assert(!blocks.empty()); @@ -3120,7 +2481,7 @@ void Aggregator::recoverStatesWithoutKey(AggregatedDataVariants & data_variants, /// may have internal states as well assert(!data_variants.without_key); - initStatesForWithoutKeyOrOverflow(data_variants); + initStatesForWithoutKey(data_variants); AggregatedDataWithoutKey & data = data_variants.without_key; AggregateColumnsData aggregate_columns(params.aggregates_size); @@ -3349,9 +2710,7 @@ void Aggregator::doRecoverV2(AggregatedDataVariants & data_variants, ReadBuffer VersionType Aggregator::getVersionFromRevision(UInt64 revision) const { - if (revision >= STATE_V3_MIN_REVISION) - return static_cast(3); - else if (revision >= STATE_V2_MIN_REVISION) + if (revision >= STATE_V2_MIN_REVISION) return static_cast(2); else throw Exception( @@ -3403,22 +2762,11 @@ void NO_INLINE Aggregator::spliceBucketsImpl( auto & table_dest = getDataVariant(data_dest).data.impls; auto & table_src = getDataVariant(data_src).data.impls; -#if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - for (auto bucket : gcd_buckets) - mergeDataImpl(table_dest[0], table_src[bucket], arena, clear_states, zero_out_window_keys_func); - } - else -#endif - { - for (auto bucket : gcd_buckets) - mergeDataImpl(table_dest[0], table_src[bucket], arena, clear_states, zero_out_window_keys_func); - } + for (auto bucket : gcd_buckets) + mergeDataImpl(table_dest[0], table_src[bucket], arena, clear_states, zero_out_window_keys_func); } -Block Aggregator::spliceAndConvertBucketsToBlock( - AggregatedDataVariants & variants, bool final, bool clear_states, const std::vector & gcd_buckets) const +Block Aggregator::spliceAndConvertBucketsToBlock(AggregatedDataVariants & variants, bool final, const std::vector & gcd_buckets) const { assert(variants.isTimeBucketTwoLevel()); @@ -3431,11 +2779,12 @@ 
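// A minimal, self-contained sketch (illustrative names, not the proton API) of what "splicing"
// GCD buckets means here: for a hop window, several fine-grained time buckets are merged into a
// single window result before conversion, e.g. window [00:00, 00:04) spliced from gcd buckets
// [00:00, 00:02) and [00:02, 00:04). Plain integers stand in for aggregate function states.
#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>

using GroupCounts = std::unordered_map<std::string, int64_t>;

GroupCounts spliceBuckets(const std::map<int64_t, GroupCounts> & buckets, const std::vector<int64_t> & gcd_buckets)
{
    GroupCounts window_result;
    for (auto bucket : gcd_buckets)
    {
        auto it = buckets.find(bucket);
        if (it == buckets.end())
            continue;
        for (const auto & [key, partial] : it->second)
            window_result[key] += partial; /// stands in for IAggregateFunction::merge on real states
    }
    return window_result;
}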
Block Aggregator::spliceAndConvertBucketsToBlock( AggregatedDataVariants result_variants; \ result_variants.aggregator = this; \ initDataVariants(result_variants, method_chosen, key_sizes, params); \ - spliceBucketsImpl(result_variants, variants, gcd_buckets, result_variants.aggregates_pool, clear_states); \ + initStatesForWithoutKey(result_variants); \ + spliceBucketsImpl(result_variants, variants, gcd_buckets, result_variants.aggregates_pool, /*clear_states*/ false); \ return convertOneBucketToBlockImpl(result_variants, *result_variants.NAME, result_variants.aggregates_pool, final, /*clear_states*/ true, 0); \ } \ else \ - return convertOneBucketToBlockImpl(variants, *variants.NAME, variants.aggregates_pool, final, clear_states, gcd_buckets[0]); \ + return convertOneBucketToBlockImpl(variants, *variants.NAME, variants.aggregates_pool, final, /*clear_states*/ false, gcd_buckets[0]); \ } APPLY_FOR_VARIANTS_TIME_BUCKET_TWO_LEVEL(M) @@ -3446,8 +2795,7 @@ Block Aggregator::spliceAndConvertBucketsToBlock( UNREACHABLE(); } -Block Aggregator::mergeAndSpliceAndConvertBucketsToBlock( - ManyAggregatedDataVariants & variants, bool final, bool clear_states, const std::vector & gcd_buckets) const +Block Aggregator::mergeAndSpliceAndConvertBucketsToBlock(ManyAggregatedDataVariants & variants, bool final, const std::vector & gcd_buckets) const { bool need_splice = gcd_buckets.size() > 1; auto prepared_data = prepareVariantsToMerge(variants, /*always_merge_into_empty*/ need_splice); @@ -3464,14 +2812,14 @@ Block Aggregator::mergeAndSpliceAndConvertBucketsToBlock( { \ using Method = decltype(first.NAME)::element_type; \ for (auto bucket : gcd_buckets) \ - mergeBucketImpl(*prepared_data, bucket, arena, clear_states); \ + mergeBucketImpl(*prepared_data, bucket, arena, /*clear_states*/ false); \ if (need_splice) \ { \ spliceBucketsImpl(first, first, gcd_buckets, arena, /*clear_states*/ true); \ return convertOneBucketToBlockImpl(first, *first.NAME, arena, final, /*clear_states*/ true, 0); \ } \ else \ - return convertOneBucketToBlockImpl(first, *first.NAME, arena, final, clear_states, gcd_buckets[0]); \ + return convertOneBucketToBlockImpl(first, *first.NAME, arena, final, /*clear_states*/ false, gcd_buckets[0]); \ } APPLY_FOR_VARIANTS_TIME_BUCKET_TWO_LEVEL(M) @@ -3482,36 +2830,367 @@ Block Aggregator::mergeAndSpliceAndConvertBucketsToBlock( UNREACHABLE(); } -void Aggregator::mergeAggregateStates(AggregateDataPtr & dst, AggregateDataPtr & src, Arena * arena, bool clear_states) const +template +bool Aggregator::executeAndRetractImpl( + Method & method, + Arena * aggregates_pool, + Method & retracted_method, + Arena * retracted_pool, + size_t row_begin, + size_t row_end, + ColumnRawPtrs & key_columns, + AggregateFunctionInstruction * aggregate_instructions) const { - assert(src); - assert(dst); - - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->merge(dst + offsets_of_aggregate_states[i], src + offsets_of_aggregate_states[i], arena); - - if (clear_states) - destroyAggregateStates(src); -} + typename Method::State state(key_columns, key_sizes, aggregation_state_cache); + typename Method::State retracted_state(key_columns, key_sizes, nullptr); -void Aggregator::destroyAggregateStates(AggregateDataPtr & place) const -{ - if (place) + /// Optimization for special case when there are no aggregate functions. 
+    if (params.aggregates_size == 0)
     {
-        for (size_t i = 0; i < params.aggregates_size; ++i)
-            aggregate_functions[i]->destroy(place + offsets_of_aggregate_states[i]);
+        if (params.delta_col_pos >= 0)
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Changelog aggregating must have aggregate functions");

-        place = nullptr;
+        /// For all rows.
+        AggregateDataPtr place = aggregates_pool->alloc(0);
+        for (size_t i = row_begin; i < row_end; ++i)
+        {
+            auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool);
+            if (emplace_result.isInserted())
+            {
+                emplace_result.setMapped(place);
+                /// Only add new key
+                retracted_state.emplaceKey(retracted_method.data, i, *retracted_pool).setMapped(place);
+            }
+        }
+        return false;
     }
-}
-void Aggregator::serializeAggregateStates(const AggregateDataPtr & place, WriteBuffer & wb) const
-{
-    UInt8 has_states = place ? 1 : 0;
-    writeIntBinary(has_states, wb);
-    if (has_states)
-    {
-        for (size_t i = 0; i < params.aggregates_size; ++i)
+    bool need_finalization = false;
+
+    /// NOTE: only row_end-row_start is required, but:
+    /// - this affects only optimize_aggregation_in_order,
+    /// - this is just a pointer, so it should not be significant,
+    /// - and plus this will require other changes in the interface.
+    std::unique_ptr<AggregateDataPtr[]> places(new AggregateDataPtr[row_end]);
+
+    /// For all rows.
+    for (size_t i = row_begin; i < row_end; ++i)
+    {
+        AggregateDataPtr aggregate_data = nullptr;
+
+        auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool);
+
+        /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key.
+        if (emplace_result.isInserted())
+        {
+            /// exception-safety - if you can not allocate memory or create states, then destructors will not be called.
+            emplace_result.setMapped(nullptr);
+
+            aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+            /// TODO: support use_compiled_functions
+            createAggregateStates(aggregate_data);
+            emplace_result.setMapped(aggregate_data);
+
+            /// Save new group without retracted state (used for emitting new-key groups)
+            /// FIXME: There is a bug when using the key8 or key16 hash tables: they use an optimized FixedImplicitZeroHashMap where an empty mapped value directly means zero (i.e. an invalid insertion).
+            /// But in the retract-group scenario, we need an empty mapped value to represent "no retracted value" for a new group.
+            /// Use a non-optimized FixedHashMap? Or revisit the retract implementation?
+            retracted_state.emplaceKey(retracted_method.data, i, *retracted_pool).setMapped(nullptr);
+        }
+        else
+        {
+            aggregate_data = emplace_result.getMapped();
+
+            /// Save changed group with retracted state (used for emitting changed groups)
+            auto retracted_result = retracted_state.emplaceKey(retracted_method.data, i, *retracted_pool);
+            if (retracted_result.isInserted())
+            {
+                retracted_result.setMapped(nullptr);
+                auto retracted_data = retracted_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+                createAggregateStates(retracted_data);
+                /// Copy aggregate data to retracted data before it is changed
+                mergeAggregateStates(retracted_data, aggregate_data, retracted_pool, /*clear_states*/ false);
+                retracted_result.setMapped(retracted_data);
+            }
+        }
+
+        assert(aggregate_data != nullptr);
+        places[i] = aggregate_data;
+    }
+
+    /// Add values to the aggregate functions.
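// The loop above snapshots each changed group's previous state into the per-block "retracted"
// table exactly once, and records brand-new groups with an empty snapshot. A hedged sketch of
// that same idea with plain maps instead of arena-allocated aggregate states (illustrative
// types only, not the real proton structures):
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

struct RetractDemo
{
    std::unordered_map<std::string, int64_t> current;                   /// group -> running sum
    std::unordered_map<std::string, std::optional<int64_t>> retracted;  /// group -> state before this block

    void add(const std::string & key, int64_t value)
    {
        auto [it, inserted] = current.try_emplace(key, 0);
        if (inserted)
            retracted.try_emplace(key, std::nullopt); /// new group: no previous state to retract
        else
            retracted.try_emplace(key, it->second);   /// changed group: snapshot the old state once
        it->second += value;                          /// then apply the update
    }
};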
+ for (size_t i = 0; i < aggregate_functions.size(); ++i) + { + AggregateFunctionInstruction * inst = aggregate_instructions + i; + + if (inst->offsets) + inst->batch_that->addBatchArray(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); + else + inst->batch_that->addBatch(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool, -1, inst->delta_column); + + if (inst->batch_that->isUserDefined()) + { + AggregateDataPtr * places_ptr = places.get(); + /// It is ok to re-flush if it is flush already, then we don't need maintain a map to check if it is ready flushed + for (size_t j = row_begin; j < row_end; ++j) + { + if (places_ptr[j]) + { + inst->batch_that->flush(places_ptr[j] + inst->state_offset); + if (!need_finalization) + need_finalization = (inst->batch_that->getEmitTimes(places_ptr[j] + inst->state_offset) > 0); + } + } + } + } + + return need_finalization; +} + +std::pair Aggregator::executeAndRetractOnBlock( + Columns columns, + size_t row_begin, + size_t row_end, + AggregatedDataVariants & result, + AggregatedDataVariants & retracted_result, + ColumnRawPtrs & key_columns, + AggregateColumns & aggregate_columns) const +{ + std::pair return_result = {false, false}; + auto & need_abort = return_result.first; + auto & need_finalization = return_result.second; + + if (unlikely(row_end <= row_begin)) + return return_result; + + result.aggregator = this; + if (result.empty()) + { + initDataVariants(result, method_chosen, key_sizes, params); + LOG_TRACE(log, "Aggregation method: {}", result.getMethodName()); + } + + Columns materialized_columns = materializeKeyColumns(columns, key_columns, params, result.isLowCardinality()); + + setupAggregatesPoolTimestamps(row_begin, row_end, key_columns, result.aggregates_pool); + + NestedColumnsHolder nested_columns_holder; + AggregateFunctionInstructions aggregate_functions_instructions; + prepareAggregateInstructions(columns, aggregate_columns, materialized_columns, aggregate_functions_instructions, nested_columns_holder); + + retracted_result.aggregator = this; + if (result.type == AggregatedDataVariants::Type::without_key) + { + /// Save last finalization state into `retracted_result` before processing new data. 
+        /// We shall clear and reset it after finalization
+        if (retracted_result.empty())
+        {
+            initDataVariants(retracted_result, method_chosen, key_sizes, params);
+
+            if (result.without_key)
+            {
+                initStatesForWithoutKey(retracted_result);
+                mergeAggregateStates(retracted_result.without_key, result.without_key, retracted_result.aggregates_pool, false);
+            }
+        }
+
+        initStatesForWithoutKey(result);
+        need_finalization = executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool);
+    }
+    else
+    {
+        if (retracted_result.empty())
+            initDataVariants(retracted_result, method_chosen, key_sizes, params);
+
+        if (result.isTwoLevel() && !retracted_result.isTwoLevel())
+            retracted_result.convertToTwoLevel();
+
+    #define M(NAME, IS_TWO_LEVEL) \
+        else if (result.type == AggregatedDataVariants::Type::NAME) \
+            need_finalization = executeAndRetractImpl(*result.NAME, result.aggregates_pool, *retracted_result.NAME, retracted_result.aggregates_pool, row_begin, row_end, key_columns, aggregate_functions_instructions.data());
+
+        if (false) {} // NOLINT
+        APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M)
+    #undef M
+    }
+
+    need_abort = checkAndProcessResult(result);
+    /// It's possible the global single-level hash table was converted to a two-level table after `checkAndProcessResult`,
+    /// so we also convert the retracted data to two level.
+    if (result.isTwoLevel() && !retracted_result.isTwoLevel())
+        retracted_result.convertToTwoLevel();
+
+    return return_result;
+}
+
+std::pair<AggregatedDataVariantsPtr, AggregatedDataVariantsPtr>
+Aggregator::mergeRetractedGroups(ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const
+{
+    auto prepared_data = prepareVariantsToMerge(aggregated_data, /*always_merge_into_empty*/ true);
+    if (prepared_data->empty())
+        return {};
+
+    auto first = prepared_data->at(0);
+
+    auto prepared_retracted_data = prepareVariantsToMerge(retracted_data, first->type != AggregatedDataVariants::Type::without_key);
+    assert(!prepared_retracted_data->empty());
+
+    /// So far, only global aggregation supports emit changelog, so time bucket two level is not possible
+
+#define M(NAME, ...) \
+    else if (first->type == AggregatedDataVariants::Type::NAME) \
+        mergeRetractedGroupsImpl<decltype(first->NAME)::element_type>(*prepared_data, *prepared_retracted_data);
+
+    if (first->type == AggregatedDataVariants::Type::without_key)
+    {
+        mergeWithoutKeyDataImpl(*prepared_retracted_data, true);
+        mergeWithoutKeyDataImpl(*prepared_data, false);
+    }
+    APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M)
+    APPLY_FOR_VARIANTS_STATIC_BUCKET_TWO_LEVEL(M)
+#undef M
+    else
+        throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT);
+
+    return {prepared_data->at(0), prepared_retracted_data->at(0)};
+}
+
+template <typename Method>
+void Aggregator::mergeRetractedGroupsImpl(
+    ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const
+{
+    AggregatedDataVariantsPtr & res = aggregated_data[0];
+    AggregatedDataVariantsPtr & retracted_res = retracted_data[0];
+
+    using Table = typename Method::Data;
+    Table & dst_table = getDataVariant<Method>(*res).data;
+    Table & dst_retracted_table = getDataVariant<Method>(*retracted_res).data;
+
+    /// The first data variants entry is always empty.
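// mergeRetractedGroups returns the merged current data plus the merged pre-update data; a caller
// emitting a changelog can then, per changed group, retract the old value and append the new one.
// A hedged, std-only sketch of that final step (hypothetical driver code, not taken from this PR;
// the real code converts aggregate states into Blocks instead of plain integers):
#include <cstdint>
#include <optional>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

/// One changelog row: group key, finalized value, delta (+1 append, -1 retract).
using ChangelogRows = std::vector<std::tuple<std::string, int64_t, int8_t>>;

ChangelogRows emitChangelog(
    const std::unordered_map<std::string, int64_t> & current,                   /// merged aggregated data
    const std::unordered_map<std::string, std::optional<int64_t>> & retracted)  /// merged pre-update snapshots
{
    ChangelogRows out;
    for (const auto & [key, previous] : retracted)
    {
        if (previous)
            out.emplace_back(key, *previous, -1);   /// retract the value emitted last time
        out.emplace_back(key, current.at(key), +1); /// append the new value for the changed group
    }
    return out;
}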
+ assert(dst_table.empty() && dst_retracted_table.empty()); + + /// For example: + /// thread-1 thread-2 + /// group-1 changed non-changed + /// group-2 non-changed changed + /// group-3 non-changed non-changed + + /// Collect all changed groups, then merge retracted/updated data + /// 1) Collect changed groups: + /// `dst_retracted` <= (thread-1: group-1) + (thread-2: group-2) + for (size_t result_num = 1, size = retracted_data.size(); result_num < size; ++result_num) + { + if (!checkLimits(retracted_res->sizeWithoutOverflowRow())) + break; + + auto & src_retracted_table = getDataVariant(*retracted_data[result_num]).data; + src_retracted_table.mergeToViaEmplace(dst_retracted_table, [&](AggregateDataPtr & __restrict dst, AggregateDataPtr & __restrict src, bool inserted) { + if (inserted) + dst = nullptr; + + mergeAggregateStates(dst, src, retracted_res->aggregates_pool, true); + }); + } + + /// 2) Merge retracted groups non-changed thread parts (based on all changed groups) + /// `dst_retracted` <= (thread-1: group-2) + (thread-2: group-1) + for (size_t result_num = 1, size = retracted_data.size(); result_num < size; ++result_num) + { + if (!checkLimits(retracted_res->sizeWithoutOverflowRow())) + break; + + auto & current_retracted = *retracted_data[result_num]; + Table & src_retracted_table = getDataVariant(current_retracted).data; + Table & src_aggregated_table = getDataVariant(*aggregated_data[result_num]).data; + dst_retracted_table.forEachValue([&](const auto & key, auto & mapped) { + /// Merge retracted groups non-changed thread parts + if (!src_retracted_table.find(key)) + { + auto find_it = src_aggregated_table.find(key); + if (find_it) + mergeAggregateStates( + mapped, + find_it->getMapped(), + retracted_res->aggregates_pool, + /*clear_states*/ false); + }}); + + /// Reset retracted data after finalization + current_retracted.reset(); + } + + /// 3) Merge new/updated groups (based on all changed groups) + /// `dst` <= (thread-1: group-1 group-2) + (thread-2: group-1 group-2) + for (size_t result_num = 1, size = aggregated_data.size(); result_num < size; ++result_num) + { + if (!checkLimits(res->sizeWithoutOverflowRow())) + break; + + Table & src_aggregated_table = getDataVariant(*aggregated_data[result_num]).data; + dst_retracted_table.forEachValue([&](const auto & key, auto & mapped) { + /// Merge new/updated groups + typename Table::LookupResult dst_it; + bool inserted; + + /// NOTE: For StringRef `key`, its memory was allocated in `retracted_res->aggregates_pool`, + /// we shall save this key in itself pool (i.e. 
res->aggregates_pool) if inserted + using KeyType = std::decay_t; + if constexpr (std::is_same_v) + dst_table.emplace(ArenaKeyHolder{key, *res->aggregates_pool}, dst_it, inserted); + else + dst_table.emplace(key, dst_it, inserted); + + if (inserted) + dst_it->getMapped() = nullptr; + + auto find_it = src_aggregated_table.find(key); + if (find_it) + mergeAggregateStates( + dst_it->getMapped(), + find_it->getMapped(), + res->aggregates_pool, + /*clear_states*/ false); + }); + } +} + +void Aggregator::mergeAggregateStates(AggregateDataPtr & dst, AggregateDataPtr & src, Arena * arena, bool clear_states) const +{ + if (!src) + return; + + if (!dst) + { + auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + dst = aggregate_data; + } + + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->merge(dst + offsets_of_aggregate_states[i], src + offsets_of_aggregate_states[i], arena); + + if (clear_states) + destroyAggregateStates(src); +} + +void Aggregator::destroyAggregateStates(AggregateDataPtr & place) const +{ + if (place) + { + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_functions[i]->destroy(place + offsets_of_aggregate_states[i]); + + place = nullptr; + } +} + +void Aggregator::serializeAggregateStates(const AggregateDataPtr & place, WriteBuffer & wb) const +{ + UInt8 has_states = place ? 1 : 0; + writeIntBinary(has_states, wb); + if (has_states) + { + for (size_t i = 0; i < params.aggregates_size; ++i) aggregate_functions[i]->serialize(place + offsets_of_aggregate_states[i], wb); } } @@ -3547,30 +3226,18 @@ void Aggregator::doCheckpointV3(const AggregatedDataVariants & data_variants, Wr writeIntBinary(static_cast(data_variants.type), wb); - writeIntBinary(static_cast(expanded_data_type), wb); + writeIntBinary(static_cast(trackingUpdatesType()), wb); auto state_serializer = [this](auto place, auto & wb_) { assert(place); - if (expanded_data_type == ExpandedDataType::UpdatedWithRetracted) - { - UpdatedDataEx::serialize(place, wb_); - - auto & retracted_place = RetractedDataEx::getRetracted(place); - bool has_retracted = retracted_place != nullptr; - writeBoolText(has_retracted, wb_); - if (has_retracted) - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->serialize(retracted_place + offsets_of_aggregate_states[i], wb_); - } - else if (expanded_data_type == ExpandedDataType::Updated) - UpdatedDataEx::serialize(place, wb_); + if (trackingUpdatesType() == TrackingUpdatesType::Updates) + TrackingUpdates::serialize(place, wb_); for (size_t i = 0; i < params.aggregates_size; ++i) aggregate_functions[i]->serialize(place + offsets_of_aggregate_states[i], wb_); }; /// [aggr-func-state-without-key] - assert(!params.overflow_row); if (data_variants.type == AggregatedDataVariants::Type::without_key) state_serializer(data_variants.without_key, wb); @@ -3617,13 +3284,13 @@ void Aggregator::doRecoverV3(AggregatedDataVariants & data_variants, ReadBuffer UInt8 recovered_expanded_data_type_uint8; readIntBinary(recovered_expanded_data_type_uint8, rb); - ExpandedDataType recovered_expanded_data_type = static_cast(recovered_expanded_data_type_uint8); - if (recovered_expanded_data_type != expanded_data_type) + TrackingUpdatesType recovered_expanded_data_type = static_cast(recovered_expanded_data_type_uint8); + if (recovered_expanded_data_type != trackingUpdatesType()) throw Exception( ErrorCodes::RECOVER_CHECKPOINT_FAILED, "Failed to recover aggregation 
checkpoint. Expanded data type is not the same, checkpointed={}, current={}", magic_enum::enum_name(recovered_expanded_data_type), - magic_enum::enum_name(expanded_data_type)); + magic_enum::enum_name(trackingUpdatesType())); auto state_deserializer = [this](auto & place, auto & rb_, Arena * arena) { place = nullptr; /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. @@ -3631,31 +3298,14 @@ void Aggregator::doRecoverV3(AggregatedDataVariants & data_variants, ReadBuffer createAggregateStates(aggregate_data); place = aggregate_data; - if (expanded_data_type == ExpandedDataType::UpdatedWithRetracted) - { - UpdatedDataEx::deserialize(place, rb_); - - auto & retracted = RetractedDataEx::getRetracted(place); - bool has_retracted = false; - readBoolText(has_retracted, rb_); - if (has_retracted) - { - auto tmp_retracted = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(tmp_retracted); - retracted = tmp_retracted; - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->deserialize(retracted + offsets_of_aggregate_states[i], rb_, std::nullopt, arena); - } - } - else if (expanded_data_type == ExpandedDataType::Updated) - UpdatedDataEx::deserialize(place, rb_); + if (trackingUpdatesType() == TrackingUpdatesType::Updates) + TrackingUpdates::deserialize(place, rb_); for (size_t i = 0; i < params.aggregates_size; ++i) aggregate_functions[i]->deserialize(place + offsets_of_aggregate_states[i], rb_, std::nullopt, arena); }; /// [aggr-func-state-without-key] - assert(!params.overflow_row); if (data_variants.type == AggregatedDataVariants::Type::without_key) state_deserializer(data_variants.without_key, rb, data_variants.aggregates_pool); @@ -3674,7 +3324,7 @@ void Aggregator::doRecoverV3(AggregatedDataVariants & data_variants, ReadBuffer else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); } -bool Aggregator::checkAndProcessResult(AggregatedDataVariants & result, bool & no_more_keys) const +bool Aggregator::checkAndProcessResult(AggregatedDataVariants & result) const { size_t result_size = result.sizeWithoutOverflowRow(); Int64 current_memory_usage = 0; @@ -3695,7 +3345,7 @@ bool Aggregator::checkAndProcessResult(AggregatedDataVariants & result, bool & n result.convertToTwoLevel(); /// Checking the constraints. - if (!checkLimits(result_size, no_more_keys)) + if (!checkLimits(result_size)) return true; /** Flush data to disk if too much RAM is consumed. @@ -3728,158 +3378,7 @@ bool Aggregator::checkAndProcessResult(AggregatedDataVariants & result, bool & n return false; } -template -bool Aggregator::executeAndRetractImpl( - Method & method, - Arena * aggregates_pool, - Arena * retracted_pool, - size_t row_begin, - size_t row_end, - ColumnRawPtrs & key_columns, - AggregateFunctionInstruction * aggregate_instructions) const -{ - typename Method::State state(key_columns, key_sizes, aggregation_state_cache); - bool need_finalization = false; - - /// NOTE: only row_end-row_start is required, but: - /// - this affects only optimize_aggregation_in_order, - /// - this is just a pointer, so it should not be significant, - /// - and plus this will require other changes in the interface. - std::unique_ptr places(new AggregateDataPtr[row_end]); - - /// For all rows. 
- for (size_t i = row_begin; i < row_end; ++i) - { - AggregateDataPtr aggregate_data = nullptr; - - auto emplace_result = state.emplaceKey(method.data, i, *aggregates_pool); - - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (emplace_result.isInserted()) - { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. - emplace_result.setMapped(nullptr); - - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - /// TODO: support use_compiled_functions - createAggregateStates(aggregate_data); - emplace_result.setMapped(aggregate_data); - } - else - { - aggregate_data = emplace_result.getMapped(); - - /// Save changed group with retracted state (used for emit changed group) - /// If there are aggregate data and no retracted data, copy aggregate data to retracted data before changed - if (!UpdatedDataEx::isEmpty(aggregate_data) && !RetractedDataEx::hasRetracted(aggregate_data)) - { - auto & retracted = RetractedDataEx::getRetracted(aggregate_data); - auto tmp_retracted = retracted_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(tmp_retracted); - retracted = tmp_retracted; - mergeAggregateStates(retracted, aggregate_data, retracted_pool, /*clear_states*/ false); - } - } - - assert(aggregate_data != nullptr); - places[i] = aggregate_data; - } - - /// Add values to the aggregate functions. - for (size_t i = 0; i < aggregate_functions.size(); ++i) - { - AggregateFunctionInstruction * inst = aggregate_instructions + i; - - if (inst->offsets) - inst->batch_that->addBatchArray(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); - else - inst->batch_that->addBatch(row_begin, row_end, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool, -1, inst->delta_column); - - if (inst->batch_that->isUserDefined()) - { - AggregateDataPtr * places_ptr = places.get(); - /// It is ok to re-flush if it is flush already, then we don't need maintain a map to check if it is ready flushed - for (size_t j = row_begin; j < row_end; ++j) - { - if (places_ptr[j]) - { - inst->batch_that->flush(places_ptr[j] + inst->state_offset); - if (!need_finalization) - need_finalization = (inst->batch_that->getEmitTimes(places_ptr[j] + inst->state_offset) > 0); - } - } - } - } - - if (hasExpandedData()) - UpdatedDataEx::addBatch(row_begin, row_end, places.get(), aggregate_instructions ? 
aggregate_instructions->delta_column : nullptr); - - return need_finalization; -} - -std::pair Aggregator::executeAndRetractOnBlock( - Columns columns, - size_t row_begin, - size_t row_end, - AggregatedDataVariants & result, - ColumnRawPtrs & key_columns, - AggregateColumns & aggregate_columns, - bool & no_more_keys) const -{ - std::pair return_result = {false, false}; - auto & need_abort = return_result.first; - auto & need_finalization = return_result.second; - - if (unlikely(row_end <= row_begin)) - return return_result; - - result.aggregator = this; - if (result.empty()) - { - initDataVariants(result, method_chosen, key_sizes, params); - initStatesForWithoutKeyOrOverflow(result); - LOG_TRACE(log, "Aggregation method: {}", result.getMethodName()); - } - - Columns materialized_columns = materializeKeyColumns(columns, key_columns, params, result.isLowCardinality()); - - setupAggregatesPoolTimestamps(row_begin, row_end, key_columns, result.aggregates_pool); - - NestedColumnsHolder nested_columns_holder; - AggregateFunctionInstructions aggregate_functions_instructions; - prepareAggregateInstructions(columns, aggregate_columns, materialized_columns, aggregate_functions_instructions, nested_columns_holder); - - assert(!params.overflow_row && !no_more_keys); - assert(expanded_data_type == ExpandedDataType::UpdatedWithRetracted); - if (result.type == AggregatedDataVariants::Type::without_key) - { - /// Save last finalization state into `retracted_result` before processing new data. - /// We shall clear and reset it after finalization - if (!UpdatedDataEx::isEmpty(result.without_key) && !RetractedDataEx::hasRetracted(result.without_key)) - { - auto & retracted = RetractedDataEx::getRetracted(result.without_key); - auto tmp_retracted = result.retracted_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(tmp_retracted); - retracted = tmp_retracted; - mergeAggregateStates(retracted, result.without_key, result.retracted_pool.get(), /*clear_states*/ false); - } - - need_finalization = executeWithoutKeyImpl( - result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); - } - -#define M(NAME, IS_TWO_LEVEL) \ - else if (result.type == AggregatedDataVariants::Type::NAME) \ - need_finalization = executeAndRetractImpl(*result.NAME, result.aggregates_pool, result.retracted_pool.get(), row_begin, row_end, key_columns, aggregate_functions_instructions.data()); - - APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) -#undef M - - need_abort = checkAndProcessResult(result, no_more_keys); - return return_result; -} - -BlocksList Aggregator::convertUpdatedToBlocks(AggregatedDataVariants & data_variants) const +BlocksList Aggregator::convertUpdatesToBlocks(AggregatedDataVariants & data_variants) const { LOG_DEBUG(log, "Converting updated aggregated data to blocks"); @@ -3891,17 +3390,14 @@ BlocksList Aggregator::convertUpdatedToBlocks(AggregatedDataVariants & data_vari if (data_variants.empty()) return blocks; - if (unlikely(params.overflow_row)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - constexpr bool final = true; constexpr bool clear_states = false; if (data_variants.type == AggregatedDataVariants::Type::without_key) - blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, false, clear_states, AggregateStateType::OnlyUpdated)); + blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, clear_states, 
ConvertType::OnlyUpdates)); else if (!data_variants.isTwoLevel()) - blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states, AggregateStateType::OnlyUpdated)); + blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states, ConvertType::OnlyUpdates)); else - blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, /*max_threads*/ 1, AggregateStateType::OnlyUpdated)); + blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, /*max_threads*/ 1, ConvertType::OnlyUpdates)); size_t rows = 0; size_t bytes = 0; @@ -3924,7 +3420,7 @@ BlocksList Aggregator::convertUpdatedToBlocks(AggregatedDataVariants & data_vari template -void NO_INLINE Aggregator::mergeUpdatedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const +void NO_INLINE Aggregator::mergeUpdateGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const { AggregatedDataVariantsPtr & res = non_empty_data[0]; auto & dst_table = getDataVariant(*res).data; @@ -3939,19 +3435,16 @@ void NO_INLINE Aggregator::mergeUpdatedGroupsImpl(ManyAggregatedDataVariants & n /// /// 1) Collect all updated groups /// `dst` <= (group-1, group-2) - bool no_more_keys = false; using Table = typename Method::Data; for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) { - if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys)) + if (!checkLimits(res->sizeWithoutOverflowRow())) break; - assert(!no_more_keys); - auto & src_table = getDataVariant(*non_empty_data[result_num]).data; auto merge_updated_func = [&](const auto & key, auto & mapped) { /// Skip no updated group - if (!UpdatedDataEx::isUpdated(mapped)) + if (!TrackingUpdates::updated(mapped)) return; typename Table::LookupResult dst_it; @@ -3964,7 +3457,7 @@ void NO_INLINE Aggregator::mergeUpdatedGroupsImpl(ManyAggregatedDataVariants & n auto & dst = dst_it->getMapped(); dst = nullptr; /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. 
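// A compact model of the "merge only updated groups" path shown here: every per-thread entry
// carries an updated-since-last-finalization flag; groups flagged in any thread are collected,
// then merged from all threads (their unchanged parts included) and the flags are reset.
// Types are illustrative only; the real code keeps the flag in the TrackingUpdates prefix of
// each aggregate state.
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

struct Entry
{
    int64_t sum = 0;
    bool updated = false; /// per-group "updated since last finalization" flag
};

std::unordered_map<std::string, int64_t> mergeUpdatedGroups(std::vector<std::unordered_map<std::string, Entry>> & per_thread)
{
    std::unordered_map<std::string, int64_t> merged;

    /// 1) Collect every group updated in at least one thread
    for (const auto & table : per_thread)
        for (const auto & [key, entry] : table)
            if (entry.updated)
                merged.try_emplace(key, 0);

    /// 2) Merge those groups from all threads and reset the flags
    for (auto & table : per_thread)
    {
        for (auto & [key, sum] : merged)
        {
            auto it = table.find(key);
            if (it == table.end())
                continue;
            sum += it->second.sum;
            it->second.updated = false;
        }
    }
    return merged;
}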
auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); + createAggregateStates(aggregate_data, /*prefix_with_updates_tracking_state*/ false); dst = aggregate_data; } }; @@ -3985,27 +3478,24 @@ void NO_INLINE Aggregator::mergeUpdatedGroupsImpl(ManyAggregatedDataVariants & n { mergeAggregateStates(mapped, find_it->getMapped(), arena, /*clear_states*/ false); /// NOTE: We always reset the updated flag after merged - UpdatedDataEx::resetUpdated(find_it->getMapped()); + TrackingUpdates::resetUpdated(find_it->getMapped()); } }); } } -AggregatedDataVariantsPtr Aggregator::mergeUpdatedGroups(ManyAggregatedDataVariants & data_variants) const +AggregatedDataVariantsPtr Aggregator::mergeUpdateGroups(ManyAggregatedDataVariants & data_variants) const { auto prepared_data_ptr = prepareVariantsToMerge(data_variants, /*always_merge_into_empty*/ true); if (prepared_data_ptr->empty()) return {}; - if (unlikely(params.overflow_row)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - BlocksList blocks; auto & first = *prepared_data_ptr->at(0); if (first.type == AggregatedDataVariants::Type::without_key) { if (std::ranges::none_of(*prepared_data_ptr, [](auto & variants) { - return variants->without_key && UpdatedDataEx::isUpdated(variants->without_key); + return variants->without_key && TrackingUpdates::updated(variants->without_key); })) return {}; @@ -4014,7 +3504,7 @@ AggregatedDataVariantsPtr Aggregator::mergeUpdatedGroups(ManyAggregatedDataVaria #define M(NAME, IS_TWO_LEVEL) \ else if (first.type == AggregatedDataVariants::Type::NAME) \ - mergeUpdatedGroupsImpl(*prepared_data_ptr, first.aggregates_pool); + mergeUpdateGroupsImpl(*prepared_data_ptr, first.aggregates_pool); APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) #undef M @@ -4023,218 +3513,6 @@ AggregatedDataVariantsPtr Aggregator::mergeUpdatedGroups(ManyAggregatedDataVaria return prepared_data_ptr->at(0); } -BlocksList Aggregator::convertRetractedToBlocks(AggregatedDataVariants & data_variants) const -{ - LOG_DEBUG(log, "Converting retracted aggregated data to blocks"); - - Stopwatch watch; - - BlocksList blocks; - - /// In what data structure is the data aggregated? - if (data_variants.empty()) - return blocks; - - if (unlikely(params.overflow_row)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - - constexpr bool final = true; - constexpr bool clear_states = true; - if (data_variants.type == AggregatedDataVariants::Type::without_key) - blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, false, clear_states, AggregateStateType::OnlyRetracted)); - else if (!data_variants.isTwoLevel()) - blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states, AggregateStateType::OnlyRetracted)); - else - blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, /*max_threads*/ 1, AggregateStateType::OnlyRetracted)); - - size_t rows = 0; - size_t bytes = 0; - - for (const auto & block : blocks) - { - rows += block.rows(); - bytes += block.bytes(); - } - - double elapsed_seconds = watch.elapsedSeconds(); - LOG_DEBUG(log, - "Converted retracted aggregated data to blocks. {} rows, {} in {} sec. 
({:.3f} rows/sec., {}/sec.)", - rows, ReadableSize(bytes), - elapsed_seconds, rows / elapsed_seconds, - ReadableSize(bytes / elapsed_seconds)); - - return blocks; -} - -template -void Aggregator::mergeRetractedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const -{ - AggregatedDataVariantsPtr & res = non_empty_data[0]; - auto & dst_table = getDataVariant(*res).data; - /// Always merge retracted data into empty first. - assert(dst_table.empty()); - - /// For example: - /// thread-1 thread-2 - /// group-1 retracted non-retracted - /// group-2 non-retracted retracted - /// group-3 non-retracted non-retracted - /// - /// 1) Collect all retracted groups - /// `dst` <= (group-1, group-2) - bool no_more_keys = false; - using Table = typename Method::Data; - for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) - { - if (!checkLimits(res->sizeWithoutOverflowRow(), no_more_keys)) - break; - - assert(!no_more_keys); - - auto & src_table = getDataVariant(*non_empty_data[result_num]).data; - src_table.forEachValue([&](const auto & key, auto & mapped) { - /// Skip no retracted group - if (!RetractedDataEx::hasRetracted(mapped)) - return; - - typename Table::LookupResult dst_it; - bool inserted; - /// For StringRef `key`, it is safe to store to `dst_table` - /// since the `dst_table` is temporary and the `src_table` will not be cleaned in the meantime - dst_table.emplace(key, dst_it, inserted); - if (inserted) - { - auto & dst = dst_it->getMapped(); - dst = nullptr; /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. - auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); - dst = aggregate_data; - } - }); - } - - /// 2) Merge all retracted groups parts for each thread (based on `1)` ) - /// `dst` <= (thread-1: group-1 group-2) + (thread-2: group-1 group-2) - for (size_t result_num = 1, size = non_empty_data.size(); result_num < size; ++result_num) - { - auto & current = *non_empty_data[result_num]; - auto & src_table = getDataVariant(current).data; - dst_table.forEachValue([&](const auto & key, auto & mapped) { - if (auto find_it = src_table.find(key)) - { - auto & src_mapped = find_it->getMapped(); - if (RetractedDataEx::hasRetracted(src_mapped)) - mergeAggregateStates(mapped, RetractedDataEx::getRetracted(src_mapped), arena, /*clear_states*/ true); - else - /// If retracted data not exist, assume it does't be changed, we should use original data - mergeAggregateStates(mapped, src_mapped, arena, /*clear_states*/ false); - } - }); - - current.resetRetractedPool(); - } -} - -AggregatedDataVariantsPtr Aggregator::mergeRetractedGroups(ManyAggregatedDataVariants & data_variants) const -{ - auto prepared_data_ptr = prepareVariantsToMerge(data_variants, /*always_merge_into_empty*/ true); - if (prepared_data_ptr->empty()) - return {}; - - if (unlikely(params.overflow_row)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in streaming aggregation"); - - auto & first = *prepared_data_ptr->at(0); - if (first.type == AggregatedDataVariants::Type::without_key) - { - if (std::ranges::none_of(*prepared_data_ptr, [](auto & variants) { return RetractedDataEx::hasRetracted(variants->without_key); })) - return {}; /// Skip if no retracted - - for (size_t result_num = 1, size = prepared_data_ptr->size(); result_num < size; ++result_num) - { - auto & src_without_key = 
(*prepared_data_ptr)[result_num]->without_key; - if (RetractedDataEx::hasRetracted(src_without_key)) - mergeAggregateStates(first.without_key, RetractedDataEx::getRetracted(src_without_key), first.aggregates_pool, /*clear_states*/ true); - else - /// If retracted data not exist, assume it does't be changed, we should use original data - mergeAggregateStates(first.without_key, src_without_key, first.aggregates_pool, /*clear_states*/ false); - } - } - -#define M(NAME) \ - else if (first.type == AggregatedDataVariants::Type::NAME) \ - mergeRetractedGroupsImpl(*prepared_data_ptr, first.aggregates_pool); - - APPLY_FOR_VARIANTS_SINGLE_LEVEL_STREAMING(M) - APPLY_FOR_VARIANTS_ALL_TWO_LEVEL(M) -#undef M - else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - - return prepared_data_ptr->at(0); -} - -template -void Aggregator::mergeRetractedIntoImpl(Method & method, Method & retracted_method, Arena * arena) const -{ - using Table = typename Method::Data; - Table & table = method.data; - Table & retracted_table = retracted_method.data; - - retracted_table.forEachValue([&](const auto & key, auto & retracted_mapped) { - - auto find_it = table.find(key); - assert(find_it); - - auto & mapped = find_it->getMapped(); - assert(!RetractedDataEx::hasRetracted(mapped)); - UpdatedDataEx::setUpdated(mapped); - - /// For old impl, no retracted data for new group - if (!retracted_mapped) - return; - - auto & retracted = RetractedDataEx::getRetracted(mapped); - auto tmp_retracted = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(tmp_retracted); - retracted = tmp_retracted; - mergeAggregateStates(retracted, retracted_mapped, arena, /*clear_states*/ true); - }); -} - -void Aggregator::mergeRetractedInto(AggregatedDataVariants & result, AggregatedDataVariants && retracted_result) const -{ - assert(expanded_data_type == ExpandedDataType::UpdatedWithRetracted); - if (result.type != retracted_result.type) [[unlikely]] - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Don't merge retracted aggregation result, the current data variants type is {}, but retracted data variants type is {}", - magic_enum::enum_name(result.type), - magic_enum::enum_name(retracted_result.type)); - - Arena * arena = result.retracted_pool.get(); - if (result.type == AggregatedDataVariants::Type::without_key) - { - if (retracted_result.without_key) - { - assert(!RetractedDataEx::hasRetracted(result.without_key)); - auto & retracted = RetractedDataEx::getRetracted(result.without_key); - auto tmp_retracted = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(tmp_retracted); - retracted = tmp_retracted; - mergeAggregateStates(retracted, retracted_result.without_key, arena, /*clear_states*/ true); - } - } - -#define M(NAME, IS_TWO_LEVEL) \ - else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeRetractedIntoImpl(*result.NAME, *retracted_result.NAME, arena); - - APPLY_FOR_AGGREGATED_VARIANTS_STREAMING(M) -#undef M - - retracted_result.reset(); -} - void Aggregator::updateMetrics(const AggregatedDataVariants & variants, AggregatedDataMetrics & metrics) const { switch (variants.type) diff --git a/src/Interpreters/Streaming/Aggregator.h b/src/Interpreters/Streaming/Aggregator.h index 945260170a0..32b1a2e141d 100644 --- a/src/Interpreters/Streaming/Aggregator.h +++ b/src/Interpreters/Streaming/Aggregator.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include 
#include @@ -76,11 +76,10 @@ namespace Streaming * best suited for different cases, and this approach is just one of them, chosen for a combination of reasons. */ -enum class AggregateStateType +enum class ConvertType : uint8_t { - Normal, - OnlyUpdated, - OnlyRetracted, + Normal = 0, + OnlyUpdates = 1, }; /// using TimeBucketAggregatedDataWithUInt16Key = TimeBucketHashMap>; @@ -129,7 +128,6 @@ SERDE struct AggregatedDataVariants : private boost::noncopyable /// Pools for states of aggregate functions. Ownership will be later transferred to ColumnAggregateFunction. Arenas aggregates_pools; Arena * aggregates_pool{}; /// The pool that is currently used for allocation. - std::unique_ptr retracted_pool; /// Use an independent pool to manage retracted data, which will be cleared after each finalization /** Specialization for the case when there are no keys, and for keys not fitted into max_rows_to_group_by. */ @@ -371,17 +369,16 @@ SERDE struct AggregatedDataVariants : private boost::noncopyable /// proton: ends; } + /// \param reset - clean up all in memory states and the corresponding arena pools used to hold these states void reset(); - void resetAggregatesPool() + void resetAndCreateAggregatesPools() { aggregates_pools = Arenas(1, std::make_shared()); aggregates_pool = aggregates_pools.back().get(); aggregates_pool->enableRecycle(true); } - void resetRetractedPool() { retracted_pool = std::make_unique(); } - /// Number of rows (different keys). size_t size() const { @@ -667,8 +664,7 @@ class Aggregator final WindowParamsPtr window_params; - bool tracking_changes = false; - bool tracking_updated = false; + TrackingUpdatesType tracking_updates_type; /// proton: ends /// proton: starts @@ -690,8 +686,7 @@ class Aggregator final ssize_t delta_col_pos_ = -1, size_t window_keys_num_ = 0, WindowParamsPtr window_params_ = nullptr, - bool tracking_changes_ = false, - bool tracking_updated_ = false) + TrackingUpdatesType tracking_updates_type_ = TrackingUpdatesType::None) : src_header(src_header_), intermediate_header(intermediate_header_), keys(keys_), aggregates(aggregates_), keys_size(keys.size()), aggregates_size(aggregates.size()), @@ -709,8 +704,7 @@ class Aggregator final delta_col_pos(delta_col_pos_), window_keys_num(window_keys_num_), window_params(window_params_), - tracking_changes(tracking_changes_), - tracking_updated(tracking_updated_) + tracking_updates_type(tracking_updates_type_) { } /// proton: ends @@ -750,17 +744,21 @@ class Aggregator final /// Process one block. Return {should_abort, need_finalization} pair /// should_abort: if the processing should be aborted (with group_by_overflow_mode = 'break') return true, otherwise false. /// need_finalization : only for UDA aggregation. 
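// ConvertType::Normal vs ConvertType::OnlyUpdates in miniature: Normal finalizes every group,
// OnlyUpdates only re-emits groups whose updated flag is set (as needed by `emit on update`).
// A std-only sketch under assumed, illustrative types; the real code walks the hash-table
// variants and fills output columns rather than a vector of pairs.
#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

enum class ConvertTypeDemo { Normal, OnlyUpdates };

struct GroupState
{
    int64_t sum = 0;
    bool updated = false;
};

std::vector<std::pair<std::string, int64_t>>
finalizeGroups(const std::unordered_map<std::string, GroupState> & groups, ConvertTypeDemo type)
{
    std::vector<std::pair<std::string, int64_t>> rows;
    for (const auto & [key, state] : groups)
    {
        if (type == ConvertTypeDemo::OnlyUpdates && !state.updated)
            continue; /// skip groups not touched since the last emit
        rows.emplace_back(key, state.sum);
    }
    return rows;
}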
If there is no UDA, always false - std::pair executeOnBlock(const Block & block, + std::pair executeOnBlock( + const Block & block, AggregatedDataVariants & result, ColumnRawPtrs & key_columns, - AggregateColumns & aggregate_columns, /// Passed to not create them anew for each block - bool & no_more_keys) const; + AggregateColumns & aggregate_columns /// Passed to not create them anew for each block + ) const; - std::pair executeOnBlock(Columns columns, - size_t row_begin, size_t row_end, + std::pair executeOnBlock( + Columns columns, + size_t row_begin, + size_t row_end, AggregatedDataVariants & result, - ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, /// Passed to not create them anew for each block - bool & no_more_keys) const; + ColumnRawPtrs & key_columns, + AggregateColumns & aggregate_columns /// Passed to not create them anew for each block + ) const; /// Execute and retract state for changed groups: /// 1) For new group: @@ -780,15 +778,12 @@ class Aggregator final size_t row_begin, size_t row_end, AggregatedDataVariants & result, + AggregatedDataVariants & retracted_result, ColumnRawPtrs & key_columns, - AggregateColumns & aggregate_columns, /// Passed to not create them anew for each block - bool & no_more_keys) const; - - bool mergeOnBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) const; + AggregateColumns & aggregate_columns /// Passed to not create them anew for each block + ) const; /** Convert the aggregation data structure into a block. - * If overflow_row = true, then aggregates for rows that are not included in max_rows_to_group_by are put in the first block. - * * If final = false, then ColumnAggregateFunction is created as the aggregation columns with the state of the calculations, * which can then be combined with other states (for distributed query processing or checkpoint). * If final = true, then columns with ready values are created as aggregate columns. @@ -808,9 +803,11 @@ class Aggregator final * a. 
SELECT count(), avg(i), sum(k) FROM ( <-- second level global aggr, need prune its state at this level * SELECT avg(i) AS i, sum(k) AS k FROM my_stream GROUP BY device_id <-- first level global aggr, don't prune states * ); + * + * \param max_threads - limits max threads for converting two level aggregate state in parallel */ - BlocksList convertToBlocks(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const; - BlocksList mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads) const; + BlocksList convertToBlocks(AggregatedDataVariants & data_variants, bool final, size_t max_threads) const; + BlocksList mergeAndConvertToBlocks(ManyAggregatedDataVariants & data_variants, bool final, size_t max_threads) const; /// For Tumble/Session window function, there is only one bucket /// For Hop window function, merge multiple gcd windows (buckets) to a hop window @@ -818,30 +815,22 @@ class Aggregator final /// gcd_bucket1 - [00:00, 00:02) /// => result block - [00:00, 00:04) /// gcd_bucket2 - [00:02, 00:04) - Block spliceAndConvertBucketsToBlock( - AggregatedDataVariants & variants, bool final, bool clear_states, const std::vector & gcd_buckets) const; - Block mergeAndSpliceAndConvertBucketsToBlock( - ManyAggregatedDataVariants & variants, bool final, bool clear_states, const std::vector & gcd_buckets) const; - - /// Convert the `updated data` (different with `normal data`) - BlocksList convertUpdatedToBlocks(AggregatedDataVariants & data_variants) const; + Block spliceAndConvertBucketsToBlock(AggregatedDataVariants & variants, bool final, const std::vector & gcd_buckets) const; + Block mergeAndSpliceAndConvertBucketsToBlock(ManyAggregatedDataVariants & variants, bool final, const std::vector & gcd_buckets) const; - /// \return: merged updated data if exists - /// NOTE: The merged data is as `normal data`, which should use `convertToBlocks` to convert - AggregatedDataVariantsPtr mergeUpdatedGroups(ManyAggregatedDataVariants & data_variants) const; + /// Only convert the states of update groups tracked + BlocksList convertUpdatesToBlocks(AggregatedDataVariants & data_variants) const; - /// Convert the `retracted data` (different with `normal data`) - BlocksList convertRetractedToBlocks(AggregatedDataVariants & data_variants) const; + /// \return: merged updated data if exists, when there is no update data, return nullptr + AggregatedDataVariantsPtr mergeUpdateGroups(ManyAggregatedDataVariants & data_variants) const; - /// \return: merged retracted data if exists - /// NOTE: The merged data is as `normal data`, which should use `convertToBlocks` to convert - AggregatedDataVariantsPtr mergeRetractedGroups(ManyAggregatedDataVariants & data_variants) const; + /// For some streaming queries with `emit on update` or `emit changelog`, need tracking updates (with retract) + bool needTrackUpdates() const { return params.tracking_updates_type != TrackingUpdatesType::None; } + TrackingUpdatesType trackingUpdatesType() const { return params.tracking_updates_type; } - /// Used for merge legacy retracted data into result - void mergeRetractedInto(AggregatedDataVariants & result, AggregatedDataVariants && retracted_result) const; - - bool hasExpandedData() const { return expanded_data_type != ExpandedDataType::None; } - ExpandedDataType expandedDataType() const { return expanded_data_type; } + /// Used for merge changed groups and return the of changed groups + std::pair + 
mergeRetractedGroups(ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const; std::vector bucketsBefore(const AggregatedDataVariants & result, Int64 max_bucket) const; void removeBucketsBefore(AggregatedDataVariants & result, Int64 max_bucket) const; @@ -849,22 +838,12 @@ class Aggregator final /// If @p always_merge_into_empty is true, always add an empty variants at front even if there is only one ManyAggregatedDataVariantsPtr prepareVariantsToMerge(ManyAggregatedDataVariants & data_variants, bool always_merge_into_empty = false) const; - using BucketToBlocks = std::map; - /// Merge partially aggregated blocks separated to buckets into one data structure. - void mergeBlocks(BucketToBlocks bucket_to_blocks, AggregatedDataVariants & result, size_t max_threads); - - /// Merge several partially aggregated blocks into one. - /// Precondition: for all blocks block.info.is_overflows flag must be the same. - /// (either all blocks are from overflow data or none blocks are). - /// The resulting block has the same value of is_overflows flag. - Block mergeBlocks(BlocksList & blocks, bool final, bool clear_states, bool only_updated); - /** Split block with partially-aggregated data to many blocks, as if two-level method of aggregation was used. * This is needed to simplify merging of that data with other results, that are already two-level. */ std::vector convertBlockToTwoLevel(const Block & block) const; - void initStatesForWithoutKeyOrOverflow(AggregatedDataVariants & data_variants) const; + void initStatesForWithoutKey(AggregatedDataVariants & data_variants) const; /// For external aggregation. void writeToTemporaryFile(AggregatedDataVariants & data_variants, const String & tmp_path) const; @@ -940,8 +919,6 @@ class Aggregator final bool all_aggregates_has_trivial_destructor = false; - ExpandedDataType expanded_data_type = ExpandedDataType::None; - /// How many RAM were used to process the query before processing the first block. Int64 memory_usage_before_aggregation = 0; @@ -971,8 +948,7 @@ class Aggregator final /** Create states of aggregate functions for one key. */ - template - void createAggregateStates(AggregateDataPtr & aggregate_data) const; + void createAggregateStates(AggregateDataPtr & aggregate_data, bool prefix_with_updates_tracking_state = true) const; /** Call `destroy` methods for states of aggregate functions. * Used in the exception handler for aggregation, since RAII in this case is not applicable. @@ -984,35 +960,19 @@ class Aggregator final size_t row_begin, size_t row_end, ColumnRawPtrs & key_columns, - AggregateFunctionInstruction * aggregate_instructions, - bool no_more_keys, - AggregateDataPtr overflow_row = nullptr) const; + AggregateFunctionInstruction * aggregate_instructions) const; /// Process one data block, aggregate the data into a hash table. template - bool executeImpl( - Method & method, - Arena * aggregates_pool, - size_t row_begin, - size_t row_end, - ColumnRawPtrs & key_columns, - AggregateFunctionInstruction * aggregate_instructions, - bool no_more_keys, - AggregateDataPtr overflow_row) const; - - /// Specialization for a particular value no_more_keys. 
- template bool executeImplBatch( Method & method, - typename Method::State & state, Arena * aggregates_pool, size_t row_begin, size_t row_end, - AggregateFunctionInstruction * aggregate_instructions, - AggregateDataPtr overflow_row) const; + ColumnRawPtrs & key_columns, + AggregateFunctionInstruction * aggregate_instructions) const; /// For case when there are no keys (all aggregate into one row). For UDA with own strategy, return 'true' means the UDA should emit after execution - template bool executeWithoutKeyImpl( AggregatedDataWithoutKey & res, size_t row_begin, @@ -1020,16 +980,6 @@ class Aggregator final AggregateFunctionInstruction * aggregate_instructions, Arena * arena) const; -#if 0 /// Unused for now - static void executeOnIntervalWithoutKeyImpl( - AggregatedDataWithoutKey & res, - size_t row_begin, - size_t row_end, - AggregateFunctionInstruction * aggregate_instructions, - Arena * arena, - const IColumn * delta_col); -#endif - template void writeToTemporaryFileImpl( AggregatedDataVariants & data_variants, @@ -1046,7 +996,7 @@ class Aggregator final /// Merge data from hash table `src` into `dst`. using EmptyKeyHandler = void *; - template + template void mergeDataImpl( Table & table_dst, Table & table_src, @@ -1061,7 +1011,7 @@ class Aggregator final template Block convertToBlockImpl( - Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows, bool clear_states, AggregateStateType type) const; + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows, bool clear_states, ConvertType type) const; template void insertAggregatesIntoColumns( @@ -1070,13 +1020,12 @@ class Aggregator final Arena * arena, bool clear_states) const; - template Block insertResultsIntoColumns( PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena, bool clear_states) const; - template + template Block convertToBlockImplFinal( - Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows, bool clear_states) const; + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows, bool clear_states, ConvertType type) const; template Block convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t rows) const; @@ -1089,7 +1038,7 @@ class Aggregator final bool final, bool clear_states, Int64 bucket, - AggregateStateType type = AggregateStateType::Normal) const; + ConvertType type = ConvertType::Normal) const; /// proton: starts. template @@ -1111,58 +1060,34 @@ class Aggregator final void serializeAggregateStates(const AggregateDataPtr & place, WriteBuffer & wb) const; void deserializeAggregateStates(AggregateDataPtr & place, ReadBuffer & rb, Arena * arena) const; - void clearDataVariants(AggregatedDataVariants & data_variants) const; - - /// @return does need abort ? 
- bool checkAndProcessResult(AggregatedDataVariants & result, bool & no_more_keys) const; + /// \return true means execution must be aborted, false means normal + bool checkAndProcessResult(AggregatedDataVariants & result) const; template bool executeAndRetractImpl( Method & method, Arena * aggregates_pool, + Method & retracted_method, Arena * retracted_pool, size_t row_begin, size_t row_end, ColumnRawPtrs & key_columns, AggregateFunctionInstruction * aggregate_instructions) const; - template - void mergeUpdatedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const; template - void mergeRetractedGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const; + void mergeRetractedGroupsImpl(ManyAggregatedDataVariants & aggregated_data, ManyAggregatedDataVariants & retracted_data) const; - template - void mergeRetractedIntoImpl(Method & method, Method & retracted_method, Arena * arena) const; + template + void mergeUpdateGroupsImpl(ManyAggregatedDataVariants & non_empty_data, Arena * arena) const; /// proton: ends. - Block prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows, bool clear_states, AggregateStateType type = AggregateStateType::Normal) const; - Block prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, AggregateStateType type = AggregateStateType::Normal) const; - BlocksList prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads, AggregateStateType type = AggregateStateType::Normal) const; + Block prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool clear_states, ConvertType type = ConvertType::Normal) const; + Block prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, ConvertType type = ConvertType::Normal) const; + BlocksList prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, bool clear_states, size_t max_threads, ConvertType type = ConvertType::Normal) const; template BlocksList prepareBlocksAndFillTwoLevelImpl( - AggregatedDataVariants & data_variants, Method & method, bool final, bool clear_states, ThreadPool * thread_pool, AggregateStateType type) const; - - template - void mergeStreamsImplCase( - Block & block, - Arena * aggregates_pool, - Method & method, - Table & data, - AggregateDataPtr overflow_row) const; - - template - void mergeStreamsImpl( - Block & block, - Arena * aggregates_pool, - Method & method, - Table & data, - AggregateDataPtr overflow_row, - bool no_more_keys) const; - - void mergeWithoutKeyStreamsImpl( - Block & block, - AggregatedDataVariants & result) const; + AggregatedDataVariants & data_variants, Method & method, bool final, bool clear_states, ThreadPool * thread_pool, ConvertType type) const; template void mergeBucketImpl( @@ -1187,9 +1112,8 @@ class Aggregator final * If it is exceeded, then, depending on the group_by_overflow_mode, either * - throws an exception; * - returns false, which means that execution must be aborted; - * - sets the variable no_more_keys to true. 
*/ - bool checkLimits(size_t result_size, bool & no_more_keys) const; + bool checkLimits(size_t result_size) const; void prepareAggregateInstructions( Columns columns, @@ -1213,9 +1137,9 @@ class Aggregator final /// Existed versions: /// STATE V1 - Legacy version (REVISION 1) /// STATE V2 - REVISION 1 (Enable revision) - /// STATE V3 - REVISION 3 (Add expanded data) + /// STATE V3 - REVISION 3 (Add updates tracking state) static constexpr UInt64 STATE_V2_MIN_REVISION = 1; - static constexpr UInt64 STATE_V3_MIN_REVISION = 3; + // static constexpr UInt64 STATE_V3_MIN_REVISION = 3; /// will enable it later VersionType getVersionFromRevision(UInt64 revision) const; VersionType getVersion() const; @@ -1224,11 +1148,9 @@ class Aggregator final void recover(AggregatedDataVariants & data_variants, ReadBuffer & rb) const; private: - /// [Version-3] void doCheckpointV3(const AggregatedDataVariants & data_variants, WriteBuffer & wb) const; void doRecoverV3(AggregatedDataVariants & data_variants, ReadBuffer & rb) const; - /// [Version-2] void doCheckpointV2(const AggregatedDataVariants & data_variants, WriteBuffer & wb) const; void doRecoverV2(AggregatedDataVariants & data_variants, ReadBuffer & rb) const; diff --git a/src/Interpreters/Streaming/UpdatesTrackingData.h b/src/Interpreters/Streaming/UpdatesTrackingData.h new file mode 100644 index 00000000000..bd44c79e3d7 --- /dev/null +++ b/src/Interpreters/Streaming/UpdatesTrackingData.h @@ -0,0 +1,105 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +using AggregateDataPtr = char *; +using ConstAggregateDataPtr = const char *; + +namespace Streaming +{ +SERDE struct TrackingUpdates +{ + static ALWAYS_INLINE TrackingUpdates & data(AggregateDataPtr __restrict place) { return *reinterpret_cast(place); } + static ALWAYS_INLINE const TrackingUpdates & data(ConstAggregateDataPtr __restrict place) { return *reinterpret_cast(place); } + + static ALWAYS_INLINE bool empty(ConstAggregateDataPtr __restrict place) { return data(place).updates == 0; } + static ALWAYS_INLINE bool updated(ConstAggregateDataPtr __restrict place) { return data(place).updated_since_last_finalization; } + static ALWAYS_INLINE void setUpdated(AggregateDataPtr __restrict place) { data(place).updated_since_last_finalization = true; } + static ALWAYS_INLINE void resetUpdated(AggregateDataPtr __restrict place) { data(place).updated_since_last_finalization = false; } + + static void addBatch(size_t row_begin, size_t row_end, AggregateDataPtr * places, const IColumn * delta_col) + { + if (delta_col == nullptr) + { + for (size_t i = row_begin; i < row_end; ++i) + if (places[i]) + data(places[i]).add(); + } + else + { + const auto & delta_flags = assert_cast(*delta_col).getData(); + for (size_t i = row_begin; i < row_end; ++i) + { + if (places[i]) + { + if (delta_flags[i] >= 0) + data(places[i]).add(); + else + data(places[i]).negate(); + } + } + } + } + + static void addBatchSinglePlace(size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn * delta_col) + { + if (!place) + return; + + auto & data_ex = data(place); + if (delta_col == nullptr) + data_ex.updates += row_end - row_begin; + else + { + const auto & delta_flags = assert_cast(*delta_col).getData(); + data_ex.updates = std::accumulate(delta_flags.begin(), delta_flags.end(), data_ex.updates); + } + + data_ex.updated_since_last_finalization = true; + } + + static void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & wb) + { + const auto & data_ex = data(place); + 
writeIntBinary(data_ex.updates, wb); + writeBinary(data_ex.updated_since_last_finalization, wb); + } + + static void deserialize(AggregateDataPtr __restrict place, ReadBuffer & rb) + { + auto & data_ex = data(place); + readIntBinary(data_ex.updates, rb); + readBinary(data_ex.updated_since_last_finalization, rb); + } + + ALWAYS_INLINE void add() + { + ++updates; + updated_since_last_finalization = true; + } + + ALWAYS_INLINE void negate() + { + --updates; + updated_since_last_finalization = true; + } + + /// Used to track whether the tracked target's changes sum to zero (net count of additions minus retractions) + UInt64 updates = 0; + + /// Used to track whether the tracked group has been updated since the last finalization + bool updated_since_last_finalization = true; +}; + +enum class TrackingUpdatesType : uint8_t +{ + None = 0, + Updates = 1, +}; + +} +} diff --git a/src/Processors/Transforms/Streaming/AggregatingHelper.cpp b/src/Processors/Transforms/Streaming/AggregatingHelper.cpp index 849b82b802c..c6bca59183f 100644 --- a/src/Processors/Transforms/Streaming/AggregatingHelper.cpp +++ b/src/Processors/Transforms/Streaming/AggregatingHelper.cpp @@ -30,18 +30,12 @@ Chunk mergeBlocksToChunk(BlocksList && blocks) return merged_chunk; } -Chunk convertToChunkImpl(AggregatedDataVariants & data, const AggregatingTransformParams & params, AggregateStateType type) +Chunk convertToChunkImpl(AggregatedDataVariants & data, const AggregatingTransformParams & params) { if (data.empty()) return {}; - BlocksList blocks; - if (type == AggregateStateType::OnlyUpdated) - blocks = params.aggregator.convertUpdatedToBlocks(data); - else if (type == AggregateStateType::OnlyRetracted) - blocks = params.aggregator.convertRetractedToBlocks(data); - else - blocks = params.aggregator.convertToBlocks(data, params.final, !params.params.keep_state, params.params.max_threads); + BlocksList blocks = params.aggregator.convertToBlocks(data, params.final, params.params.max_threads); /// FIXME: When global aggr states was converted two level hash table, the merged chunk may be too large return mergeBlocksToChunk(std::move(blocks)); @@ -52,12 +46,12 @@ namespace AggregatingHelper { Chunk convertToChunk(AggregatedDataVariants & data, const AggregatingTransformParams & params) { - return convertToChunkImpl(data, params, AggregateStateType::Normal); + return convertToChunkImpl(data, params); } Chunk mergeAndConvertToChunk(ManyAggregatedDataVariants & data, const AggregatingTransformParams & params) { - auto blocks = params.aggregator.mergeAndConvertToBlocks(data, params.final, !params.params.keep_state, params.params.max_threads); + auto blocks = params.aggregator.mergeAndConvertToBlocks(data, params.final, params.params.max_threads); /// FIXME: When global aggr states was converted two level hash table, the merged chunk may be too large return mergeBlocksToChunk(std::move(blocks)); } @@ -65,68 +59,51 @@ Chunk mergeAndConvertToChunk(ManyAggregatedDataVariants & data, const Aggregatin Chunk spliceAndConvertBucketsToChunk( AggregatedDataVariants & data, const AggregatingTransformParams & params, const std::vector & buckets) { - return convertToChunk(params.aggregator.spliceAndConvertBucketsToBlock(data, params.final, /*clear_states*/ false, buckets)); + return convertToChunk(params.aggregator.spliceAndConvertBucketsToBlock(data, params.final, buckets)); } Chunk mergeAndSpliceAndConvertBucketsToChunk( ManyAggregatedDataVariants & data, const AggregatingTransformParams & params, const std::vector & buckets) { - return
convertToChunk(params.aggregator.mergeAndSpliceAndConvertBucketsToBlock(data, params.final, /*clear_states*/ false, buckets)); + return convertToChunk(params.aggregator.mergeAndSpliceAndConvertBucketsToBlock(data, params.final, buckets)); } -ChunkPair convertToChangelogChunk(AggregatedDataVariants & data, const AggregatingTransformParams & params) -{ - if (data.empty()) - return {}; - - auto retracted_chunk = convertToChunkImpl(data, params, AggregateStateType::OnlyRetracted); - if (retracted_chunk) - { - auto retracted_delta_col = ColumnInt8::create(retracted_chunk.rows(), Int8(-1)); - retracted_chunk.addColumn(std::move(retracted_delta_col)); - retracted_chunk.setConsecutiveDataFlag(); - } - - auto chunk = convertToChunkImpl(data, params, AggregateStateType::OnlyUpdated); - if (chunk) - { - auto delta_col = ColumnInt8::create(chunk.rows(), Int8(1)); - chunk.addColumn(std::move(delta_col)); - } - return {std::move(retracted_chunk), std::move(chunk)}; -} - -ChunkPair mergeAndConvertToChangelogChunk(ManyAggregatedDataVariants & data, const AggregatingTransformParams & params) -{ - if (data.size() == 1) - return convertToChangelogChunk(*data[0], params); - - ChunkPair results; - auto & [retracted_chunk, chunk] = results; - - auto merged_retracted_data = params.aggregator.mergeRetractedGroups(data); - if (merged_retracted_data) - { - retracted_chunk = convertToChunk(*merged_retracted_data, params); - if (retracted_chunk) - { - auto retracted_delta_col = ColumnInt8::create(retracted_chunk.rows(), Int8(-1)); - retracted_chunk.addColumn(std::move(retracted_delta_col)); - retracted_chunk.setConsecutiveDataFlag(); - } - } - - auto merged_updated_data = params.aggregator.mergeUpdatedGroups(data); - if (merged_updated_data) - { - chunk = convertToChunk(*merged_updated_data, params); - if (chunk) - { - auto delta_col = ColumnInt8::create(chunk.rows(), Int8(1)); - chunk.addColumn(std::move(delta_col)); - } - } - return results; +ChunkPair + convertToChangelogChunk(AggregatedDataVariants & data, RetractedDataVariants & retracted_data, const AggregatingTransformParams & params) + { + if (data.empty()) + return {}; + + assert(!retracted_data.empty()); + + auto retracted_chunk = convertToChunk(retracted_data, params); + if (retracted_chunk) + { + auto retracted_delta_col = ColumnInt8::create(retracted_chunk.rows(), Int8(-1)); + retracted_chunk.addColumn(std::move(retracted_delta_col)); + retracted_chunk.setConsecutiveDataFlag(); + } + retracted_data.reset(); /// Clean up retract data after finalized + + auto chunk = convertToChunk(data, params); + if (chunk) + { + auto delta_col = ColumnInt8::create(chunk.rows(), Int8(1)); + chunk.addColumn(std::move(delta_col)); + } + + return {std::move(retracted_chunk), std::move(chunk)}; + } + + ChunkPair mergeAndConvertToChangelogChunk( + ManyAggregatedDataVariants & data, ManyRetractedDataVariants & retracted_data, const AggregatingTransformParams & params) + { + auto [merged_data, merged_retracted_data] = params.aggregator.mergeRetractedGroups(data, retracted_data); + if (!merged_data) + return {}; + + assert(merged_retracted_data); + return convertToChangelogChunk(*merged_data, *merged_retracted_data, params); } } } diff --git a/src/Processors/Transforms/Streaming/AggregatingHelper.h b/src/Processors/Transforms/Streaming/AggregatingHelper.h index 85b177b5b51..5ca32f6fc00 100644 --- a/src/Processors/Transforms/Streaming/AggregatingHelper.h +++ b/src/Processors/Transforms/Streaming/AggregatingHelper.h @@ -38,13 +38,16 @@ Chunk mergeAndSpliceAndConvertBucketsToChunk( 
ManyAggregatedDataVariants & data, const AggregatingTransformParams & params, const std::vector & buckets); /// Only used for emit changelog -/// @brief only convert the state of changed groups (retracted: last state, aggregated: current state) -/// \data: current aggregated state of all groups (contains retracted states and updated states) +/// @brief Based on the new/updated groups in @p retracted_data, convert only the state of changed groups (retracted: last state, aggregated: current state) +/// \data: current aggregated state of all groups +/// \retracted_data: only holds the last state of changed groups (i.e. new/updated/deleted) /// @returns /// retracted_chunk: just contains retracted data of changed groups /// aggregated_chunk: just contains aggregated data of changed groups -ChunkPair convertToChangelogChunk(AggregatedDataVariants & data, const AggregatingTransformParams & params); -ChunkPair mergeAndConvertToChangelogChunk(ManyAggregatedDataVariants & data, const AggregatingTransformParams & params); +ChunkPair +convertToChangelogChunk(AggregatedDataVariants & data, RetractedDataVariants & retracted_data, const AggregatingTransformParams & params); +ChunkPair mergeAndConvertToChangelogChunk( + ManyAggregatedDataVariants & data, ManyRetractedDataVariants & retracted_data, const AggregatingTransformParams & params); } } diff --git a/src/Processors/Transforms/Streaming/AggregatingTransform.cpp b/src/Processors/Transforms/Streaming/AggregatingTransform.cpp index b9fa8205e75..22489fc9bc3 100644 --- a/src/Processors/Transforms/Streaming/AggregatingTransform.cpp +++ b/src/Processors/Transforms/Streaming/AggregatingTransform.cpp @@ -189,22 +189,11 @@ void AggregatingTransform::consume(Chunk chunk) std::pair AggregatingTransform::executeOrMergeColumns(Chunk & chunk, size_t num_rows) { auto columns = chunk.detachColumns(); - if (params->only_merge) - { - auto block = getInputs().front().getHeader().cloneWithColumns(columns); - materializeBlockInplace(block); - /// FIXME - /// Blocking finalization during execution on current variant - std::lock_guard lock(variants_mutex); - auto success = params->aggregator.mergeOnBlock(block, variants, no_more_keys); - return {!success, false}; - } - else - { - /// Blocking finalization during execution on current variant - std::lock_guard lock(variants_mutex); - return params->aggregator.executeOnBlock(std::move(columns), 0, num_rows, variants, key_columns, aggregate_columns, no_more_keys); - } + assert(!params->only_merge && !no_more_keys); + + /// Blocking finalization during execution on current variant + std::lock_guard lock(variants_mutex); + return params->aggregator.executeOnBlock(std::move(columns), 0, num_rows, variants, key_columns, aggregate_columns); } void AggregatingTransform::emitVersion(Chunk & chunk) diff --git a/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp b/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp index fd1cda27554..208549bb6f4 100644 --- a/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp +++ b/src/Processors/Transforms/Streaming/AggregatingTransformWithSubstream.cpp @@ -226,10 +226,10 @@ std::pair AggregatingTransformWithSubstream::executeOrMergeColumns(C /// according to partition keys auto num_rows = chunk.getNumRows(); - assert(!params->only_merge); + assert(!params->only_merge && !no_more_keys); return params->aggregator.executeOnBlock( -
chunk.detachColumns(), 0, num_rows, substream_ctx->variants, key_columns, aggregate_columns); } SubstreamContextPtr AggregatingTransformWithSubstream::getOrCreateSubstreamContext(const SubstreamID & id) @@ -270,7 +270,7 @@ void AggregatingTransformWithSubstream::checkpoint(CheckpointContextPtr ckpt_ctx for (const auto & [id, substream_ctx] : substream_contexts) { assert(id == substream_ctx->id); - serialize(*substream_ctx, wb, getVersion()); + substream_ctx->serialize(wb, getVersion()); } }); } @@ -284,7 +284,7 @@ void AggregatingTransformWithSubstream::recover(CheckpointContextPtr ckpt_ctx) for (size_t i = 0; i < num_substreams; ++i) { auto substream_ctx = std::make_shared(this); - deserialize(*substream_ctx, rb, version_); + substream_ctx->deserialize(rb, version_); substream_contexts.emplace(substream_ctx->id, std::move(substream_ctx)); } }); @@ -294,7 +294,7 @@ void SubstreamContext::serialize(WriteBuffer & wb, VersionType version) const { DB::Streaming::serialize(id, wb); - DB::serialize(variants, wb, aggregating_transform->params->aggregator); + variants.serialize(wb, aggregating_transform->params->aggregator); DB::writeIntBinary(finalized_watermark, wb); @@ -312,7 +312,7 @@ void SubstreamContext::deserialize(ReadBuffer & rb, VersionType version) { DB::Streaming::deserialize(id, rb); - DB::deserialize(variants, rb, aggregating_transform->params->aggregator); + variants.deserialize(rb, aggregating_transform->params->aggregator); DB::readIntBinary(finalized_watermark, rb); diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp index 3049e4bebce..365c5621a37 100644 --- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp +++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.cpp @@ -9,7 +9,6 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int UNSUPPORTED; -extern const int RECOVER_CHECKPOINT_FAILED; } namespace Streaming @@ -41,58 +40,35 @@ GlobalAggregatingTransform::GlobalAggregatingTransform( if (unlikely(params->params.overflow_row)) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Overflow row processing is not implemented in global aggregation"); + /// Need extra retracted data if (params->emit_changelog) { if (params->emit_version) throw Exception(ErrorCodes::UNSUPPORTED, "'emit_version()' is not supported in global aggregation emit changelog"); - bool retract_enabled = false; + ManyRetractedDataVariants retracted_data(many_data->variants.size()); + for (auto & elem : retracted_data) + elem = std::make_shared(); + many_data->setField( - {retract_enabled, + {std::move(retracted_data), /// Field serializer - [](const std::any & field, WriteBuffer & wb, [[maybe_unused]] VersionType version) { - assert(version >= IMPL_V2_MIN_VERSION); - DB::writeBoolText(std::any_cast(field), wb); + [this](const std::any & field, WriteBuffer & wb, VersionType) { + const auto & data = std::any_cast(field); + DB::writeIntBinary(data.size(), wb); + for (const auto & elem : data) + elem->serialize(wb, params->aggregator); }, /// Field deserializer - [this](std::any & field, ReadBuffer & rb, VersionType version) { - if (version >= IMPL_V2_MIN_VERSION) - { - DB::readBoolText(std::any_cast(field), rb); - } - else + [this](std::any & field, ReadBuffer & rb, VersionType) { + auto & data = std::any_cast(field); + size_t num; + DB::readIntBinary(num, rb); + data.resize(num); + for (auto & elem : data) { - /// Convert old impl to new impl V2 - if 
(params->aggregator.expandedDataType() != ExpandedDataType::UpdatedWithRetracted) - throw Exception( - ErrorCodes::RECOVER_CHECKPOINT_FAILED, - "Failed to recover aggregation checkpoint. Recover old version '{}' checkpoint, checkpointed need retracted, " - "but " - "current not need", - version); - - size_t retracted_num; - DB::readIntBinary(retracted_num, rb); - if (retracted_num != many_data->variants.size()) - throw Exception( - ErrorCodes::RECOVER_CHECKPOINT_FAILED, - "Failed to recover aggregation checkpoint. Recover old version '{}' checkpoint but the scale of the pipeline " - "is " - "inconsistent, checkpointed={}, current={}", - version, - retracted_num, - many_data->variants.size()); - - bool has_retracted = false; - for (auto & current : many_data->variants) - { - AggregatedDataVariants retracted; - DB::deserialize(retracted, rb, params->aggregator); - has_retracted |= retracted.size() > 0; - params->aggregator.mergeRetractedInto(*current, std::move(retracted)); - } - - std::any_cast(field) = many_data->emited_version > 0 || has_retracted; /// retracted enabled + elem = std::make_shared(); + elem->deserialize(rb, params->aggregator); } }}); } @@ -126,18 +102,15 @@ std::pair GlobalAggregatingTransform::executeOrMergeColumns(Chunk & { if (params->emit_changelog) { - assert(!params->only_merge); + assert(!params->only_merge && !no_more_keys); + + auto & retracted_variants = many_data->getField()[current_variant]; + auto & aggregated_variants = many_data->variants[current_variant]; + /// Blocking finalization during execution on current variant std::lock_guard lock(variants_mutex); - - /// Enable retract after first finalization - auto retract_enabled = many_data->getField(); - if (retract_enabled) [[likely]] - return params->aggregator.executeAndRetractOnBlock( - chunk.detachColumns(), 0, num_rows, variants, key_columns, aggregate_columns, no_more_keys); - else - return params->aggregator.executeOnBlock( - chunk.detachColumns(), 0, num_rows, variants, key_columns, aggregate_columns, no_more_keys); + return params->aggregator.executeAndRetractOnBlock( + chunk.detachColumns(), 0, num_rows, *aggregated_variants, *retracted_variants, key_columns, aggregate_columns); } else return AggregatingTransform::executeOrMergeColumns(chunk, num_rows); @@ -154,9 +127,8 @@ void GlobalAggregatingTransform::finalize(const ChunkContextPtr & chunk_ctx) if (params->emit_changelog) { - auto [retracted_chunk, chunk] = AggregatingHelper::mergeAndConvertToChangelogChunk(many_data->variants, *params); - /// Enable retract after first finalization - many_data->getField() |= chunk.rows(); + auto [retracted_chunk, chunk] = AggregatingHelper::mergeAndConvertToChangelogChunk( + many_data->variants, many_data->getField(), *params); chunk.setChunkContext(chunk_ctx); setCurrentChunk(std::move(chunk), std::move(retracted_chunk)); diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h index 975fe4e115f..474824e1977 100644 --- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h +++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransform.h @@ -28,8 +28,6 @@ class GlobalAggregatingTransform final : public AggregatingTransform bool prepareFinalization(Int64 min_watermark) override; void finalize(const ChunkContextPtr & chunk_ctx) override; - - static constexpr VersionType IMPL_V2_MIN_VERSION = 3; }; } diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp 
b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp index d59f40c2199..b682e02ea85 100644 --- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp +++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.cpp @@ -9,7 +9,6 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int UNSUPPORTED; -extern const int RECOVER_CHECKPOINT_FAILED; } namespace Streaming @@ -29,41 +28,19 @@ GlobalAggregatingTransformWithSubstream::GlobalAggregatingTransformWithSubstream SubstreamContextPtr GlobalAggregatingTransformWithSubstream::getOrCreateSubstreamContext(const SubstreamID & id) { auto substream_ctx = AggregatingTransformWithSubstream::getOrCreateSubstreamContext(id); - /// Need extra retracted data for old version impl if (params->emit_changelog && !substream_ctx->hasField()) { - bool retract_enabled = false; substream_ctx->setField( - {retract_enabled, + {std::make_shared(), /// Field serializer - [](const std::any & field, WriteBuffer & wb, VersionType version) { - assert(version >= IMPL_V2_MIN_VERSION); - DB::writeBoolText(std::any_cast(field), wb); + [this](const std::any & field, WriteBuffer & wb, VersionType) { + const auto & data = std::any_cast(field); + data->serialize(wb, params->aggregator); }, /// Field deserializer - [substream_ctx, this](std::any & field, ReadBuffer & rb, VersionType version) { - if (version >= IMPL_V2_MIN_VERSION) - { - DB::readBoolText(std::any_cast(field), rb); - } - else - { - /// Convert old impl to new impl V2 - if (params->aggregator.expandedDataType() != ExpandedDataType::UpdatedWithRetracted) - throw Exception( - ErrorCodes::RECOVER_CHECKPOINT_FAILED, - "Failed to recover aggregation checkpoint. Recover old version '{}' checkpoint, checkpointed need retracted, " - "but " - "current not need", - version); - - AggregatedDataVariants retracted; - DB::deserialize(retracted, rb, params->aggregator); - bool has_retracted = retracted.size() > 0; - params->aggregator.mergeRetractedInto(substream_ctx->variants, std::move(retracted)); - - std::any_cast(field) = substream_ctx->emited_version > 0 || has_retracted; /// retracted enabled - } + [this](std::any & field, ReadBuffer & rb, VersionType) { + auto & data = std::any_cast(field); + data->deserialize(rb, params->aggregator); }}); } return substream_ctx; @@ -74,15 +51,14 @@ GlobalAggregatingTransformWithSubstream::executeOrMergeColumns(Chunk & chunk, co { if (params->emit_changelog) { - assert(!params->only_merge); + assert(!params->only_merge && !no_more_keys); + auto num_rows = chunk.getNumRows(); - auto retract_enabled = substream_ctx->getField(); - if (retract_enabled) [[likely]] - return params->aggregator.executeAndRetractOnBlock( - chunk.detachColumns(), 0, num_rows, substream_ctx->variants, key_columns, aggregate_columns, no_more_keys); - else - return params->aggregator.executeOnBlock( - chunk.detachColumns(), 0, num_rows, substream_ctx->variants, key_columns, aggregate_columns, no_more_keys); + auto & retracted_variants = substream_ctx->getField(); + auto & aggregated_variants = substream_ctx->variants; + + return params->aggregator.executeAndRetractOnBlock( + chunk.detachColumns(), 0, num_rows, aggregated_variants, *retracted_variants, key_columns, aggregate_columns); } else return AggregatingTransformWithSubstream::executeOrMergeColumns(chunk, substream_ctx); @@ -111,10 +87,8 @@ void GlobalAggregatingTransformWithSubstream::finalize(const SubstreamContextPtr auto start = MonotonicMilliseconds::now(); if 
(params->emit_changelog) { - auto [retracted_chunk, chunk] = AggregatingHelper::convertToChangelogChunk(variants, *params); - /// Enable retract after first finalization - substream_ctx->getField() |= chunk.rows(); - + auto [retracted_chunk, chunk] + = AggregatingHelper::convertToChangelogChunk(variants, *substream_ctx->getField(), *params); chunk.setChunkContext(chunk_ctx); setCurrentChunk(std::move(chunk), std::move(retracted_chunk)); } diff --git a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h index 72bc161bf7c..27c69ba6ac5 100644 --- a/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h +++ b/src/Processors/Transforms/Streaming/GlobalAggregatingTransformWithSubstream.h @@ -21,8 +21,6 @@ class GlobalAggregatingTransformWithSubstream final : public AggregatingTransfor private: void finalize(const SubstreamContextPtr & substream_ctx, const ChunkContextPtr & chunk_ctx) override; - - static constexpr VersionType IMPL_V2_MIN_VERSION = 3; }; } From 7a78f20e1dd244775b2d2f92fb8da74c2510bfcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lisen=20=E6=9D=A8?= Date: Sat, 3 Feb 2024 15:13:44 +0800 Subject: [PATCH 5/5] fix comments 2 * rename UpdatesTrackingData.h to TrackingUpdatesData.h * use temp arena instead of shared ptr of arena --- src/Common/serde.h | 44 ------------------ src/Interpreters/Streaming/Aggregator.cpp | 45 ++++++++----------- src/Interpreters/Streaming/Aggregator.h | 5 ++- src/Interpreters/Streaming/HashJoin.cpp | 32 ++++++------- ...esTrackingData.h => TrackingUpdatesData.h} | 0 src/Interpreters/Streaming/joinData.cpp | 12 ++--- .../tests/gtest_streaming_hash_join.cpp | 6 +-- .../Streaming/AggregatingTransform.cpp | 4 +- .../Streaming/ChangelogConvertTransform.cpp | 8 ++-- 9 files changed, 53 insertions(+), 103 deletions(-) rename src/Interpreters/Streaming/{UpdatesTrackingData.h => TrackingUpdatesData.h} (100%) diff --git a/src/Common/serde.h b/src/Common/serde.h index d6e51e17dc0..b6bfa951081 100644 --- a/src/Common/serde.h +++ b/src/Common/serde.h @@ -1,53 +1,9 @@ #pragma once -#include -#include #include namespace DB { -/// REQUIRES: The object must support versioned serialization/deserialization -template -concept VersionedSerializable - = requires(const S & s, WB & wb, VersionType version, Args &&... args) { s.serialize(wb, version, std::forward(args)...); }; - -template -concept VersionedDeserializable - = requires(S & s, RB & rb, VersionType version, Args &&... args) { s.deserialize(rb, version, std::forward(args)...); }; - -template S> -void ALWAYS_INLINE serialize(const S & s, WB & wb, VersionType version, Args &&... args) -{ - s.serialize(wb, version, std::forward(args)...); } - -template S> -void ALWAYS_INLINE deserialize(S & s, RB & rb, VersionType version, Args &&... args) -{ - s.deserialize(rb, version, std::forward(args)...); -} - -/// With owned versions -template -concept Serializable - = requires(const S & s, WB & wb, Args &&... args) { s.serialize(wb, std::forward(args)...); }; - -template -concept Deserializable - = requires(S & s, RB & rb, Args &&... args) { s.deserialize(rb, std::forward(args)...); }; - -template S> -void ALWAYS_INLINE serialize(const S & s, WB & wb, Args &&... args) -{ - s.serialize(wb, std::forward(args)...); -} - -template S> -void ALWAYS_INLINE deserialize(S & s, RB & rb, Args &&...
args) -{ - s.deserialize(rb, std::forward(args)...); -} - /// macro tag to indicate the data members or struct or class will /// be serialized / deserialized via network or file system IO. /// Hence, data structure versioning / backward / forward compatibility diff --git a/src/Interpreters/Streaming/Aggregator.cpp b/src/Interpreters/Streaming/Aggregator.cpp index d82dc8b1f8e..8de5a063791 100644 --- a/src/Interpreters/Streaming/Aggregator.cpp +++ b/src/Interpreters/Streaming/Aggregator.cpp @@ -109,8 +109,9 @@ template BlocksList convertBucketsInParallel(ThreadPool * thread_pool, const std::vector & buckets, BucketConverter && bucket_converter) { std::atomic next_bucket_idx_to_merge = 0; - auto converter = [&](Arena * pool, const std::atomic_flag * cancelled) { + auto converter = [&](const std::atomic_flag * cancelled) { BlocksList blocks; + Arena arena; while (true) { if (cancelled && cancelled->test()) @@ -121,7 +122,7 @@ BlocksList convertBucketsInParallel(ThreadPool * thread_pool, const std::vector< break; auto bucket = buckets[bucket_idx]; - blocks.splice(blocks.end(), bucket_converter(bucket, pool)); + blocks.splice(blocks.end(), bucket_converter(bucket, &arena)); } return blocks; }; @@ -129,16 +130,10 @@ BlocksList convertBucketsInParallel(ThreadPool * thread_pool, const std::vector< size_t num_threads = thread_pool ? std::min(thread_pool->getMaxThreads(), buckets.size()) : 1; if (num_threads <= 1) { - auto arena = std::make_shared(); - return converter(arena.get(), nullptr); + return converter(nullptr); } /// Process in parallel - Arenas pools; - pools.reserve(num_threads); - for (size_t i = pools.size(); i < num_threads; ++i) - pools.push_back(std::make_shared()); - auto results = std::make_shared>(); results->resize(num_threads); thread_pool->setMaxThreads(num_threads); @@ -148,10 +143,10 @@ BlocksList convertBucketsInParallel(ThreadPool * thread_pool, const std::vector< for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) { - thread_pool->scheduleOrThrowOnError([&pools, thread_id, group = CurrentThread::getGroup(), results, &converter, &cancelled] { + thread_pool->scheduleOrThrowOnError([thread_id, group = CurrentThread::getGroup(), results, &converter, &cancelled] { CurrentThread::attachToIfDetached(group); SCOPE_EXIT_SAFE( CurrentThread::detachQueryIfNotDetached() ); - (*results)[thread_id] = converter(pools[thread_id].get(), &cancelled); + (*results)[thread_id] = converter(&cancelled); }); } @@ -857,7 +852,7 @@ template [&](AggregateDataPtr & aggregate_data) { auto data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(data, /*prefix_with_updates_tracking_state*/ false); + createAggregateStates(data, /*prefix_with_updates_tracking_state=*/ false); aggregate_data = data; }, state.getKeyData(), @@ -1483,7 +1478,7 @@ Block NO_INLINE Aggregator::convertToBlockImplFinal( PaddedPODArray places; places.reserve(rows); - bool only_updates = (type == ConvertType::OnlyUpdates); + bool only_updates = (type == ConvertType::Updates); data.forEachValue([&](const auto & key, auto & mapped) { @@ -1617,11 +1612,11 @@ Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_va assert(data_variants.type == AggregatedDataVariants::Type::without_key); - if (type == ConvertType::OnlyUpdates && !TrackingUpdates::updated(data_variants.without_key)) + if (type == ConvertType::Updates && !TrackingUpdates::updated(data_variants.without_key)) return res_header.cloneEmpty(); AggregatedDataWithoutKey & data = 
[&]() -> AggregateDataPtr & { - if (type == ConvertType::OnlyUpdates) + if (type == ConvertType::Updates) { TrackingUpdates::resetUpdated(data_variants.without_key); return data_variants.without_key; @@ -1692,7 +1687,7 @@ BlocksList Aggregator::prepareBlocksAndFillTwoLevelImpl( { return convertBucketsInParallel(thread_pool, method.data.buckets(), [&](Int64 bucket, Arena * arena) -> BlocksList { /// Skip no changed bucket if only updated is requested - if (type == ConvertType::OnlyUpdates && !method.data.isBucketUpdated(bucket)) + if (type == ConvertType::Updates && !method.data.isBucketUpdated(bucket)) return {}; return {convertOneBucketToBlockImpl(data_variants, method, arena, final, clear_states, bucket, type)}; @@ -2331,8 +2326,7 @@ void Aggregator::checkpoint(const AggregatedDataVariants & data_variants, WriteB if (version <= 1) return const_cast(this)->doCheckpointLegacy(data_variants, wb); - - if (version <= 2) + else if (version <= 2) return doCheckpointV2(data_variants, wb); else return doCheckpointV3(data_variants, wb); @@ -2351,8 +2345,7 @@ void Aggregator::recover(AggregatedDataVariants & data_variants, ReadBuffer & rb /// FIXME: Legacy layout needs to be cleaned after no use if (recovered_version <= 1) return const_cast(this)->doRecoverLegacy(data_variants, rb); - - if (recovered_version <= 2) + else if (recovered_version <= 2) return doRecoverV2(data_variants, rb); else return doRecoverV3(data_variants, rb); @@ -3220,7 +3213,7 @@ void Aggregator::doCheckpointV3(const AggregatedDataVariants & data_variants, Wr /// 1) Without key: [uint8][uint16][aggr-func-state-without-key] /// 2) Otherwise: [uint8][uint16][aggr-func-state-for-overflow-row][is_two_level][aggr-func-state-in-hash-map] bool inited = !data_variants.empty(); - writeBoolText(inited, wb); + writeBinary(inited, wb); if (!inited) return; /// No aggregated data yet @@ -3259,7 +3252,7 @@ void Aggregator::doCheckpointV3(const AggregatedDataVariants & data_variants, Wr void Aggregator::doRecoverV3(AggregatedDataVariants & data_variants, ReadBuffer & rb) const { bool inited = !data_variants.empty(); - readBoolText(inited, rb); + readBinary(inited, rb); if (!inited) return; @@ -3393,11 +3386,11 @@ BlocksList Aggregator::convertUpdatesToBlocks(AggregatedDataVariants & data_vari constexpr bool final = true; constexpr bool clear_states = false; if (data_variants.type == AggregatedDataVariants::Type::without_key) - blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, clear_states, ConvertType::OnlyUpdates)); + blocks.emplace_back(prepareBlockAndFillWithoutKey(data_variants, final, clear_states, ConvertType::Updates)); else if (!data_variants.isTwoLevel()) - blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states, ConvertType::OnlyUpdates)); + blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final, clear_states, ConvertType::Updates)); else - blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, /*max_threads*/ 1, ConvertType::OnlyUpdates)); + blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, clear_states, /*max_threads*/ 1, ConvertType::Updates)); size_t rows = 0; size_t bytes = 0; @@ -3457,7 +3450,7 @@ void NO_INLINE Aggregator::mergeUpdateGroupsImpl(ManyAggregatedDataVariants & no auto & dst = dst_it->getMapped(); dst = nullptr; /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. 
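/// The destination entry's states are created without the per-group updates-tracking prefix (note the `false` flag passed below).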
auto aggregate_data = arena->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data, /*prefix_with_updates_tracking_state*/ false); + createAggregateStates(aggregate_data, /*prefix_with_updates_tracking_state=*/ false); dst = aggregate_data; } }; diff --git a/src/Interpreters/Streaming/Aggregator.h b/src/Interpreters/Streaming/Aggregator.h index 32b1a2e141d..37aeac85bfe 100644 --- a/src/Interpreters/Streaming/Aggregator.h +++ b/src/Interpreters/Streaming/Aggregator.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include @@ -79,7 +79,7 @@ namespace Streaming enum class ConvertType : uint8_t { Normal = 0, - OnlyUpdates = 1, + Updates = 1, }; /// using TimeBucketAggregatedDataWithUInt16Key = TimeBucketHashMap>; @@ -376,6 +376,7 @@ SERDE struct AggregatedDataVariants : private boost::noncopyable { aggregates_pools = Arenas(1, std::make_shared()); aggregates_pool = aggregates_pools.back().get(); + /// Enable GC for arena by default. For cases like global aggregation, we will disable it further in \init aggregates_pool->enableRecycle(true); } diff --git a/src/Interpreters/Streaming/HashJoin.cpp b/src/Interpreters/Streaming/HashJoin.cpp index 108cc609cc1..91da977f3eb 100644 --- a/src/Interpreters/Streaming/HashJoin.cpp +++ b/src/Interpreters/Streaming/HashJoin.cpp @@ -2546,9 +2546,9 @@ void HashJoin::serialize(WriteBuffer & wb, VersionType version) const /// Part-4: Buffered data of left/right join stream if (bidirectional_hash_join) - DB::serialize(left_data, wb, version); + left_data.serialize(wb, version); - DB::serialize(right_data, wb, version); + right_data.serialize(wb, version); /// Part-5: Asof type (Optional) bool need_asof = streaming_strictness == Strictness::Range || streaming_strictness == Strictness::Asof; @@ -2564,12 +2564,12 @@ void HashJoin::serialize(WriteBuffer & wb, VersionType version) const if (join_results.has_value()) { assert(retract_push_down && emit_changelog); - DB::serialize(*join_results, wb, version, *this); + join_results->serialize(wb, version, *this); } /// Part-7: Others DB::writeIntBinary(combined_watermark.load(), wb); - DB::serialize(join_metrics, wb, version); + join_metrics.serialize(wb, version); } void HashJoin::deserialize(ReadBuffer & rb, VersionType version) @@ -2664,9 +2664,9 @@ void HashJoin::deserialize(ReadBuffer & rb, VersionType version) /// Part-4: Buffered data of left/right join stream if (bidirectional_hash_join) - DB::deserialize(left_data, rb, version); + left_data.deserialize(rb, version); - DB::deserialize(right_data, rb, version); + right_data.deserialize(rb, version); /// Part-5: Asof type (Optional) bool need_asof = streaming_strictness == Strictness::Range || streaming_strictness == Strictness::Asof; @@ -2705,7 +2705,7 @@ void HashJoin::deserialize(ReadBuffer & rb, VersionType version) join_results.has_value()); assert(retract_push_down && emit_changelog); - DB::deserialize(*join_results, rb, version, *this); + join_results->deserialize(rb, version, *this); } /// Part-7: Others @@ -2713,7 +2713,7 @@ void HashJoin::deserialize(ReadBuffer & rb, VersionType version) DB::readIntBinary(recovered_combined_watermark, rb); combined_watermark = recovered_combined_watermark; - DB::deserialize(join_metrics, rb, version); + join_metrics.deserialize(rb, version); } void HashJoin::JoinResults::serialize(WriteBuffer & wb, VersionType version, const HashJoin & join) const @@ -2723,7 +2723,7 @@ void HashJoin::JoinResults::serialize(WriteBuffer & wb, 
VersionType version, con serializeHashJoinMapsVariants(blocks, *maps, wb, version, sample_block, join); if (version <= CachedBlockMetrics::SERDE_REQUIRED_MAX_VERSION) - DB::serialize(metrics, wb, version); + metrics.serialize(wb, version); } void HashJoin::JoinResults::deserialize(ReadBuffer & rb, VersionType version, const HashJoin & join) @@ -2733,7 +2733,7 @@ void HashJoin::JoinResults::deserialize(ReadBuffer & rb, VersionType version, co deserializeHashJoinMapsVariants(blocks, *maps, rb, version, pool, sample_block, join); if (version <= CachedBlockMetrics::SERDE_REQUIRED_MAX_VERSION) - DB::deserialize(metrics, rb, version); + metrics.deserialize(rb, version); } void HashJoin::JoinData::serialize(WriteBuffer & wb, VersionType version) const @@ -2748,7 +2748,7 @@ void HashJoin::JoinData::serialize(WriteBuffer & wb, VersionType version) const if (has_primary_key_hash_table) { SerializedRowRefListMultipleToIndices serialized_row_ref_list_multiple_to_indices; - DB::serialize(*buffered_data, wb, version, &serialized_row_ref_list_multiple_to_indices); + buffered_data->serialize(wb, version, &serialized_row_ref_list_multiple_to_indices); primary_key_hash_table->map.serialize( /*MappedSerializer*/ @@ -2758,7 +2758,7 @@ void HashJoin::JoinData::serialize(WriteBuffer & wb, VersionType version) const wb); } else - DB::serialize(*buffered_data, wb, version, nullptr); + buffered_data->serialize(wb, version, nullptr); } void HashJoin::JoinData::deserialize(ReadBuffer & rb, VersionType version) @@ -2789,7 +2789,7 @@ void HashJoin::JoinData::deserialize(ReadBuffer & rb, VersionType version) if (has_primary_key_hash_table) { DeserializedIndicesToRowRefListMultiple deserialized_indices_to_multiple_ref; - DB::deserialize(*buffered_data, rb, version, &deserialized_indices_to_multiple_ref); + buffered_data->deserialize(rb, version, &deserialized_indices_to_multiple_ref); primary_key_hash_table->map.deserialize( /*MappedDeserializer*/ @@ -2801,7 +2801,7 @@ void HashJoin::JoinData::deserialize(ReadBuffer & rb, VersionType version) rb); } else - DB::deserialize(*buffered_data, rb, version, nullptr); + buffered_data->deserialize(rb, version, nullptr); } void HashJoin::JoinGlobalMetrics::serialize(WriteBuffer & wb, VersionType) const @@ -2828,7 +2828,7 @@ void serializeHashJoinMapsVariants( SerializedRowRefListMultipleToIndices * serialized_row_ref_list_multiple_to_indices) { SerializedBlocksToIndices serialized_blocks_to_indices; - DB::serialize(blocks, wb, version, header, &serialized_blocks_to_indices); + blocks.serialize(wb, version, header, &serialized_blocks_to_indices); assert(maps.map_variants.size() >= 1); DB::writeIntBinary(static_cast(maps.map_variants.size()), wb); @@ -2883,7 +2883,7 @@ void deserializeHashJoinMapsVariants( DeserializedIndicesToRowRefListMultiple * deserialized_indices_to_multiple_ref) { DeserializedIndicesToBlocks deserialized_indices_to_blocks; - DB::deserialize(blocks, rb, version, header, &deserialized_indices_to_blocks); + blocks.deserialize(rb, version, header, &deserialized_indices_to_blocks); UInt16 maps_size; DB::readIntBinary(maps_size, rb); diff --git a/src/Interpreters/Streaming/UpdatesTrackingData.h b/src/Interpreters/Streaming/TrackingUpdatesData.h similarity index 100% rename from src/Interpreters/Streaming/UpdatesTrackingData.h rename to src/Interpreters/Streaming/TrackingUpdatesData.h diff --git a/src/Interpreters/Streaming/joinData.cpp b/src/Interpreters/Streaming/joinData.cpp index 7896803d732..9c2853a3d31 100644 --- a/src/Interpreters/Streaming/joinData.cpp +++ 
b/src/Interpreters/Streaming/joinData.cpp @@ -326,18 +326,18 @@ void BufferedStreamData::serialize( DB::writeIntBinary(block_id, wb); assert(current_hash_blocks); - DB::serialize(*current_hash_blocks, wb, version, sample_block, *join, serialized_row_ref_list_multiple_to_indices); + current_hash_blocks->serialize(wb, version, sample_block, *join, serialized_row_ref_list_multiple_to_indices); DB::writeIntBinary(static_cast(range_bucket_hash_blocks.size()), wb); for (const auto & [bucket, hash_blocks] : range_bucket_hash_blocks) { DB::writeIntBinary(bucket, wb); assert(hash_blocks); - DB::serialize(*hash_blocks, wb, version, sample_block, *join, serialized_row_ref_list_multiple_to_indices); + hash_blocks->serialize(wb, version, sample_block, *join, serialized_row_ref_list_multiple_to_indices); } if (version <= CachedBlockMetrics::SERDE_REQUIRED_MAX_VERSION) - DB::serialize(metrics, wb, version); + metrics.serialize(wb, version); } void BufferedStreamData::deserialize( @@ -358,7 +358,7 @@ void BufferedStreamData::deserialize( DB::readIntBinary(block_id, rb); assert(current_hash_blocks); - DB::deserialize(*current_hash_blocks, rb, version, sample_block, *join, deserialized_indices_to_row_ref_list_multiple); + current_hash_blocks->deserialize(rb, version, sample_block, *join, deserialized_indices_to_row_ref_list_multiple); UInt32 size; Int64 bucket; @@ -370,11 +370,11 @@ void BufferedStreamData::deserialize( assert(inserted); /// Init hash table join->initHashMaps(iter->second->maps->map_variants); - DB::deserialize(*iter->second, rb, version, sample_block, *join, deserialized_indices_to_row_ref_list_multiple); + iter->second->deserialize(rb, version, sample_block, *join, deserialized_indices_to_row_ref_list_multiple); } if (version <= CachedBlockMetrics::SERDE_REQUIRED_MAX_VERSION) - DB::deserialize(metrics, rb, version); + metrics.deserialize(rb, version); } HashBlocksPtr BufferedStreamData::newHashBlocks() diff --git a/src/Interpreters/Streaming/tests/gtest_streaming_hash_join.cpp b/src/Interpreters/Streaming/tests/gtest_streaming_hash_join.cpp index 244eb64a6a6..7b0fdefedfe 100644 --- a/src/Interpreters/Streaming/tests/gtest_streaming_hash_join.cpp +++ b/src/Interpreters/Streaming/tests/gtest_streaming_hash_join.cpp @@ -258,14 +258,14 @@ std::shared_ptr initHashJoin( void serdeAndCheck(const Streaming::HashJoin & join, Streaming::HashJoin & recovered_join, std::string_view msg) { WriteBufferFromOwnString wb; - DB::serialize(join, wb, ProtonRevision::getVersionRevision()); + join.serialize(wb, ProtonRevision::getVersionRevision()); auto original_string = wb.str(); ReadBufferFromOwnString rb(original_string); - DB::deserialize(recovered_join, rb, ProtonRevision::getVersionRevision()); + recovered_join.deserialize(rb, ProtonRevision::getVersionRevision()); WriteBufferFromOwnString wb2; - DB::serialize(recovered_join, wb2, ProtonRevision::getVersionRevision()); + recovered_join.serialize(wb2, ProtonRevision::getVersionRevision()); auto recovered_string = wb2.str(); ASSERT_EQ(original_string, recovered_string) << msg << ": (FAILED)\n"; diff --git a/src/Processors/Transforms/Streaming/AggregatingTransform.cpp b/src/Processors/Transforms/Streaming/AggregatingTransform.cpp index 22489fc9bc3..1bd1c496237 100644 --- a/src/Processors/Transforms/Streaming/AggregatingTransform.cpp +++ b/src/Processors/Transforms/Streaming/AggregatingTransform.cpp @@ -487,7 +487,7 @@ void AggregatingTransform::checkpoint(CheckpointContextPtr ckpt_ctx) } /// Serializing no shared data - DB::serialize(variants, wb, 
params->aggregator); + variants.serialize(wb, params->aggregator); DB::writeIntBinary(watermark, wb); @@ -543,7 +543,7 @@ void AggregatingTransform::recover(CheckpointContextPtr ckpt_ctx) } /// Serializing local or stable data during checkpointing - DB::deserialize(variants, rb, params->aggregator); + variants.deserialize(rb, params->aggregator); DB::readIntBinary(watermark, rb); diff --git a/src/Processors/Transforms/Streaming/ChangelogConvertTransform.cpp b/src/Processors/Transforms/Streaming/ChangelogConvertTransform.cpp index 2b9f8ceb183..7521957516b 100644 --- a/src/Processors/Transforms/Streaming/ChangelogConvertTransform.cpp +++ b/src/Processors/Transforms/Streaming/ChangelogConvertTransform.cpp @@ -407,7 +407,7 @@ void ChangelogConvertTransform::checkpoint(CheckpointContextPtr ckpt_ctx) { ckpt_ctx->coordinator->checkpoint(getVersion(), getLogicID(), ckpt_ctx, [this](WriteBuffer & wb) { SerializedBlocksToIndices serialized_blocks_to_indices; - DB::serialize(source_chunks, wb, getVersion(), getInputs().front().getHeader(), &serialized_blocks_to_indices); + source_chunks.serialize(wb, getVersion(), getInputs().front().getHeader(), &serialized_blocks_to_indices); index.serialize( /*MappedSerializer*/ @@ -420,7 +420,7 @@ void ChangelogConvertTransform::checkpoint(CheckpointContextPtr ckpt_ctx) DB::writeIntBinary(late_rows, wb); if (version <= CachedBlockMetrics::SERDE_REQUIRED_MAX_VERSION) - DB::serialize(cached_block_metrics, wb, getVersion()); + cached_block_metrics.serialize(wb, getVersion()); }); } @@ -428,7 +428,7 @@ void ChangelogConvertTransform::recover(CheckpointContextPtr ckpt_ctx) { ckpt_ctx->coordinator->recover(getLogicID(), ckpt_ctx, [this](VersionType version_, ReadBuffer & rb) { DeserializedIndicesToBlocks deserialized_indices_to_blocks; - DB::deserialize(source_chunks, rb, version_, getInputs().front().getHeader(), &deserialized_indices_to_blocks); + source_chunks.deserialize(rb, version_, getInputs().front().getHeader(), &deserialized_indices_to_blocks); index.deserialize( /*MappedDeserializer*/ @@ -442,7 +442,7 @@ void ChangelogConvertTransform::recover(CheckpointContextPtr ckpt_ctx) DB::readIntBinary(late_rows, rb); if (version_ <= CachedBlockMetrics::SERDE_REQUIRED_MAX_VERSION) - DB::deserialize(cached_block_metrics, rb, version_); + cached_block_metrics.deserialize(rb, version_); }); } }
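Note (editorial sketch, not part of the patch series): the snippet below illustrates, with a plain std::unordered_map, the updates-tracking idea these patches introduce. Each group carries a small header (a net `updates` counter plus an `updated_since_last_finalization` flag, mirroring TrackingUpdates), every change marks the flag, and finalization emits only the flagged groups before clearing the flag, the analogue of converting with ConvertType::Updates and then resetting the updated state. The names Group, applyRow and finalizeUpdatedGroups are invented for the example; the real code keeps the header as a prefix of each aggregate state inside the hash tables.

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct TrackingHeader
{
    uint64_t updates = 0;                        // net count: additions minus retractions
    bool updated_since_last_finalization = true; // set on every change, cleared when emitted

    void add() { ++updates; updated_since_last_finalization = true; }
    void negate() { --updates; updated_since_last_finalization = true; }
};

struct Group
{
    TrackingHeader tracking; // plays the role of the per-group prefix in the patch
    int64_t sum = 0;         // stand-in for the real aggregate function state
};

using Groups = std::unordered_map<std::string, Group>;

// delta >= 0 is treated as an addition, delta < 0 as a retraction of a previous row.
void applyRow(Groups & groups, const std::string & key, int64_t value, int8_t delta)
{
    auto & group = groups[key];
    if (delta >= 0)
    {
        group.sum += value;
        group.tracking.add();
    }
    else
    {
        group.sum -= value;
        group.tracking.negate();
    }
}

// Emit only the groups whose header says they changed, then clear the flag.
std::vector<std::pair<std::string, int64_t>> finalizeUpdatedGroups(Groups & groups)
{
    std::vector<std::pair<std::string, int64_t>> emitted;
    for (auto & [key, group] : groups)
    {
        if (!group.tracking.updated_since_last_finalization)
            continue;
        emitted.emplace_back(key, group.sum);
        group.tracking.updated_since_last_finalization = false;
    }
    return emitted;
}

int main()
{
    Groups groups;
    applyRow(groups, "a", 10, 1);
    applyRow(groups, "b", 5, 1);
    for (const auto & [key, sum] : finalizeUpdatedGroups(groups))
        std::cout << key << " -> " << sum << '\n'; // both groups are new, both emitted

    applyRow(groups, "a", 3, 1); // only "a" changes after the first emit
    for (const auto & [key, sum] : finalizeUpdatedGroups(groups))
        std::cout << key << " -> " << sum << '\n'; // only "a" is emitted
    return 0;
}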