In [2]:
import clickhouse_connect

# Shared ClickHouse client used by every cell below (local server, `default` user).
client = clickhouse_connect.get_client(
    host='localhost',
    username='default',
)

In [3]:
client.command('CREATE DATABASE IF NOT EXISTS dev_test')
client.database = 'dev_test'

# `calls` and `objects` deliberately share one schema (an id plus a JSON string),
# so both are driven from a single DDL template and cannot drift apart.
PAYLOAD_TABLES = ('calls', 'objects')

# Drop first so that re-running this cell always starts from empty tables.
for table_name in PAYLOAD_TABLES:
    client.command(f"DROP TABLE IF EXISTS {table_name}")

for table_name in PAYLOAD_TABLES:
    client.command(f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id String NOT NULL,
        payload_dump String NOT NULL
    ) ENGINE = MergeTree()
    ORDER BY id;
""")

<clickhouse_connect.driver.summary.QuerySummary at 0x112c76ef0>

In [4]:
import json
import uuid
import pandas as pd

def insert_call(payload: dict) -> str:
    """Insert `payload` into the `calls` table and return the new row's id.

    The row id is a freshly generated UUID4 string; the payload is stored
    JSON-encoded in the `payload_dump` column.
    """
    row_id = str(uuid.uuid4())
    row = [row_id, json.dumps(payload)]
    client.insert("calls", data=[row], column_names=['id', 'payload_dump'])
    return row_id

def insert_object(payload: dict) -> str:
    """Insert `payload` into the `objects` table and return the new row's id.

    The row id is a freshly generated UUID4 string; the payload is stored
    JSON-encoded in the `payload_dump` column.
    """
    row_id = str(uuid.uuid4())
    row = [row_id, json.dumps(payload)]
    client.insert("objects", data=[row], column_names=['id', 'payload_dump'])
    return row_id

def get_calls() -> pd.DataFrame:
    """Return every row of the `calls` table as a DataFrame (payloads still JSON strings)."""
    return client.query_df("SELECT * FROM calls")

def get_objects() -> pd.DataFrame:
    """Return every row of the `objects` table as a DataFrame (payloads still JSON strings)."""
    return client.query_df("SELECT * FROM objects")

def object_ref_from_id(object_id: str) -> str:
    """Return the ref string (`ref://objects/<object_id>`) pointing at the given object id."""
    return "".join(("ref://objects/", object_id))

def insert_ref_object(payload: dict) -> str:
    """Insert `payload` as an object and return the ref string that points at it."""
    return object_ref_from_id(insert_object(payload))


In [5]:
def get_calls_user_perspective() -> pd.DataFrame:
    """Return all calls with their object references expanded.

    Each call's `payload_dump` is decoded into `payload`. Every string value
    that starts with `ref://` is recorded in `__ref_paths__` (dotted key path
    -> original ref string). Refs of the form `ref://objects/<id>` are replaced
    by the referenced object's payload, which is resolved recursively in turn.

    Refs that cannot be expanded are left in the payload as the original
    string (and are still listed in `__ref_paths__`):
      * the object id is not present in the `objects` table,
      * the ref does not use the `ref://objects/` prefix,
      * the ref points back at an object that is currently being expanded
        (a reference cycle).

    Object payloads are never mutated while resolving, so an object shared by
    several calls is expanded afresh for each of them and every call gets its
    own complete `__ref_paths__`.

    Returns:
        One row per call, with the decoded and resolved `payload` and the
        `__ref_paths__` mapping in place of `payload_dump`.
    """
    ref_scheme = 'ref://'
    object_ref_prefix = 'ref://objects/'

    calls = get_calls().to_dict(orient='records')
    for call in calls:
        call['payload'] = json.loads(call.pop('payload_dump'))

    # Index the objects once instead of scanning the whole list per ref.
    # `setdefault` keeps the first row for an id, matching a first-match scan.
    objects_by_id = {}
    for obj in get_objects().to_dict(orient='records'):
        objects_by_id.setdefault(obj['id'], json.loads(obj['payload_dump']))

    def expand_ref(ref: str, path: list, active_ids: frozenset, ref_paths: dict):
        """Return the resolved target of `ref`, or `ref` itself if it cannot be expanded."""
        if not ref.startswith(object_ref_prefix):
            return ref
        object_id = ref[len(object_ref_prefix):]
        # `active_ids` holds the objects on the current expansion chain; meeting
        # one of them again means a cycle, so stop instead of recursing forever.
        if object_id in active_ids or object_id not in objects_by_id:
            return ref
        target = objects_by_id[object_id]
        if isinstance(target, dict):
            return resolve_dict(target, path, active_ids | {object_id}, ref_paths)
        return target

    def resolve_dict(d: dict, path: list, active_ids: frozenset, ref_paths: dict) -> dict:
        """Return a resolved copy of `d`, adding every ref found to `ref_paths`."""
        resolved = {}
        for key, value in d.items():
            key_path = path + [key]
            if isinstance(value, dict):
                resolved[key] = resolve_dict(value, key_path, active_ids, ref_paths)
            elif isinstance(value, str) and value.startswith(ref_scheme):
                ref_paths['.'.join(key_path)] = value
                resolved[key] = expand_ref(value, key_path, active_ids, ref_paths)
            else:
                resolved[key] = value
        return resolved

    for call in calls:
        ref_paths = {}
        call['payload'] = resolve_dict(call['payload'], [], frozenset(), ref_paths)
        call['__ref_paths__'] = ref_paths

    return pd.DataFrame(calls)

In [9]:
# Show the user-facing view: payloads with refs expanded, plus where each ref was found.
# NOTE(review): this cell ran as In [9], i.e. after the fixture cell below (In [6]);
# on a fresh kernel the fixtures must be inserted first for this to show any rows.
calls = get_calls_user_perspective()
calls

Unnamed: 0,id,payload,__ref_paths__
0,a09bb677-2c05-4c8d-80c5-5793ee31d988,{'a': {'b': {'c': {'d': 8}}}},{'a': 'ref://objects/a89c6be1-2422-4794-9764-3...
1,8bceb22d-3895-4f81-8ec1-ee081dfb1001,{'a': {'b': {'c': {'d': 5}}}},{'a': 'ref://objects/bf0b7013-3fe6-497a-8d72-d...
2,8c0933b1-8adb-428f-9427-9daa2c8bba27,{'a': {'b': {'c': {'d': 3}}}},{'a.b': 'ref://objects/5db25f53-055a-4df7-90e5...
3,9620a7a6-8316-49a4-8f09-ee65cbbc4b9a,{'a': {'b': {'c': {'d': 1}}}},{}
4,b9f35417-45c0-4d32-95fd-eecacf836c16,{'a': {'b': {'c': {'d': 4}}}},{'a.b.c': 'ref://objects/5373a5a2-d4e7-4158-b8...
5,dcc83303-f6b3-4c21-9462-ad7fd1dbbb74,{'a': {'b': {'c': {'d': 2}}}},{'a': 'ref://objects/d9fd5ba2-eb66-450e-b437-4...
6,eef41dde-2d41-4a59-add0-dc40fe25342e,{'a': {'b': {'c': {'d': 6}}}},{'a': 'ref://objects/9c9b4226-4a85-4f91-a7f3-b...
7,df335efd-d7b4-46bd-9188-5cd3c1975980,{'a': {'b': {'c': {'d': 7}}}},{'a.b': 'ref://objects/337d128d-1776-454d-83a6...


In [6]:
# Fixture calls. Every call has the same logical shape:
#   {'a': {'b': {'c': {'d': NUMERIC_VALUE}}}}
# and they differ only in which levels are stored as separate objects behind a ref.
# Arguments are evaluated innermost-first, so a referenced object is always
# inserted before the object or call that points at it.
store = insert_ref_object  # shorthand: insert an object, get its ref string back

# No refs
res = insert_call({'a': {'b': {'c': {'d': 1}}}})

# One ref: at a, at b, at c
res = insert_call({'a': store({'b': {'c': {'d': 2}}})})
res = insert_call({'a': {'b': store({'c': {'d': 3}})}})
res = insert_call({'a': {'b': {'c': store({'d': 4})}}})

# Two refs: a + b, a + c, b + c
res = insert_call({'a': store({'b': store({'c': {'d': 5}})})})
res = insert_call({'a': store({'b': {'c': store({'d': 6})}})})
res = insert_call({'a': {'b': store({'c': store({'d': 7})})}})

# Three refs: a, b and c
res = insert_call({'a': store({'b': store({'c': store({'d': 8})})})})

# Display the id of the last call inserted.
res

'a09bb677-2c05-4c8d-80c5-5793ee31d988'

In [7]:
# Raw `calls` rows as stored: payloads are JSON strings and refs are still unresolved.
get_calls()

Unnamed: 0,id,payload_dump
0,8bceb22d-3895-4f81-8ec1-ee081dfb1001,"{""a"": ""ref://objects/bf0b7013-3fe6-497a-8d72-d..."
1,8c0933b1-8adb-428f-9427-9daa2c8bba27,"{""a"": {""b"": ""ref://objects/5db25f53-055a-4df7-..."
2,9620a7a6-8316-49a4-8f09-ee65cbbc4b9a,"{""a"": {""b"": {""c"": {""d"": 1}}}}"
3,b9f35417-45c0-4d32-95fd-eecacf836c16,"{""a"": {""b"": {""c"": ""ref://objects/5373a5a2-d4e7..."
4,dcc83303-f6b3-4c21-9462-ad7fd1dbbb74,"{""a"": ""ref://objects/d9fd5ba2-eb66-450e-b437-4..."
5,eef41dde-2d41-4a59-add0-dc40fe25342e,"{""a"": ""ref://objects/9c9b4226-4a85-4f91-a7f3-b..."
6,df335efd-d7b4-46bd-9188-5cd3c1975980,"{""a"": {""b"": ""ref://objects/337d128d-1776-454d-..."
7,a09bb677-2c05-4c8d-80c5-5793ee31d988,"{""a"": ""ref://objects/a89c6be1-2422-4794-9764-3..."


In [8]:
# Raw `objects` rows as stored; an object's payload may itself contain refs to other objects.
get_objects()

Unnamed: 0,id,payload_dump
0,a89c6be1-2422-4794-9764-3aba73d66421,"{""b"": ""ref://objects/a8da028e-1585-49de-a6c1-8..."
1,0adf3e8b-8f1f-4c1b-815f-56639640fa3f,"{""d"": 6}"
2,337d128d-1776-454d-83a6-8b5337a2a2e0,"{""c"": ""ref://objects/f56218a4-e026-47d2-93a4-e..."
3,5373a5a2-d4e7-4158-b806-371dae40092e,"{""d"": 4}"
4,5db25f53-055a-4df7-90e5-33ade3f17f9a,"{""c"": {""d"": 3}}"
5,81548d94-01d9-46ed-9733-2957ebd0c505,"{""c"": {""d"": 5}}"
6,9c9b4226-4a85-4f91-a7f3-b04d9a9f8dbc,"{""b"": {""c"": ""ref://objects/0adf3e8b-8f1f-4c1b-..."
7,a5d75cd7-12c9-4e74-93d5-2293c0c4ca54,"{""d"": 8}"
8,a8da028e-1585-49de-a6c1-8bdb8a169ae7,"{""c"": ""ref://objects/a5d75cd7-12c9-4e74-93d5-2..."
9,bf0b7013-3fe6-497a-8d72-d1dd0522fcdb,"{""b"": ""ref://objects/81548d94-01d9-46ed-9733-2..."


Learnings:

1. The type of query we want to do requires one of two things:
    1. The query needs to support some form of recursion
    2. The data needs to be de-normalized
2. ClickHouse recently added support for recursive CTEs (https://clickhouse.com/blog/clickhouse-release-24-04), but our use case is complicated enough that a recursive CTE seems unlikely to perform well.
3. The primary drawback of de-normalization is a massive amount of data duplication.

Options considered:

- recursive CTE
- in-memory recursion
- de-normalization -> memory explosion
- dynamic query: maybe the recursion can happen at query build time, since we know the depth of the path up front!



In [10]:
def custom_query_for_d(path):
    """Build one nested ClickHouse query that walks `path` through each call payload.

    The recursion over refs happens here, at query-build time. The innermost
    query selects every call; each element of `path` then wraps the previous
    query in (1) a `JSONExtractString` on that key and (2) a LEFT JOIN against
    `objects` that swaps a `ref://objects/<id>` value for the referenced
    object's `payload_dump`.

    Args:
        path: Sequence of JSON keys to descend through, outermost first.
            With an empty path only the initial ref-resolving join is applied
            to the raw call payloads.

    Returns:
        The SQL text. The query yields the columns `id` and `payload_dump`.
    """
    ref_prefix = 'ref://objects/'
    # ClickHouse `substring` offsets are 1-based, so the object id starts one
    # position past the end of the prefix.
    object_id_offset = len(ref_prefix) + 1

    query_count = 0

    def sql_string_literal(value):
        """Render `value` as a single-quoted SQL string literal.

        Backslashes and single quotes are escaped so a key containing either
        cannot terminate the literal early and alter the query.
        """
        escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
        return f"'{escaped}'"

    def build_query_for_path_part(part, subquery):
        """Wrap `subquery`, replacing its payload with the value stored under key `part`."""
        nonlocal query_count
        query_count += 1
        subquery_name = f"subquery_{query_count}"
        part_literal = sql_string_literal(part)
        return f"""
            SELECT
                {subquery_name}.id as id,
                JSONExtractString({subquery_name}.payload_dump, {part_literal}) as payload_dump
            FROM
                ({subquery}) as {subquery_name}
        """

    def replace_ref_query(query):
        """Wrap `query`, swapping a payload that is an object ref for that object's payload."""
        nonlocal query_count
        query_count += 1

        subquery_name = f"subquery_{query_count}"
        objquery_name = f"objquery_{query_count}"
        # Rows without a matching object keep their own payload. The `!= ''`
        # test relies on unmatched LEFT JOIN rows yielding '' rather than NULL
        # -- presumably the server default (join_use_nulls = 0); confirm.
        return f"""
                SELECT
                    {subquery_name}.id as id,
                    if({objquery_name}.payload_dump != '', {objquery_name}.payload_dump, {subquery_name}.payload_dump) as payload_dump
                FROM
                    ({query}) as {subquery_name}
                LEFT JOIN
                    objects as {objquery_name}
                ON
                    substring({subquery_name}.payload_dump, {object_id_offset}) = {objquery_name}.id
        """

    base_query = replace_ref_query("SELECT calls.id as id, calls.payload_dump as payload_dump FROM calls")
    for part in path:
        base_query = replace_ref_query(build_query_for_path_part(part, base_query))
    return base_query

In [11]:
# Empty path: no key is extracted, so only the initial ref-resolving join runs.
# None of the call payloads is itself a bare ref string, so they come back as stored.
query = custom_query_for_d([])
# print(query)
client.query_df(query)

Unnamed: 0,id,payload_dump
0,a09bb677-2c05-4c8d-80c5-5793ee31d988,"{""a"": ""ref://objects/a89c6be1-2422-4794-9764-3..."
1,df335efd-d7b4-46bd-9188-5cd3c1975980,"{""a"": {""b"": ""ref://objects/337d128d-1776-454d-..."
2,8bceb22d-3895-4f81-8ec1-ee081dfb1001,"{""a"": ""ref://objects/bf0b7013-3fe6-497a-8d72-d..."
3,8c0933b1-8adb-428f-9427-9daa2c8bba27,"{""a"": {""b"": ""ref://objects/5db25f53-055a-4df7-..."
4,9620a7a6-8316-49a4-8f09-ee65cbbc4b9a,"{""a"": {""b"": {""c"": {""d"": 1}}}}"
5,b9f35417-45c0-4d32-95fd-eecacf836c16,"{""a"": {""b"": {""c"": ""ref://objects/5373a5a2-d4e7..."
6,dcc83303-f6b3-4c21-9462-ad7fd1dbbb74,"{""a"": ""ref://objects/d9fd5ba2-eb66-450e-b437-4..."
7,eef41dde-2d41-4a59-add0-dc40fe25342e,"{""a"": ""ref://objects/9c9b4226-4a85-4f91-a7f3-b..."


In [14]:
# Print the generated SQL for the full path a -> b -> c -> d to inspect its nesting
# (one key-extraction subquery plus one ref-resolving join per path element).
# client.query_df(custom_query_for_d(['a', 'b', 'c', 'd']))
print(custom_query_for_d(['a', 'b', 'c', 'd']))

 
                SELECT 
                    subquery_9.id as id,
                    if(objquery_9.payload_dump != '', objquery_9.payload_dump, subquery_9.payload_dump) as payload_dump
                FROM
                    (
            SELECT 
                subquery_8.id as id,
                JSONExtractString(subquery_8.payload_dump, 'd') as payload_dump
            FROM
                ( 
                SELECT 
                    subquery_7.id as id,
                    if(objquery_7.payload_dump != '', objquery_7.payload_dump, subquery_7.payload_dump) as payload_dump
                FROM
                    (
            SELECT 
                subquery_6.id as id,
                JSONExtractString(subquery_6.payload_dump, 'c') as payload_dump
            FROM
                ( 
                SELECT 
                    subquery_5.id as id,
                    if(objquery_5.payload_dump != '', objquery_5.payload_dump, subquery_5.payload_dump) as payload_dump
           

In [208]:
# Run the full-path query: every call comes back with its leaf value at a.b.c.d,
# regardless of how many of the levels were stored behind refs.
# NOTE(review): the ids in this saved output differ from the cells above --
# presumably the tables were re-created and re-seeded in between; confirm.
client.query_df(custom_query_for_d(['a', 'b', 'c', 'd']))
# print(custom_query_for_d(['a', 'b', 'c', 'd']))

Unnamed: 0,id,payload_dump
0,07c241fe-8183-4158-a549-464fa9b104bb,4
1,222f7a11-b395-4b15-9c15-e0c72ae0b4c8,8
2,42af39c7-2c16-4430-ae8e-5d472b4288e2,3
3,46bb213b-b108-47b5-a6eb-b7ec83afbcca,6
4,4bc9edaa-db95-4b19-b193-660b37ff982c,7
5,c07f8f93-636a-4168-80c9-4468e4ee8744,1
6,d77d2bfc-39fd-4cce-9702-c0b8923f7bd9,2
7,f2d0a0fa-8284-4f28-9ad3-f162d538fede,5


In [209]:
# Partial path: resolution stops at `b`, so refs stored deeper (under `c`) remain unresolved.
client.query_df(custom_query_for_d(['a', 'b']))

Unnamed: 0,id,payload_dump
0,07c241fe-8183-4158-a549-464fa9b104bb,"{""c"":""ref://objects/202f16ae-f0aa-4f5c-b89e-2a..."
1,222f7a11-b395-4b15-9c15-e0c72ae0b4c8,"{""c"": ""ref://objects/c1e38a33-17dd-44fe-86ce-7..."
2,42af39c7-2c16-4430-ae8e-5d472b4288e2,"{""c"": {""d"": 3}}"
3,46bb213b-b108-47b5-a6eb-b7ec83afbcca,"{""c"":""ref://objects/70d00e2c-3518-40c1-9019-c8..."
4,4bc9edaa-db95-4b19-b193-660b37ff982c,"{""c"": ""ref://objects/9c85d610-e00e-4abd-a19e-c..."
5,c07f8f93-636a-4168-80c9-4468e4ee8744,"{""c"":{""d"":1}}"
6,d77d2bfc-39fd-4cce-9702-c0b8923f7bd9,"{""c"":{""d"":2}}"
7,f2d0a0fa-8284-4f28-9ad3-f162d538fede,"{""c"": {""d"": 5}}"


In [None]:
def custom_query_for_d(path):
    """Build one nested ClickHouse query that walks `path` through each call payload.

    NOTE: running this cell redefines (shadows) the `custom_query_for_d`
    defined in an earlier cell of this notebook.

    The innermost query selects every call; each element of `path` wraps the
    previous query in (1) a `JSONExtractString` on that key and (2) a LEFT JOIN
    against `objects` that swaps a `ref://objects/<id>` value for the
    referenced object's `payload_dump`.

    Args:
        path: Sequence of JSON keys to descend through, outermost first.
            With an empty path only the initial ref-resolving join is applied
            to the raw call payloads.

    Returns:
        The SQL text. The query yields the columns `id` and `payload_dump`.
    """
    ref_prefix = 'ref://objects/'
    # ClickHouse `substring` offsets are 1-based, so the object id starts one
    # position past the end of the prefix.
    object_id_offset = len(ref_prefix) + 1

    query_count = 0

    def sql_string_literal(value):
        """Render `value` as a single-quoted SQL string literal.

        Backslashes and single quotes are escaped so a key containing either
        cannot terminate the literal early and alter the query.
        """
        escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
        return f"'{escaped}'"

    def build_query_for_path_part(part, subquery):
        """Wrap `subquery`, replacing its payload with the value stored under key `part`."""
        nonlocal query_count
        query_count += 1
        subquery_name = f"subquery_{query_count}"
        part_literal = sql_string_literal(part)
        return f"""
            SELECT
                {subquery_name}.id as id,
                JSONExtractString({subquery_name}.payload_dump, {part_literal}) as payload_dump
            FROM
                ({subquery}) as {subquery_name}
        """

    def replace_ref_query(query):
        """Wrap `query`, swapping a payload that is an object ref for that object's payload."""
        nonlocal query_count
        query_count += 1
        subquery_name = f"subquery_{query_count}"
        objquery_name = f"objquery_{query_count}"
        # Alternative considered (not used): select the rows with
        #   not(startsWith(payload_dump, 'ref://objects/'))
        # unchanged, INNER JOIN only the ref rows against `objects`, and
        # UNION ALL the two halves.
        #
        # Rows without a matching object keep their own payload. The `!= ''`
        # test relies on unmatched LEFT JOIN rows yielding '' rather than NULL
        # -- presumably the server default (join_use_nulls = 0); confirm.
        return f"""
                SELECT
                    {subquery_name}.id as id,
                    if({objquery_name}.payload_dump != '', {objquery_name}.payload_dump, {subquery_name}.payload_dump) as payload_dump
                FROM
                    ({query}) as {subquery_name}
                LEFT JOIN
                    objects as {objquery_name}
                ON
                    substring({subquery_name}.payload_dump, {object_id_offset}) = {objquery_name}.id
        """

    base_query = replace_ref_query("SELECT calls.id as id, calls.payload_dump as payload_dump FROM calls")
    for part in path:
        base_query = replace_ref_query(build_query_for_path_part(part, base_query))
    return base_query