In [1]:
import numpy as np
from uuid import uuid4
from sklearn.manifold import TSNE
from sklearn.cluster import AffinityPropagation
import json
from copy import copy, deepcopy

import pprint

In [2]:
pp = pprint.PrettyPrinter().pprint

In [17]:
# file_name = 'labeled_sentiments.json'
file_name = 'output_1000.json'
with open(file_name, 'r') as f:
    lines = f.readlines()
for line in lines:
    line = line.strip()

print(len(lines))
embedding_data = [json.loads(line) for line in lines] 
# embedding_data[0]

1002


In [18]:
def run_tsne(x):
    """
    Reduce the dimension of the input vector

    Arg:
        x (numpy.array): a list of lists containing vectors.

    Retrun:
        (numpy.array): reduced dimension
    """
    low_dim_x = TSNE(
        n_components=2,
        learning_rate=200,
        perplexity=30
    ).fit_transform(x)
    return low_dim_x

def reduce_dimension(embedding_data):
    """
    Extracts the hight dimension vectors from the data, 
    run the dimension reducsion algorithm and add the 
    low dimension vectors to the original data
    """
    # get vectors
    embedding_data = copy(embedding_data)
    vect_list = []
    for e in embedding_data:
        vect_list.append(e.get('embedding'))

    # reduce dimension
    vect_array = np.array(vect_list)
    low_dim_embeddings = run_tsne(vect_array)
    
    # Merge back the low dimensions into the original data
    for i, item in enumerate(embedding_data):
        item['low_dim_embedding'] = list(low_dim_embeddings[i])
    return embedding_data

# Reducing dimension
data_w_low_dim = reduce_dimension(embedding_data)

In [19]:
pp(data_w_low_dim[:10])

[{'embedding': [0.08265981823205948,
                -0.003910927567631006,
                -0.09897520393133163,
                0.00018613849533721805,
                0.011625337414443493,
                -0.0034296722151339054,
                -0.046466533094644547,
                0.03300270438194275,
                -0.07402294874191284,
                -0.052507832646369934,
                0.00039215153083205223,
                -0.12111396342515945,
                0.030718045309185982,
                0.08059383183717728,
                0.005496548023074865,
                0.004270592704415321,
                0.007647651247680187,
                -0.018710695207118988,
                -0.017675651237368584,
                -0.0005542440921999514,
                -0.07012727111577988,
                -0.02079487033188343,
                -0.010672145523130894,
                -0.018036838620901108,
                -0.04745117574930191,
                -0.032987404614686966,

                -0.0626048669219017,
                -0.00021994159033056349,
                0.04366880655288696,
                -0.037254463881254196,
                0.04405467212200165,
                -0.016697950661182404,
                -0.0034348941408097744,
                0.026299981400370598,
                0.02652963064610958,
                0.048898108303546906,
                0.028441661968827248,
                0.014805504120886326,
                -0.06345105916261673,
                0.006073157303035259,
                -0.004523515701293945,
                -0.005357160232961178,
                -0.053843721747398376,
                -0.0351078100502491,
                -0.03561000898480415,
                0.008432178758084774,
                0.05817033722996712,
                0.06351743638515472,
                0.02580982632935047,
                -0.05583164095878601,
                -0.059516940265893936,
                -0.030072400346398354,
        

                -0.04542454704642296,
                0.018669037148356438,
                0.04271560534834862,
                0.0311855711042881,
                0.08016658574342728,
                -0.03334623947739601,
                0.016812259331345558,
                -0.015602529980242252,
                0.02126012183725834,
                -0.06531551480293274,
                -0.08023795485496521,
                -0.036516737192869186,
                -0.015377311035990715,
                0.02737925387918949,
                0.004650977440178394,
                -0.08836865425109863,
                -0.03170783445239067,
                -0.06336480379104614,
                -0.0642133429646492,
                0.0032281847670674324,
                -0.0359475314617157,
                -0.06925161927938461,
                0.02018934115767479,
                -0.04761599749326706,
                -0.02530832774937153,
                -0.0017645707121118903,
               

                -0.030351513996720314,
                0.03781536594033241,
                -0.020494533702731133,
                0.09782963991165161,
                -0.07415862381458282,
                0.02562379464507103,
                -0.08757392317056656,
                -0.03661714121699333,
                0.0338212251663208,
                0.03804837167263031,
                -0.0496508851647377,
                0.046160366386175156,
                -0.015210391953587532,
                -0.03038235753774643,
                0.006438202690333128,
                0.01806839369237423,
                0.015541622415184975,
                -0.014376196078956127,
                -0.07148326933383942,
                -0.034313689917325974,
                0.009282423183321953,
                -0.022881342098116875,
                0.005776528734713793,
                0.07044285535812378,
                0.005243434105068445,
                0.01266319490969181,
                

In [20]:
def ap_cluster(x):
    """
    Clusters data using affinity propagation algorithm.
    """
    clustering = AffinityPropagation(
        random_state=5, damping=0.95
    ).fit(x)

    cluster_labels = clustering.labels_
    cluster_centers = clustering.cluster_centers_
    return cluster_labels, cluster_centers

def cluster_data(data, coordinates_key=None):
    """
    cluster a listof objects with a 'coordinates' key
    
    Args
        coordinates_key : string, A name for the lookup key for the coordinates
        data : list[dicts], A list of objects that has a key for coordinates
        [
            {
                coordinates_key: [x1, x2, ...],
                ...
            },
            ...
        ]
        
    Returns
        Adds the clustering_info to each object in the input list of data
    """
    # Cluster data
    coordinates = []
    for item in data:
        coordinates.append(item[coordinates_key])
    cluster_labels, cluster_centers = ap_cluster(np.array(coordinates))
    
    # add clustering info to the data structure
    for i, item in enumerate(data):
        cluster_info = {}
        cluster_info['is_cluster_head'] = item[coordinates_key] in cluster_centers
        cluster_info['cluster_label'] = cluster_labels[i]
        item['cluster_info'] = cluster_info
        # TODO: remove this lines
        item['embedding'] = ''
        item['text'] = ''
    
    # sort data
    result = sorted(
        data, key=lambda x:
            (x['cluster_info']['cluster_label'], not x['cluster_info']['is_cluster_head'])
    )

    return result

clustered_data = cluster_data(
    data_w_low_dim,
    coordinates_key='low_dim_embedding'
)
uuid_cluster = {e['uuid']: e['cluster_info']['cluster_label'] for e in clustered_data}
# pp(uuid_cluster)
# pp(clustered_data[:100])

In [21]:
def format_to_nested_clustering(clustered_data):
    """
    transform a list of object into a nested list based on clustering info.
    If there is only one cluster, it retruns the same input
    """
    # check the number of cluster heads; return if there is only one cluster
    num_cluster_heads = 0
    for item in clustered_data:
        if item['cluster_info']['is_cluster_head']:
            num_cluster_heads += 1
    if num_cluster_heads == 1:
        return clustered_data

    # Break down if there are more than one cluster
    result = []
    for item in clustered_data:
        item['children'] = item.get('children', [])
        if item['cluster_info']['is_cluster_head']:
            # Add cluster head to the tree and also add it as the first child
            result.append(item)
            if not item.get('children'):
                result[-1]['children'].append(deepcopy(item))
        else:
            result[-1]['children'].append(item)
    return result


In [22]:
def cluster_hierarchically(embedding_data_w_low_dim, include_original_cluster_label=True):
    """
    Gets an array of input data with dimension and performs
    clustering on them and represents data as hierarchical
    
    This function can be called recursively 
    
    Args
        [ {'low_dim_embedding': [], ...}, ...]
    """
    embedding_data_w_low_dim = deepcopy(embedding_data_w_low_dim)

    clustered_data = cluster_data(
        embedding_data_w_low_dim,
        coordinates_key='low_dim_embedding'
    )
    if include_original_cluster_label:
        for item in clustered_data:
            item['original_cluster_label'] = item['cluster_info']['cluster_label']

    nested_clusters = format_to_nested_clustering(clustered_data)
    return nested_clusters

nested_clusters = cluster_hierarchically(
    data_w_low_dim[:100]
)
pp(nested_clusters)

[{'children': [{'children': [],
                'cluster_info': {'cluster_label': 0, 'is_cluster_head': True},
                'embedding': '',
                'low_dim_embedding': [-0.21157175, -30.756844],
                'original_cluster_label': 0,
                'text': '',
                'uuid': 'd5806e7b-8371-446a-a3e4-014f8e7ff629'},
               {'children': [],
                'cluster_info': {'cluster_label': 0, 'is_cluster_head': False},
                'embedding': '',
                'low_dim_embedding': [-1.8962759, -35.300694],
                'original_cluster_label': 0,
                'text': '',
                'uuid': '4eded89d-8af4-48a2-b998-f198a724aaf4'},
               {'children': [],
                'cluster_info': {'cluster_label': 0, 'is_cluster_head': False},
                'embedding': '',
                'low_dim_embedding': [-0.21251723, -35.955177],
                'original_cluster_label': 0,
                'text': '',
                'uuid': '7

                'low_dim_embedding': [-1.6792332, 12.812733],
                'original_cluster_label': 4,
                'text': '',
                'uuid': '587958d1-cf4f-4bb1-bb3f-378c5b51c21e'},
               {'children': [],
                'cluster_info': {'cluster_label': 4, 'is_cluster_head': False},
                'embedding': '',
                'low_dim_embedding': [-6.290936, 9.255702],
                'original_cluster_label': 4,
                'text': '',
                'uuid': '35a32de3-c600-4cd0-90a2-3b61c63e33c2'},
               {'children': [],
                'cluster_info': {'cluster_label': 4, 'is_cluster_head': False},
                'embedding': '',
                'low_dim_embedding': [-2.8262923, 10.929265],
                'original_cluster_label': 4,
                'text': '',
                'uuid': '54464830-7e15-4848-9937-36442e56d886'},
               {'children': [],
                'cluster_info': {'cluster_label': 4, 'is_cluster_head': False},


In [23]:
len(nested_clusters)

5

In [24]:
MAX_CLUSTER_SIZE = 10

# traverse tree and break down 
def bfs_break_down(head):
    """
    Traverse the nested clustering and break down if a node has too many 
    children
    """
    frontiers = [head]
    while frontiers:
        next = frontiers.pop(0)
        if len(next['children']) > MAX_CLUSTER_SIZE:
            next['children'] = cluster_hierarchically(next['children'])
        frontiers.extend(next['children'])
    

head = {}
head['children'] = nested_clusters
# pp(head)

bfs_break_down(head)

pp(head['children'])

[{'children': [{'children': [{'children': [],
                              'cluster_info': {'cluster_label': 0,
                                               'is_cluster_head': True},
                              'embedding': '',
                              'low_dim_embedding': [-0.43632254, -35.35521],
                              'original_cluster_label': 0,
                              'text': '',
                              'uuid': '934b0cfe-98c1-4a69-ba01-61565d7ab709'},
                             {'children': [],
                              'cluster_info': {'cluster_label': 0,
                                               'is_cluster_head': False},
                              'embedding': '',
                              'low_dim_embedding': [-1.8962759, -35.300694],
                              'original_cluster_label': 0,
                              'text': '',
                              'uuid': '4eded89d-8af4-48a2-b998-f198a724aaf4'},
                   

                              'low_dim_embedding': [-2.5506296, 17.90814],
                              'original_cluster_label': 0,
                              'text': '',
                              'uuid': '8270c989-f6b6-43df-a339-f15f43692593'},
                             {'children': [],
                              'cluster_info': {'cluster_label': 0,
                                               'is_cluster_head': False},
                              'embedding': '',
                              'low_dim_embedding': [-2.7421045, 18.767754],
                              'original_cluster_label': 0,
                              'text': '',
                              'uuid': '772f59c2-cff1-479b-9659-0556c14d00c7'},
                             {'children': [],
                              'cluster_info': {'cluster_label': 0,
                                               'is_cluster_head': False},
                              'embedding': '',
                     

In [25]:
pp(head)

{'children': [{'children': [{'children': [{'children': [],
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': True},
                                           'embedding': '',
                                           'low_dim_embedding': [-0.43632254,
                                                                 -35.35521],
                                           'original_cluster_label': 0,
                                           'text': '',
                                           'uuid': '934b0cfe-98c1-4a69-ba01-61565d7ab709'},
                                          {'children': [],
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': False},
                                           'embedding': '',
                                           'low_dim_emb

                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': False},
                                           'embedding': '',
                                           'low_dim_embedding': [-3.7187383,
                                                                 -20.829605],
                                           'original_cluster_label': 0,
                                           'text': '',
                                           'uuid': '4a2b9d02-da71-4e22-a379-32193ecba85b'},
                                          {'children': [],
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': False},
                                           'embedding': '',
                                           'low_dim_embedding': [-5.2465606,
                                    

                                                         'text': '',
                                                         'uuid': '603ac7be-e5c6-40f8-96ba-74830b1ef2c7'},
                                                        {'children': [],
                                                         'cluster_info': {'cluster_label': 2,
                                                                          'is_cluster_head': False},
                                                         'embedding': '',
                                                         'low_dim_embedding': [-3.4781935,
                                                                               12.684799],
                                                         'original_cluster_label': 2,
                                                         'text': '',
                                                         'uuid': '1274f190-ce54-47ac-b4aa-201a4349cf55'},
                                        

In [26]:
def insert_children_count(head):
    """
    Add total number of children for each node recursively
    """
    if not head['children']:
        head['children_count'] = 0
        return 0
    sum = 0
    for node in head['children']:
        sum += 1 + node.get('children_count', insert_children_count(node))
    head['children_count'] = sum
    return sum

insert_children_count(head)
print(head['children_count'])
pp(head)

119
{'children': [{'children': [{'children': [{'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': True},
                                           'embedding': '',
                                           'low_dim_embedding': [-0.43632254,
                                                                 -35.35521],
                                           'original_cluster_label': 0,
                                           'text': '',
                                           'uuid': '934b0cfe-98c1-4a69-ba01-61565d7ab709'},
                                          {'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluste

                                                                 -20.769049],
                                           'original_cluster_label': 0,
                                           'text': '',
                                           'uuid': '1baedbc3-989f-471b-9ca3-e62f02c5b312'},
                                          {'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': False},
                                           'embedding': '',
                                           'low_dim_embedding': [-5.813054,
                                                                 -21.68851],
                                           'original_cluster_label': 0,
                                           'text': '',
                                           'uuid': '5ea3d38f-f1dd-4e4e-a4

                                                                               11.6702],
                                                         'original_cluster_label': 0,
                                                         'text': '',
                                                         'uuid': '6f681e46-3aa1-437d-a1fe-533a29782590'},
                                                        {'children': [],
                                                         'children_count': 0,
                                                         'cluster_info': {'cluster_label': 0,
                                                                          'is_cluster_head': False},
                                                         'embedding': '',
                                                         'low_dim_embedding': [-0.8683935,
                                                                               11.957237],
                                                

In [27]:
def insert_d3uuid(head):
    """
    Traverse the tree of data and insert a unique identifier for 
    each node that will be used for d3 distinctions later
    """
    if not head:
        return
    seen_uuids = set()
    frontiers = [head]
    while frontiers:
        next = frontiers.pop(0)
        next['d3uuid'] = str(uuid4())
        frontiers.extend(next['children'])
    

insert_d3uuid(head)

pp(head['children'])

[{'children': [{'children': [{'children': [],
                              'children_count': 0,
                              'cluster_info': {'cluster_label': 0,
                                               'is_cluster_head': True},
                              'd3uuid': 'b2365bae-efac-4ff0-b367-cfc38e1575e2',
                              'embedding': '',
                              'low_dim_embedding': [-0.43632254, -35.35521],
                              'original_cluster_label': 0,
                              'text': '',
                              'uuid': '934b0cfe-98c1-4a69-ba01-61565d7ab709'},
                             {'children': [],
                              'children_count': 0,
                              'cluster_info': {'cluster_label': 0,
                                               'is_cluster_head': False},
                              'd3uuid': '9e14bd45-a156-4ba6-b896-42859d350aea',
                              'embedding': '',
              

                              'original_cluster_label': 0,
                              'text': '',
                              'uuid': '1baedbc3-989f-471b-9ca3-e62f02c5b312'},
                             {'children': [],
                              'children_count': 0,
                              'cluster_info': {'cluster_label': 0,
                                               'is_cluster_head': False},
                              'd3uuid': 'b42e80f4-9609-4ade-b6b9-8a062bc2fbde',
                              'embedding': '',
                              'low_dim_embedding': [-5.813054, -21.68851],
                              'original_cluster_label': 0,
                              'text': '',
                              'uuid': '5ea3d38f-f1dd-4e4e-a439-df44ea73903a'},
                             {'children': [],
                              'children_count': 0,
                              'cluster_info': {'cluster_label': 0,
                                    

                                            'd3uuid': '8bd49e7a-ffb3-4c09-978c-52d6b4c9cf0f',
                                            'embedding': '',
                                            'low_dim_embedding': [5.160848,
                                                                  5.396939],
                                            'original_cluster_label': 0,
                                            'text': '',
                                            'uuid': 'da2613d4-4f03-4ce8-80ef-28c2d1ff1058'},
                                           {'children': [],
                                            'children_count': 0,
                                            'cluster_info': {'cluster_label': 0,
                                                             'is_cluster_head': False},
                                            'd3uuid': '07cdf1c9-d27f-4d31-b58f-fd487c2a7aaf',
                                            'embedding': '',
                     

In [28]:
pp(head)

{'children': [{'children': [{'children': [{'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': True},
                                           'd3uuid': 'b2365bae-efac-4ff0-b367-cfc38e1575e2',
                                           'embedding': '',
                                           'low_dim_embedding': [-0.43632254,
                                                                 -35.35521],
                                           'original_cluster_label': 0,
                                           'text': '',
                                           'uuid': '934b0cfe-98c1-4a69-ba01-61565d7ab709'},
                                          {'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'

                             'original_cluster_label': 0,
                             'text': '',
                             'uuid': '8db04bbf-1f5d-4103-8043-a4cffbd914dc'},
                            {'children': [],
                             'children_count': 0,
                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': False},
                             'd3uuid': '9b3cc8b8-20fc-4983-8012-300630baf619',
                             'embedding': '',
                             'low_dim_embedding': [24.862854, -13.626811],
                             'original_cluster_label': 0,
                             'text': '',
                             'uuid': '6ce172df-7e85-422e-aa98-b1c1c5591434'},
                            {'children': [],
                             'children_count': 0,
                             'cluster_info': {'cluster_label': 0,
                                              'is_c

                                                                 31.10189],
                                           'original_cluster_label': 1,
                                           'text': '',
                                           'uuid': '08401316-e267-482f-beef-0ab8d4ec2a04'},
                                          {'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 1,
                                                            'is_cluster_head': False},
                                           'd3uuid': '13a66509-2519-44a3-86a1-ad0c7a4ae7a8',
                                           'embedding': '',
                                           'low_dim_embedding': [14.413234,
                                                                 32.72296],
                                           'original_cluster_label': 1,
                                      

In [29]:
# calculate radius for each bubble 
MIN_RADIUS = 1
MIN_ALLOWED_DISTANCE = 1

def flatten_data(head):
    """
    Gets a tree of data and transform it into a flat list
    and removes repeated items. 
    """
    if not head:
        return
    ret_val = []
    seen_uuids = set()
    frontiers = [head]
    while frontiers:
        next = frontiers.pop(0)
        if next.get('uuid') and next['uuid'] not in seen_uuids:
            next_copy = deepcopy(next)
            next_copy.pop('children')
            ret_val.append(next_copy)
            seen_uuids.add(next['uuid'])                
        frontiers.extend(next['children'])
    return ret_val

def get_multiplier(clustering_data):
    """
    get the max multiplier that is used to inflate the bubble sizes
    Args
        clustering_data : list(dict) : a list of objects. Objects have a key for number of children
    """
    multiplier = float("inf")
    if len(clustering_data) < 2:
        multiplier = 0
    for i in range(len(clustering_data)):
        for j in range(i+1, len(clustering_data)):
            filled = max(
                np.sqrt(clustering_data[i]['children_count']),
                np.sqrt(clustering_data[j]['children_count'])
            )
            p1 = np.array([
                float(clustering_data[i]['low_dim_embedding'][0]),
                float(clustering_data[i]['low_dim_embedding'][1])
            ])
            p2 = np.array([
                float(clustering_data[j]['low_dim_embedding'][0]),
                float(clustering_data[j]['low_dim_embedding'][1])
            ])
            d = np.linalg.norm(p1 - p2)
            if d > MIN_ALLOWED_DISTANCE:
                multiplier = min(multiplier, d / filled)
#                 print(multiplier, d, filled, (
#                     clustering_data[i]['children_count'], clustering_data[j]['children_count']
#                 ))
    return multiplier


# clustering_data = flatten_data(head)
# pp(clustering_data)

radius_multiplier_factor = get_multiplier(head['children'])
print(radius_multiplier_factor)

def insert_radius(head, radius_multiplier_factor):
    """
    Insert the radius in all object in the tree, and also for
    each object, insert the radius of their parent
    """
    frontiers = copy(head['children'])
    while frontiers:
        next = frontiers.pop(0)
        if next['children']:
            next['radius'] = max([
                np.sqrt(next['children_count']) * radius_multiplier_factor,
                MIN_RADIUS
            ])
        else:
            next['radius'] = MIN_RADIUS            
        frontiers.extend(next['children'])

insert_radius(head, radius_multiplier_factor)
pp(head)

1.7251655376626536
{'children': [{'children': [{'children': [{'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': True},
                                           'd3uuid': 'b2365bae-efac-4ff0-b367-cfc38e1575e2',
                                           'embedding': '',
                                           'low_dim_embedding': [-0.43632254,
                                                                 -35.35521],
                                           'original_cluster_label': 0,
                                           'radius': 1,
                                           'text': '',
                                           'uuid': '934b0cfe-98c1-4a69-ba01-61565d7ab709'},
                                          {'children': [],
                                           'childr

               'text': '',
               'uuid': '9ad49c28-ffd3-42ee-9b8b-c82720fab52e'},
              {'children': [{'children': [],
                             'children_count': 0,
                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': True},
                             'd3uuid': '8b96638f-a288-4d07-a26b-7a01744cc1af',
                             'embedding': '',
                             'low_dim_embedding': [24.367086, -12.858131],
                             'original_cluster_label': 0,
                             'radius': 1,
                             'text': '',
                             'uuid': '83b57861-6a05-4ce6-a1a2-2990b77e5286'},
                            {'children': [],
                             'children_count': 0,
                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': False},
                       

                                           'radius': 1,
                                           'text': '',
                                           'uuid': 'ef950d77-cf0f-470c-8322-869f3531d74a'}],
                             'children_count': 3,
                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': True},
                             'd3uuid': '680341fd-09bc-4b60-9637-76ff2d2fce2e',
                             'embedding': '',
                             'low_dim_embedding': [-2.5506296, 17.90814],
                             'original_cluster_label': 0,
                             'radius': 2.9880743626985953,
                             'text': '',
                             'uuid': '8270c989-f6b6-43df-a339-f15f43692593'},
                            {'children': [{'children': [],
                                           'children_count': 0,
                                           'cluste

In [32]:
pp(head)

{'children': [{'children': [{'children': [{'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': True},
                                           'd3uuid': 'b2365bae-efac-4ff0-b367-cfc38e1575e2',
                                           'embedding': '',
                                           'low_dim_embedding': [-0.43632254,
                                                                 -35.35521],
                                           'original_cluster_label': 0,
                                           'parent': {'d3uuid': '7d12351c-9062-4b00-84da-ad7389f7b396',
                                                      'low_dim_embedding': [-0.43632254,
                                                                            -35.35521],
                                                  

                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': False},
                             'd3uuid': 'f48b182d-fc67-4bf7-9140-2037e787a559',
                             'embedding': '',
                             'low_dim_embedding': [-12.47627, 19.863182],
                             'original_cluster_label': 0,
                             'parent': {'d3uuid': 'c210fae1-7ed1-4425-bfd5-193736e8714e',
                                        'low_dim_embedding': [-9.80331,
                                                              19.604128],
                                        'radius': 7.905701662668299},
                             'radius': 1,
                             'text': '',
                             'uuid': '1eea7d1d-d9b6-4f5c-856a-89ca80294087'},
                            {'children': [],
                             'children_count': 0,
                             'cluster_info

                             'low_dim_embedding': [33.29173, -7.730444],
                             'original_cluster_label': 0,
                             'parent': {'d3uuid': 'cdb0ddc2-ba92-43cd-9701-39ef8fce69b1',
                                        'low_dim_embedding': [24.367086,
                                                              -12.858131],
                                        'radius': 6.681537396818422},
                             'radius': 1,
                             'text': '',
                             'uuid': '9accae91-4339-40e4-87db-03f446295c0c'},
                            {'children': [],
                             'children_count': 0,
                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': False},
                             'd3uuid': 'f33f9811-a10b-4025-ac9d-6fc9e7bfc042',
                             'embedding': '',
                             'low_dim_emb

                                                            'is_cluster_head': False},
                                           'd3uuid': '0d281f66-4652-4049-82ad-7b4c6ec2ecc9',
                                           'embedding': '',
                                           'low_dim_embedding': [-15.794213,
                                                                 -37.523827],
                                           'original_cluster_label': 0,
                                           'parent': {'d3uuid': '9f29cba8-9c4b-4f8c-943a-af0433823d7a',
                                                      'low_dim_embedding': [-15.480947,
                                                                            -25.47159],
                                                      'radius': 5.721726789678759},
                                           'radius': 1,
                                           'text': '',
                                           'uuid': '6baa38

                                           'text': '',
                                           'uuid': 'b497ef04-97cf-4f81-81ed-aceee18cbec5'},
                                          {'children': [{'children': [],
                                                         'children_count': 0,
                                                         'cluster_info': {'cluster_label': 0,
                                                                          'is_cluster_head': True},
                                                         'd3uuid': 'c29bc3f3-7f10-4abd-842c-d6c5aa591822',
                                                         'embedding': '',
                                                         'low_dim_embedding': [-1.1570983,
                                                                               9.506514],
                                                         'original_cluster_label': 0,
                                                         'pa

In [30]:
def insert_parents_info(head):
    """
    Insert parents coordinates and radius in each child
    """
    if not head:
        return
    frontiers = [head]
    seen = []
    while frontiers:
        next = frontiers.pop(0)
        for child in next.get('children', []):
            child['parent'] = {
                'low_dim_embedding': next.get('low_dim_embedding'),
                'radius': next.get('radius'),
                'd3uuid': next.get('d3uuid')
            }
        frontiers.extend(next['children'])
    return

insert_parents_info(head)
pp(head)

{'children': [{'children': [{'children': [{'children': [],
                                           'children_count': 0,
                                           'cluster_info': {'cluster_label': 0,
                                                            'is_cluster_head': True},
                                           'd3uuid': 'b2365bae-efac-4ff0-b367-cfc38e1575e2',
                                           'embedding': '',
                                           'low_dim_embedding': [-0.43632254,
                                                                 -35.35521],
                                           'original_cluster_label': 0,
                                           'parent': {'d3uuid': '7d12351c-9062-4b00-84da-ad7389f7b396',
                                                      'low_dim_embedding': [-0.43632254,
                                                                            -35.35521],
                                                  

                             'parent': {'d3uuid': 'c210fae1-7ed1-4425-bfd5-193736e8714e',
                                        'low_dim_embedding': [-9.80331,
                                                              19.604128],
                                        'radius': 7.905701662668299},
                             'radius': 1,
                             'text': '',
                             'uuid': 'ed37c908-5c9d-45c2-8fe0-f95a6792e119'},
                            {'children': [],
                             'children_count': 0,
                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': False},
                             'd3uuid': 'efee614e-65af-48ce-be0b-1bca870d4df8',
                             'embedding': '',
                             'low_dim_embedding': [-4.142925, 19.08646],
                             'original_cluster_label': 0,
                             'parent': {'d3

                                        'radius': 6.681537396818422},
                             'radius': 1,
                             'text': '',
                             'uuid': 'b890efe5-00ef-420a-8d81-951db75f916c'},
                            {'children': [],
                             'children_count': 0,
                             'cluster_info': {'cluster_label': 0,
                                              'is_cluster_head': False},
                             'd3uuid': 'bb5bff6e-7d44-46b6-8fc2-fbbbaddfc5db',
                             'embedding': '',
                             'low_dim_embedding': [25.293621, -12.825769],
                             'original_cluster_label': 0,
                             'parent': {'d3uuid': 'cdb0ddc2-ba92-43cd-9701-39ef8fce69b1',
                                        'low_dim_embedding': [24.367086,
                                                              -12.858131],
                                       

                                           'embedding': '',
                                           'low_dim_embedding': [14.318196,
                                                                 31.10189],
                                           'original_cluster_label': 1,
                                           'parent': {'d3uuid': '4fb7bc74-8aa9-4c83-a1e6-d4f7f4d16f71',
                                                      'low_dim_embedding': [14.318196,
                                                                            31.10189],
                                                      'radius': 4.2257752891076965},
                                           'radius': 1,
                                           'text': '',
                                           'uuid': '08401316-e267-482f-beef-0ab8d4ec2a04'},
                                          {'children': [],
                                           'children_count': 0,
                           

                                                         'children_count': 0,
                                                         'cluster_info': {'cluster_label': 2,
                                                                          'is_cluster_head': False},
                                                         'd3uuid': 'd2fffa71-dce0-4ac6-b339-92b9102530f1',
                                                         'embedding': '',
                                                         'low_dim_embedding': [-1.5994903,
                                                                               13.292203],
                                                         'original_cluster_label': 2,
                                                         'parent': {'d3uuid': 'a6a16e20-f78f-478f-83a8-74924bca27ae',
                                                                    'low_dim_embedding': [-5.2064185,
                                                          

In [31]:
def insert_meta_data(head):
    """
    Insert the metadat field that includes the following
        metadata:
            max_x, min_x
            max_y, min_y
    """
    min_x = float("inf")
    max_x = float("-inf")
    min_y = float("inf")
    max_y = float("-inf")
    frontiers = deepcopy(head['children'])
    while frontiers:
        next = frontiers.pop(0)
        min_x = min(next['low_dim_embedding'][0] - next['radius'], min_x)
        min_y = min(next['low_dim_embedding'][1] - next['radius'], min_y)
        max_x = max(next['low_dim_embedding'][0] + next['radius'], max_x)
        max_y = max(next['low_dim_embedding'][1] + next['radius'], max_y)
        frontiers.extend(next['children'])
    head['metadata'] = {
        'x': {
            'max': max_x,
            'min': min_x,
        },
        'y': {
            'max': max_y,
            'min': min_y,
        },
        'radius': {
            'max': max(max_x - min_x, max_y - min_y)
        }
    }

insert_meta_data(head)

In [None]:
pp(head)

In [None]:
def get_formatted_item(item):
    """
    Arg:
        An input item
    """
    entry = {
        'x': float(item['low_dim_embedding'][0]),
        'y': float(item['low_dim_embedding'][1]),
        'uuid': item.get('uuid'),
        'd3uuid': item.get('d3uuid'),
        'text': item.get('text'),
        'cluster_label': int(item['original_cluster_label']),
        'children_count': item['children_count'],
        'radius': item['radius'],
        'parent': {
            'x': float(item['parent']['low_dim_embedding'][0]) if item['parent']['low_dim_embedding'] else float(item['low_dim_embedding'][0]),
            'y': float(item['parent']['low_dim_embedding'][1]) if item['parent']['low_dim_embedding'] else float(item['low_dim_embedding'][1]),
            'radius': float(item['parent']['radius']) if item['parent']['radius'] else item['radius'],
            'd3uuid': item['parent']['d3uuid'],
        }
    }
    return entry


def get_formatted_data(node):
    """
    
    """
    if not node:
        return
    new_node = {}
    if 'low_dim_embedding' in node:
        new_node = get_formatted_item(node)
    new_node['children'] = [
        get_formatted_data(c) for c in node['children']
    ]
    return new_node

#
input_data = deepcopy(head)
# pp(input_data)
formatted_data = get_formatted_data(input_data)
formatted_data['metadata'] = head['metadata']

In [None]:
pp(formatted_data)

In [None]:
lines = json.dumps(formatted_data, indent=4)
with open('all_levels.json', 'w') as f:
    f.write(lines)

In [None]:
# get level n of the tree
def get_nth_level_nodes(head, n):
    level = 0
    result = []
    frontiers = [head, None]
    while frontiers and frontiers != [None]:
        next = frontiers.pop(0)
        if next is None:
            if level == n:
                return result
            result = [e for e in result if not e['children']]
            level += 1
            frontiers.append(None)
            continue
        result.append(next)
        frontiers.extend(next['children'])
    if level < n:
        return
    return result

# pp(get_nth_level_nodes(head, 3))
            

In [None]:
all_levels_raw = {0: ['zero level']}
head
level = 0
while all_levels_raw[level]:
    level += 1
    all_levels_raw[level] = get_nth_level_nodes(head, level)

In [None]:
pp(all_levels_raw)

In [None]:
def get_formatted_data(data):
    """
    
    """
    metadata = []
    if not data:
        return 
    for item in data:
        parent_xy = item['parent']['low_dim_embedding']
        if not parent_xy:
            parent_xy = item['low_dim_embedding']
        parent_r = item['parent']['radius']
        if not parent_r:
            parent_r = item['radius']
        entry = {
            'x': float(item['low_dim_embedding'][0]),
            'y': float(item['low_dim_embedding'][1]),
            'uuid': item.get('uuid'),
            'text': item.get('text'),
            'cluster_label': int(item['original_cluster_label']),
            'children_count': item['children_count'],
            'radius': item['radius'],
            'parent': {
                'x': float(parent_xy[0]),
                'y': float(parent_xy[1]),
                'radius': parent_r,
            }
        }
        metadata.append(entry)
    return metadata

#
all_levels = {}
level = 1
while all_levels_raw[level]:
    all_levels[level] = get_formatted_data(all_levels_raw[level])
    level += 1

pp(all_levels)

In [None]:
for l in all_levels:
    print(len(all_levels[l]))

In [None]:
last_level = len(all_levels)
while not all_levels[last_level] and last_level > 0:
    last_level -= 1
all_x = [item['x'] for item in all_levels[last_level]]
all_y = [item['y'] for item in all_levels[last_level]]
x_range = [min(all_x), max(all_x)]
y_range = [min(all_y), max(all_y)]
print(x_range, y_range)

In [None]:
lines = json.dumps(all_levels, indent=4)
with open('all_levels.json', 'w') as f:
    f.write(lines)

In [None]:
dummy_data = [
    {
        'uuid': 'uuid-001',
        'x': '1', 
        'y': '2',
        'text': 'test',
        'is_cluster_head': True
    },
    {
        'uuid': 'uuid-002',
        'x': '1', 
        'y': '2',
        'text': 'test',
        'is_cluster_head': False
    },
    {
        'uuid': 'uuid-003',
        'x': '1', 
        'y': '2',
        'text': 'test',
        'is_cluster_head': False
    },
    {
        'uuid': 'uuid-004',
        'x': '1', 
        'y': '2',
        'text': 'test',
        'is_cluster_head': False
    },
    {
        'uuid': 'uuid-005',
        'x': '1', 
        'y': '2',
        'text': 'test',
        'is_cluster_head': True
    },
    {
        'uuid': 'uuid-006',
        'x': '1', 
        'y': '2',
        'text': 'test',
        'is_cluster_head': False
    },
]

In [None]:
def get_nested_texts(clustred_texts):
    """
    re-formats the flat structure for clustered texts
    to a hierarchical structure
    """
    nested_data = []
    for item in clustred_texts:
        item['children'] = item.get('children', [])
        if item.get('is_cluster_head'):
            nested_data.append(formatted_item)
        else:
            nested_data[-1]['children'].append(formatted_item)
    return nested_data


nested_data = get_nested_texts(data_summary)
# nested_data = get_nested_texts(dummy_data)
import pprint
pprint.PrettyPrinter().pprint(nested_data)

In [None]:
total_count = len(cluster_labels)
from collections import defaultdict
class_count = defaultdict(int)
for label in cluster_labels:
    class_count[label] += 1
class_count