### Convert the tlp file output by GrouseFlocks to MyJSON 
* Since all of the tlp files have at most two levels, this script only handles two-level hierarchies
* We treat each open metanode as a "true metanode" that has underlying leaf nodes, whereas a closed metanode has no underlying leaf nodes.  Both kinds are stored in the data structure "metanodes"
* This script only works for the datasets "ivOrigins" and "moviedb", which seem to differ from the other datasets that do not use the TopoLayout

In [17]:
from tulip import tlp
from shapely.geometry import *
from tlp2myjson import chop_segment
import os
import math
import json
SQUARE_ROOT_2 = math.sqrt(2)
from pprint import pprint

In [18]:
# Locations of the GrouseFlocks .tlp export to read and of the MyJSON file to
# write; both live in DATA_DIR and share the FILENAME base name.
DATA_DIR = '../../../data/real-world-compiled/grouseflocks/'
FILENAME = 'grouseflocks-moviedb-rateonly-grouseflocks-2'
grousetlp_filepath = os.path.join(DATA_DIR, '{}.tlp'.format(FILENAME))
myjson_filepath = os.path.join(DATA_DIR, '{}.json'.format(FILENAME))

In [19]:
# Load the exported graph, then fetch the three named subgraphs that the
# conversion reads from.  A None result from any step aborts the run.
graph = tlp.loadGraph(grousetlp_filepath)
assert graph is not None
hierarchy_subgraphs, final_layout_display, final_layout = (
    graph.getSubGraph(subgraph_name)
    for subgraph_name in (
        '1__HierarchySubGraphs',
        '1__Final Layout Display',
        '1__Final Layout',
    )
)
assert hierarchy_subgraphs is not None
assert final_layout_display is not None
assert final_layout is not None

In [20]:
# Containers for the pieces of the MyJSON document, filled in by later cells
metanodes = dict()    # metanode id -> metanode record
leaf_nodes = list()   # one record per displayed leaf node
edges = list()        # one record per displayed edge
max_height = 1        # raised later to the deepest metanode level seen
root = final_layout.getId()

In [21]:
# Index the displayed nodes by their 'viewLabel' string.  When several nodes
# share a label (e.g. an empty one), the node iterated last wins.
view_label = final_layout_display.getStringProperty('viewLabel')
label2node = {
    view_label[node]: node for node in final_layout_display.getNodes()
}
print(label2node)

{'': <node 479>, 'Tree 577: n 9338 e 130669 (Hier: n 69 e 68)': <node 21278>, 'Near Complete 0: n 3 e 3 (Hier: n 3 e 3)': <node 21279>, 'Near Complete 1: n 2 e 1 (Hier: n 3 e 3)': <node 21280>, 'Near Complete 2: n 2 e 1 (Hier: n 3 e 3)': <node 21281>, 'Unknown 5: n 9 e 31 (Hier: n 10 e 34)': <node 21282>, 'Near Complete 3: n 2 e 1 (Hier: n 3 e 3)': <node 21283>, 'Near Complete 4: n 2 e 1 (Hier: n 3 e 3)': <node 21284>, 'Near Complete 5: n 2 e 1 (Hier: n 3 e 3)': <node 21285>, 'Near Complete 6: n 2 e 1 (Hier: n 3 e 3)': <node 21286>, 'Near Complete 7: n 2 e 1 (Hier: n 3 e 3)': <node 21287>, 'Unknown 11: n 5 e 7 (Hier: n 6 e 12)': <node 21289>, 'Near Complete 8: n 3 e 3 (Hier: n 4 e 6)': <node 21290>, 'Near Complete 9: n 3 e 3 (Hier: n 3 e 3)': <node 21291>, 'Near Complete 10: n 2 e 1 (Hier: n 3 e 3)': <node 21292>, 'Near Complete 11: n 4 e 6 (Hier: n 5 e 10)': <node 21293>, 'Unknown 16: n 6 e 9 (Hier: n 6 e 12)': <node 21295>, 'Near Complete 12: n 2 e 1 (Hier: n 3 e 3)': <node 21296>, '

In [22]:
# Determine the node hierarchy in "1__Final Layout".
# Although all subgraphs (and their leaf nodes) can be found in "1__HierarchySubGraphs",
# that subgraph is only a flat list of metanodes, even when the hierarchy has multiple levels.
# Such a misleading name!

def dfs(current_graph, depth):
    """Record the subgraphs below ``current_graph`` in the ``metanodes`` dict.

    A graph whose name is a key of ``label2node`` is present in
    "1__Final Layout Display" as a single node, i.e. it is a closed metanode,
    and the walk stops there.  Otherwise the graph is open: every direct
    subgraph gets a record in ``metanodes`` (keyed by subgraph id) and is
    visited recursively.

    current_graph -- graph whose direct subgraphs are inspected
    depth -- level of ``current_graph``; its subgraphs are stored with
             level ``depth + 1``
    """
    if current_graph.getName() in label2node:
        # Displayed as one node, so there is nothing below it to record
        return

    parent_id = current_graph.getId()
    for child in current_graph.getSubGraphs():
        child_id = child.getId()
        metanodes[child_id] = {
            'id': child_id,
            'label': child.getName(),
            'parent_metanode': parent_id,
            'desc_metanodes': {},
            'level': depth + 1,
            'leaf_nodes': {},   # not needed for now, so left blank
            'diameter': 0,
            'geometry': None,   # TODO how to find out geometry for open metanodes (the loose bounding circle)?
        }
        dfs(child, depth + 1)

    # Post-order step: the children are complete by now, so their descendant
    # sets can be folded into this graph's record, if it has one.
    if parent_id not in metanodes:
        return
    descendants = metanodes[parent_id]['desc_metanodes']
    for child in current_graph.getSubGraphs():
        child_id = child.getId()
        # the subgraph itself ...
        descendants[child_id] = True
        # ... and everything beneath it
        descendants.update(metanodes[child_id]['desc_metanodes'])
                
# Walk the hierarchy starting at the "1__Final Layout" subgraph, treated as
# depth 0, then report how many metanodes were recorded and their ids.
dfs(final_layout, 0)
print(len(metanodes), metanodes.keys())
# pprint(metanodes)

40 dict_keys([3, 4, 5687, 5688, 5689, 5690, 5691, 5692, 5693, 5694, 5695, 5696, 5698, 5699, 5700, 5701, 5702, 5704, 5705, 5707, 5708, 5709, 5710, 5711, 5712, 5713, 5714, 5715, 5717, 5718, 5719, 5721, 5722, 5723, 5725, 5726, 5728, 5729, 5730, 5732])


In [23]:
# Map each metanode label to the id used as its key in "metanodes".
#   The original tlp files associate multiple ids (at least 3 cluster ids and
#   1 node id) with a single metanode, so elements are linked through one id
#   that is looked up by label.
#   The label seems to be unique and consistent across the different
#   representations in the original tlp file.
label2metanode = {
    record['label']: metanode_id for metanode_id, record in metanodes.items()
}

In [24]:
# Classify every displayed node as either a leaf node or a closed metanode.
# Leaf nodes are appended to "leaf_nodes"; closed metanodes already have a
# record in "metanodes", which only gets its geometry and diameter filled in.
view_hier_attr = final_layout_display.getIntegerProperty('view Hier Node to Display')
sub_view_layout = final_layout_display.getLayoutProperty('viewLayout')
sub_view_size = final_layout_display.getSizeProperty('viewSize')
node_in_graph = final_layout_display.getGraphProperty('Node In Graph')
view_label = final_layout_display.getStringProperty('viewLabel')

for n in final_layout_display.getNodes():
    center_coor = sub_view_layout[n]
    width = sub_view_size[n][0]
    if view_hier_attr[n] != -1:
        # Leaf node
        print('leaf: ', n.id)
        if node_in_graph[n] is not None:
            # The leaf lies inside an open metanode, found through its label
            parent_metanode_id = label2metanode[node_in_graph[n].getName()]
        else:
            # Leaf node not in an open metanode; this is odd but it happens in
            # ivOrigins, and then the parent metanode is always the root
            parent_metanode_id = root
        leaf_nodes.append({
            'id': n.id,
            'parent_metanode': parent_metanode_id,
            # Leaves are squares of side "width"; the diameter is the diagonal
            'geometry': Point(center_coor.x(), center_coor.y()).buffer(width / 2.0, cap_style=CAP_STYLE.square),
            'diameter': width * SQUARE_ROOT_2,
        })
    else:
        # Closed metanode, drawn as a circle.  The key must be 'diameter' so
        # that it overwrites the placeholder stored when the record was made.
        metanode_id = label2metanode[view_label[n]]
        metanodes[metanode_id].update({
            'geometry': Point(center_coor.x(), center_coor.y()).buffer(width / 2.0, cap_style=CAP_STYLE.round),
            'diameter': width,
        })
print(len(leaf_nodes))

leaf:  4561
leaf:  682
leaf:  3689
leaf:  9222
leaf:  3112
leaf:  4374
leaf:  3114
leaf:  4569
leaf:  3891
leaf:  7966
leaf:  604
leaf:  5069
leaf:  6051
leaf:  6342
leaf:  3433
leaf:  7994
leaf:  5477
leaf:  2279
leaf:  8300
leaf:  443
leaf:  5395
leaf:  1909
leaf:  9379
leaf:  6372
leaf:  846
leaf:  8706
leaf:  9098
leaf:  274
leaf:  2215
leaf:  3671
leaf:  479
31


In [25]:
# Index the leaf-node records by their id for direct lookup
leaf_nodes_mapping = {leaf['id']: leaf for leaf in leaf_nodes}

In [26]:
# Print the label of each open metanode together with its parent's label, so
# that the matching node ids can be determined manually from the figure in Tulip
for record in metanodes.values():
    if record['geometry'] is not None or record['parent_metanode'] == root:
        continue
    # No geometry and not directly below the root: this is an open metanode
    parent_record = metanodes[record['parent_metanode']]
    print('dealing with: ', record['label'], 'parent:', parent_record['label'])
        

In [27]:
# Open metanode IDs for different datasets.  Fill in manually.
# Keys are metanode labels; values are the ids of the nodes standing for those
# metanodes (compared against `n.id` when the layout info is looked up).
# NOTE(review): the active entries are the ivOrigins-open-4 preset -- confirm
# they match the dataset selected by FILENAME before relying on them.

# ivOrigins-open-2
# corr_node_ids = {
#     'Tree 8: n 10 e 31 (Hier: n 2 e 1)': 126,
#     'Unknown 13: n 9 e 29 (Hier: n 9 e 30)': 138,
#     'Tree 10: n 32 e 201 (Hier: n 3 e 2)': 127,
#     'Unknown 14: n 23 e 167 (Hier: n 24 e 203)': 139
# }

# ivOrigins-open-4
corr_node_ids = {
    'Tree 8: n 10 e 31 (Hier: n 2 e 1)': 126,
    'Tree 10: n 32 e 201 (Hier: n 3 e 2)': 127,
    'Tree 12: n 25 e 112 (Hier: n 3 e 2)': 129,
    'Near Complete 3: n 8 e 17 (Hier: n 8 e 28)': 133
}


In [28]:
# Fill in geometry and diameter for the open metanodes, using the manually
# defined node ids in "corr_node_ids".  The layout is read from the parent's
# subgraph in "1__HierarchySubGraphs", looked up by the parent's label.
for nid, m in metanodes.items():
    if m['geometry'] is None and m['parent_metanode'] != root:
        # indicates this is an open metanode
        p = metanodes[m['parent_metanode']]
        parent_subgraph = hierarchy_subgraphs.getSubGraph(p['label'])
        assert parent_subgraph is not None
        view_layout = parent_subgraph.getLayoutProperty('viewLayout')
        view_size = parent_subgraph.getSizeProperty('viewSize')

        # Find the node in the parent subgraph that stands for this metanode
        target_node_id = corr_node_ids[m['label']]
        which_node = next(
            (n for n in parent_subgraph.getNodes() if n.id == target_node_id),
            None,
        )
        assert which_node is not None

        center_coor = view_layout[which_node]
        diameter = view_size[which_node][0]
        # The circle radius comes from this node's own size, and the key must
        # be 'diameter' so that it overwrites the placeholder in the record.
        m.update({
            'geometry': Point(center_coor.x(), center_coor.y()).buffer(diameter / 2.0, cap_style=CAP_STYLE.round),
            'diameter': diameter,
        })
        pprint(m)


In [29]:
# Build one edge record for every edge of the displayed layout
view_layout = final_layout_display.getLayoutProperty('viewLayout')
view_hier_attr = final_layout_display.getIntegerProperty('view Hier Node to Display')
view_label = final_layout_display.getStringProperty('viewLabel')


def _endpoint_id_and_geometry(node):
    """Return the MyJSON id and the geometry for one end of a displayed edge.

    The tlp file uses several ids for the same metanode, so a metanode end
    (hier attribute -1) is resolved through its label to the id used in
    "metanodes"; any other end is a leaf node and keeps its own node id.
    """
    if view_hier_attr[node] == -1:
        metanode_id = label2metanode[view_label[node]]
        return metanode_id, metanodes[metanode_id]['geometry']
    return node.id, leaf_nodes_mapping[node.id]['geometry']


for e in final_layout_display.getEdges():
    src, tgt = graph.ends(e)
    src_id, src_geo = _endpoint_id_and_geometry(src)
    tgt_id, tgt_geo = _endpoint_id_and_geometry(tgt)

    src_xy = (view_layout[src].x(), view_layout[src].y())
    tgt_xy = (view_layout[tgt].x(), view_layout[tgt].y())
    edges.append({
        'id': '{}-{}'.format(src_id, tgt_id),
        'ends': (src_id, tgt_id),
        'geometry': chop_segment(src_geo, tgt_geo, src_xy, tgt_xy),
    })
print(len(edges))

71


In [30]:
# Bounding box of the displayed layout; entries 0 and 1 are read as (x, y)
# points when the JSON document is assembled.
bbox = tlp.computeBoundingBox(final_layout_display)

In [31]:
# Turn every shapely geometry into a plain dict with shapely's mapping(), so
# that the records can be serialized to JSON
for record in leaf_nodes + edges:
    record['geometry'] = mapping(record['geometry'])

for metanode in metanodes.values():
    geometry = metanode['geometry']
    if geometry is None:
        # dirty fix: the root is not going to intersect with anyone, so an
        # arbitrary point stands in for the missing geometry
        geometry = Point(0, 0)
    metanode['geometry'] = mapping(geometry)
    # Track the deepest level seen; it becomes the 'height' of the document
    max_height = max(max_height, metanode['level'])

json_data = {
    'leaf_nodes': leaf_nodes,
    'edges': edges,
    'height': max_height,
    'root': root,
    'metanodes': metanodes,
    'bounding_box': [[corner.x(), corner.y()] for corner in (bbox[0], bbox[1])],
}

In [32]:
# Write the MyJSON document.  The with-block closes (and flushes) the output
# file even if serialization fails part-way through.
with open(myjson_filepath, 'w') as output_file:
    json.dump(json_data, output_file, indent=2)
print('Converted to ', myjson_filepath, ' #nodes:', len(leaf_nodes), ' #edges: ', len(edges), 
      '#metanodes:', len(metanodes),
      ' height: ', json_data['height'])

Converted to  ../../../data/real-world-compiled/grouseflocks/grouseflocks-moviedb-rateonly-grouseflocks-2.json  #nodes: 31  #edges:  71 #metanodes: 40  height:  2
