In [1]:
import pandas as pd
import numpy as np
import pickle

from ucimlrepo import fetch_ucirepo 

import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
from plotly.express.colors import sample_colorscale

from TopoTree import TopoTree, plot_hierarchical_treemap
from TopoMap import TopoMap
from HierarchicalTopoMap import HierarchicalTopoMap

In [2]:
def _build_palettes():
    """Build the qualitative palettes, keyed by how many colours are needed.

    Palettes of size 3 to 9 are successive prefixes of one nine-colour list;
    the size-10 palette is a separate ten-colour list.
    (The hex values look like ColorBrewer qualitative schemes - TODO confirm.)
    """
    nine = ['#377eb8', '#e41a1c', '#4daf4a', '#984ea3', '#ff7f00',
            '#ffff33', '#a65628', '#f781bf', '#999999']
    ten = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
           '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a']

    # Slicing yields an independent list per size, as in a literal table.
    palettes = {size: nine[:size] for size in range(3, len(nine) + 1)}
    palettes[len(ten)] = ten
    return palettes


colorbewer = _build_palettes()

In [3]:
def _highlight_palette(n_groups):
    """Return a list of at least ``n_groups`` colours for the highlight groups."""
    if n_groups in colorbewer:
        return colorbewer[n_groups]
    smallest = min(colorbewer)
    if n_groups < smallest:
        # Fewer groups than the smallest predefined palette: reuse its first colours.
        return colorbewer[smallest][:n_groups]
    # More groups than any predefined palette: cycle a larger qualitative sequence.
    fallback = px.colors.qualitative.Alphabet
    return [fallback[k % len(fallback)] for k in range(n_groups)]


def _add_highlight_traces(fig, proj, highligth, components_to_highligth,
                          colors, col, showlegend=None):
    """Add one marker trace per non-empty highlight group to subplot (1, col)."""
    for group in range(len(components_to_highligth) + 1):
        mask = highligth == group
        if not mask.any():
            # Nothing to draw for this group (e.g. every point is highlighted,
            # so there are no "other" points).
            continue

        if group == 0:
            name = 'Other points'
        else:
            name = f'Component {components_to_highligth[group-1]}'

        fig.add_trace(
            go.Scatter(x=proj[mask, 0],
                       y=proj[mask, 1],
                       mode='markers',
                       marker=dict(color=colors[group], size=3),
                       name=name,
                       legendgroup=name,
                       showlegend=showlegend),
            row=1, col=col
        )


def plot_topomap_comparison_highlight(proj_original, proj_new, 
                                      components_to_highligth, df_comp,
                                      hiertopomap=None):
    """Plot two projections side by side with selected components coloured.

    Parameters
    ----------
    proj_original, proj_new : 2-D arrays
        Point coordinates for the left and right panel; columns 0 and 1 are
        used as x and y. Both are indexed with the same point mask, so they
        are assumed to list the same points in the same order.
    components_to_highligth : sequence
        Row labels of ``df_comp`` to highlight. When components share points,
        the later entry wins for those points.
    df_comp : pandas.DataFrame
        Must have a ``'points'`` column holding, per component, the indices of
        its points.
    hiertopomap : object, optional
        When given, the hulls stored in ``hiertopomap.components_info`` are
        drawn on the right panel, shaded by each component's ``'alpha'``
        relative to the largest alpha among ``components_to_scale``.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # Group 0 means "not highlighted"; group k is the k-th requested component.
    highligth = np.zeros(shape=proj_original.shape[0])
    for i, comp in enumerate(components_to_highligth):
        highligth[df_comp.loc[comp]['points']] = i+1

    fig = make_subplots(rows=1, cols=2,
                        specs=[[{'type': 'xy'},
                                {'type': 'xy'}]],
                        subplot_titles=('Original Projection - TopoMap',
                                        'New Projection - Hierarchical TopoMap'),
                        horizontal_spacing = 0.02)

    # Size the palette from the request rather than from the labels that
    # survived overwriting, so a component is never silently dropped.
    n_groups = len(components_to_highligth) + 1
    colors = _highlight_palette(n_groups)

    _add_highlight_traces(fig, proj_original, highligth,
                          components_to_highligth, colors, col=1)

    if hiertopomap is not None:
        alphas = [hiertopomap.components_info[c]['alpha']
                  for c in hiertopomap.components_to_scale]
        # default=1 keeps this working when no component was scaled.
        max_alpha = max(alphas, default=1)
        alpha_span = max_alpha - 1
        max_color = sample_colorscale('Blues', [1])[0]

        for j in range(len(hiertopomap.components_info)):
            info = hiertopomap.components_info[j]
            if 'hull' not in info:
                continue

            comp_ids = info['points']
            hull = info['hull']
            points_ids = [comp_ids[v] for v in hull.vertices]
            points = list(hiertopomap.projections[points_ids,:])
            points.append(points[0])  # repeat the first vertex to close the outline
            xs, ys = zip(*points)

            if alpha_span > 0:
                alpha_scaled = (info['alpha'] - 1) / alpha_span
            else:
                # Largest alpha is 1: avoid dividing by zero, use the lightest shade.
                alpha_scaled = 0.0
            # sample_colorscale only accepts sample points inside [0, 1].
            alpha_scaled = min(1.0, max(0.0, alpha_scaled))
            hull_color = sample_colorscale('Blues', [alpha_scaled])[0]

            fig.add_trace(go.Scatter(x=xs, y=ys,
                            fill='toself', 
                            fillcolor = hull_color,
                            line_color=max_color,
                            opacity=0.5,
                            line_width=1,
                            text=f'Component {j}',
                            name='Components', legendgroup='Components',
                            showlegend=False,
                            ),
                        row=1, col=2)

    # Points go on after the hulls so they are drawn on top of the filled areas;
    # the legend entries come from the left panel only.
    _add_highlight_traces(fig, proj_new, highligth,
                          components_to_highligth, colors, col=2,
                          showlegend=False)

    fig.update_layout(margin = dict(t=75, l=25, r=25, b=25),
            height=600,
            width=1200,
            legend= {'itemsizing': 'constant'},
            xaxis=dict(showticklabels=False), 
            yaxis=dict(showticklabels=False),
            xaxis2=dict(showticklabels=False), 
            yaxis2=dict(showticklabels=False)
            )

    return fig

## Synthetic datasets

In [4]:
# Load the 3-blobs data and use its x/y/z columns as the input array.
df_blobs = pd.read_csv('data/3blobs.csv')
data_blobs = df_blobs[['x','y','z']].values

# Baseline TopoMap projection.
topomap_blobs = TopoMap(data_blobs)
proj_topomap_blobs = topomap_blobs.run()

# TopoTree with a minimum box size of 5% of the points. The MST and sorted
# edges held by the TopoMap object are handed over before run()
# (presumably so they are not recomputed - confirm against TopoTree).
topotree_blobs = TopoTree(data_blobs, min_box_size=0.05*data_blobs.shape[0])
topotree_blobs.mst = topomap_blobs.mst
topotree_blobs.sorted_edges = topomap_blobs.sorted_edges
comp_info_blobs = topotree_blobs.run()
# Tabulate what TopoTree.run() returned.
df_comp_blobs = pd.DataFrame.from_dict(comp_info_blobs)

In [5]:
# Treemap view of the component table for the 3-blobs data.
fig = plot_hierarchical_treemap(df_comp_blobs)
fig.update_layout(title='TopoTree - 3 Blobs Dataset')
fig.show()

In [6]:
# Components to scale in the hierarchical projection and to colour in the plots.
components_to_highligth = [0,1,2]
# NOTE(review): edge_lentghs is computed here but not read anywhere in this cell.
edge_lentghs = [e[2]['weight'] for e in topomap_blobs.sorted_edges]
hiertopomap_blobs = HierarchicalTopoMap(data_blobs, 
                                        components_to_scale=components_to_highligth)
# Reuse the TopoTree threshold and the TopoMap MST / sorted edges before run().
hiertopomap_blobs.min_points_component = topotree_blobs.min_box_size
hiertopomap_blobs.mst = topomap_blobs.mst
hiertopomap_blobs.sorted_edges = topomap_blobs.sorted_edges
proj_hier_blobs = hiertopomap_blobs.run()

Scalling component 1 - Scale: 20 scaling - initial area: 110.938... final area: 44375.352...
Scalling component 0 - Scale: 20 scaling - initial area: 301.268... final area: 120507.383...
Scalling component 2 - Scale: 20 scaling - initial area: 56.134... final area: 22453.715...
[INFO] Number of edges hit. Edges processed: 798


In [7]:
# Side-by-side TopoMap vs. Hierarchical TopoMap for the 3-blobs data.
# update_layout hands back the same figure, so the title is set in the chain.
fig = plot_topomap_comparison_highlight(
    proj_original=proj_topomap_blobs,
    proj_new=proj_hier_blobs,
    components_to_highligth=components_to_highligth,
    df_comp=df_comp_blobs,
    hiertopomap=hiertopomap_blobs,
).update_layout(title='3 Blobs Dataset')
fig.show()

In [8]:
# Load the 3-rings data and use its x/y/z columns as the input array.
df_rings = pd.read_csv('data/3rings.csv')
data_rings = df_rings[['x','y','z']].values

# Baseline TopoMap projection.
topomap_rings = TopoMap(data_rings)
proj_topomap_rings = topomap_rings.run()

# TopoTree with a minimum box size of 5% of the points, reusing the MST and
# sorted edges held by the TopoMap object.
topotree_rings = TopoTree(data_rings, min_box_size=0.05*data_rings.shape[0])
topotree_rings.mst = topomap_rings.mst
topotree_rings.sorted_edges = topomap_rings.sorted_edges
comp_info_rings = topotree_rings.run()
# Tabulate what TopoTree.run() returned.
df_comp_rings = pd.DataFrame.from_dict(comp_info_rings)

In [9]:
# Treemap view of the component table for the 3-rings data.
fig = plot_hierarchical_treemap(df_comp_rings)
fig.update_layout(title='TopoTree - 3 Rings Dataset')
fig.show()

In [10]:
# Components to scale in the hierarchical projection and to colour in the plots.
components_to_highligth = [2,11,15,19,3,10]
# NOTE(review): edge_lentghs is computed here but not read anywhere in this cell.
edge_lentghs = [e[2]['weight'] for e in topomap_rings.sorted_edges]
hiertopomap_rings = HierarchicalTopoMap(data_rings, 
                                        components_to_scale=components_to_highligth)
# Reuse the TopoTree threshold and the TopoMap MST / sorted edges before run().
hiertopomap_rings.min_points_component = topotree_rings.min_box_size
hiertopomap_rings.mst = topomap_rings.mst
hiertopomap_rings.sorted_edges = topomap_rings.sorted_edges
proj_hier_rings = hiertopomap_rings.run()

Scalling component 10 - Scale: 20 scaling - initial area: 11.868... final area: 4747.089...
Scalling component 3 - Scale: 20 scaling - initial area: 3.734... final area: 1493.780...
Scalling component 11 - Scale: 20 scaling - initial area: 14.103... final area: 5641.061...
Scalling component 2 - Scale: 20 scaling - initial area: 3.510... final area: 1404.120...
Scalling component 19 - Scale: 20 scaling - initial area: 33.063... final area: 13225.234...
Scalling component 15 - Scale: 20 scaling - initial area: 8.536... final area: 3414.337...
[INFO] Number of edges hit. Edges processed: 3208


In [11]:
# Side-by-side TopoMap vs. Hierarchical TopoMap for the 3-rings data.
# update_layout hands back the same figure, so the title is set in the chain.
fig = plot_topomap_comparison_highlight(
    proj_original=proj_topomap_rings,
    proj_new=proj_hier_rings,
    components_to_highligth=components_to_highligth,
    df_comp=df_comp_rings,
    hiertopomap=hiertopomap_rings,
).update_layout(title='3 Rings Dataset')
fig.show()

In [12]:
# Load the 2-cavities data; its coordinate columns are upper-case, so rename
# them to the lower-case x/y/z used for the other synthetic datasets.
df_cavities = pd.read_csv('data/2cavities.csv')
df_cavities = df_cavities.rename(columns={'X':'x', 'Y':'y', 'Z':'z'})
data_cavities = df_cavities[['x','y','z']].values

# Baseline TopoMap projection.
topomap_cavities = TopoMap(data_cavities)
proj_topomap_cavities = topomap_cavities.run()

# TopoTree with a minimum box size of 6% of the points (the other synthetic
# datasets use 5%), reusing the MST and sorted edges held by the TopoMap object.
topotree_cavities = TopoTree(data_cavities, min_box_size=0.06*data_cavities.shape[0])
topotree_cavities.mst = topomap_cavities.mst
topotree_cavities.sorted_edges = topomap_cavities.sorted_edges
comp_info_cavities = topotree_cavities.run()
# Tabulate what TopoTree.run() returned.
df_comp_cavities = pd.DataFrame.from_dict(comp_info_cavities)

In [13]:
# Treemap view of the component table for the 2-cavities data.
fig = plot_hierarchical_treemap(df_comp_cavities)
fig.update_layout(title='TopoTree - 2 Cavities Dataset')
fig.show()

In [14]:
# Components to scale in the hierarchical projection and to colour in the plots.
components_to_highligth = [8,7,0]
# NOTE(review): edge_lentghs is computed here but not read anywhere in this cell.
edge_lentghs = [e[2]['weight'] for e in topomap_cavities.sorted_edges]
hiertopomap_cavities = HierarchicalTopoMap(data_cavities, 
                                        components_to_scale=components_to_highligth)
# Reuse the TopoTree threshold and the TopoMap MST / sorted edges before run().
hiertopomap_cavities.min_points_component = topotree_cavities.min_box_size
hiertopomap_cavities.mst = topomap_cavities.mst
hiertopomap_cavities.sorted_edges = topomap_cavities.sorted_edges
proj_hier_cavities = hiertopomap_cavities.run()

Scalling component 8 - Scale: 15.205358154120338 scaling - initial area: 146.292... final area: 33823.203...
Scalling component 7 - Scale: 12.961825440812117 scaling - initial area: 11.111... final area: 1866.750...
Scalling component 0 - Scale: 14.887755335963446 scaling - initial area: 45.525... final area: 10090.493...
[INFO] Number of edges hit. Edges processed: 4000


In [15]:
# Side-by-side TopoMap vs. Hierarchical TopoMap for the 2-cavities data.
# update_layout hands back the same figure, so the title is set in the chain.
fig = plot_topomap_comparison_highlight(
    proj_original=proj_topomap_cavities,
    proj_new=proj_hier_cavities,
    components_to_highligth=components_to_highligth,
    df_comp=df_comp_cavities,
    hiertopomap=hiertopomap_cavities,
).update_layout(title='2 Cavities Dataset')
fig.show()

## MFeat dataset

In [16]:
# Whitespace-separated file without a header row; the 64 columns are named x1..x64.
# The separator is a raw string so "\s" reaches pandas as a regex instead of
# being an invalid Python escape sequence.
data_mfeat = pd.read_csv('data/UCI/multiple+features/mfeat-kar', sep=r'\s+',
                         header=None,
                         names=['x'+str(i) for i in range(1,65)])
# Label every row with its index integer-divided by 200 (consecutive blocks of
# 200 rows share a class), computed in one vectorised step.
data_mfeat['class'] = data_mfeat.index.to_numpy() // 200

# Feature matrix as a NumPy array; labels kept as a one-column DataFrame.
X_mfeat = data_mfeat.drop(['class'], axis=1).to_numpy()
y_mfeat = data_mfeat[['class']]

In [17]:
# Baseline TopoMap projection of the MFeat features.
topomap_mfeat = TopoMap(X_mfeat)
proj_topomap_mfeat = topomap_mfeat.run()

In [18]:
# TopoTree with a minimum box size of 5% of the points, reusing the MST and
# sorted edges held by the TopoMap object.
topotree_mfeat = TopoTree(X_mfeat, min_box_size=0.05*X_mfeat.shape[0])
topotree_mfeat.mst = topomap_mfeat.mst
topotree_mfeat.sorted_edges = topomap_mfeat.sorted_edges
comp_info_mfeat = topotree_mfeat.run()

# Tabulate what TopoTree.run() returned.
df_comp_mfeat = pd.DataFrame.from_dict(comp_info_mfeat)

In [19]:
# Treemap view of the MFeat component table, coloured by the 'died_at' column.
fig = plot_hierarchical_treemap(df_comp_mfeat, color='died_at')
# Title names the dataset actually plotted (it previously named another dataset).
fig.update_layout(title='TopoTree - MFeat Dataset')
fig.show()

In [20]:
# Components to scale in the hierarchical projection and to colour in the plots.
components_to_highligth = [3,8,6,1,7,10]
edge_lentghs = [e[2]['weight'] for e in topomap_mfeat.sorted_edges]
# max_edge_length is the edge weight found 99% of the way through the edge list
# (presumably the 99th-percentile length, assuming sorted_edges is ascending - confirm).
hiertopomap_mfeat = HierarchicalTopoMap(X_mfeat, 
                                        components_to_scale=components_to_highligth,
                                        max_edge_length=edge_lentghs[int(0.99*len(edge_lentghs))])
# Reuse the TopoTree threshold and the TopoMap MST / sorted edges before run().
hiertopomap_mfeat.min_points_component = topotree_mfeat.min_box_size
hiertopomap_mfeat.mst = topomap_mfeat.mst
hiertopomap_mfeat.sorted_edges = topomap_mfeat.sorted_edges
proj_hier_mfeat = hiertopomap_mfeat.run()

Scalling component 3 - Scale: 4.389601411562673 scaling - initial area: 85869.617... final area: 1654587.000...
Scalling component 8 - Scale: 4.6486542128167585 scaling - initial area: 446585.844... final area: 9650713.000...
Scalling component 7 - Scale: 3.9700423606435358 scaling - initial area: 96905.320... final area: 1527347.625...
Scalling component 10 - Scale: 3.644429709886 scaling - initial area: 106824.695... final area: 1418831.375...
Scalling component 6 - Scale: 3.997484792350992 scaling - initial area: 366994.469... final area: 5864529.500...
Scalling component 1 - Scale: 4.112647200572123 scaling - initial area: 319495.875... final area: 5403910.000...
[INFO] Max edge length hit. Distance: 16.624682535516605 | max_edge_length: 16.600813603196666


In [21]:
# Side-by-side TopoMap vs. Hierarchical TopoMap for the MFeat data.
fig = plot_topomap_comparison_highlight(proj_topomap_mfeat, proj_hier_mfeat,
                                        components_to_highligth, df_comp_mfeat,
                                        hiertopomap_mfeat)
# The plotting helper already sets height=600, width=1200, constant legend item
# sizing and hidden tick labels on both panels, so only the title is set here.
fig.update_layout(title='MFeat Dataset')
fig.show()

## LLM datasets - MMLU Test

In [22]:
questions_data = pd.read_csv('data/LLM/mmlu_val_questions_data.csv')
# Context manager so the file handle is closed as soon as the embeddings are read.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('data/LLM/mmlu_val_last_emb.pkl', 'rb') as emb_file:
    last_emb = pickle.load(emb_file)

# Displayed as the cell output.
last_emb.shape

(1482, 4096)

In [23]:
# Baseline TopoMap projection of the MMLU embeddings.
topomap_emb = TopoMap(last_emb)
proj_topomap_emb = topomap_emb.run()

In [24]:
# TopoTree with a minimum box size of 1% of the points, reusing the MST and
# sorted edges held by the TopoMap object.
topotree_emb = TopoTree(last_emb, min_box_size=0.01*last_emb.shape[0])
topotree_emb.mst = topomap_emb.mst
topotree_emb.sorted_edges = topomap_emb.sorted_edges
comp_info_emb = topotree_emb.run()

# Tabulate what TopoTree.run() returned.
df_comp_emb = pd.DataFrame.from_dict(comp_info_emb)

In [25]:
# Treemap view of the MMLU component table, coloured by the 'died_at' column.
fig = plot_hierarchical_treemap(df_comp_emb, color='died_at')
fig.update_layout(title='TopoTree - LLM MMLU Dataset')
fig.show()

In [26]:
# Components to scale in the hierarchical projection and to colour in the plots.
components_to_highligth = [0,1,3,4,7,9]
# NOTE(review): edge_lentghs is computed here but not read anywhere in this cell.
edge_lentghs = [e[2]['weight'] for e in topomap_emb.sorted_edges]
hiertopomap_emb = HierarchicalTopoMap(last_emb, 
                                        components_to_scale=components_to_highligth)
# Reuse the TopoTree threshold and the TopoMap MST / sorted edges before run().
hiertopomap_emb.min_points_component = topotree_emb.min_box_size
hiertopomap_emb.mst = topomap_emb.mst
hiertopomap_emb.sorted_edges = topomap_emb.sorted_edges
proj_hier_emb = hiertopomap_emb.run()

Scalling component 0 - Scale: 19.357750789583243 scaling - initial area: 74648.414... final area: 27972440.000...
Scalling component 1 - Scale: 9.887334062390783 scaling - initial area: 3278081.500... final area: 320463200.000...
Scalling component 3 - Scale: 9.239624441993698 scaling - initial area: 142406.250... final area: 12157314.000...
Scalling component 4 - Scale: 8.957687108731754 scaling - initial area: 151511.609... final area: 12157317.000...
Scalling component 7 - Scale: 8.332840668257095 scaling - initial area: 59268.352... final area: 4115371.750...
Scalling component 9 - Scale: 3.278237458835155 scaling - initial area: 602624.188... final area: 6476305.500...
[INFO] Number of edges hit. Edges processed: 1480


In [27]:
# Side-by-side TopoMap vs. Hierarchical TopoMap for the MMLU embeddings.
fig = plot_topomap_comparison_highlight(proj_topomap_emb, proj_hier_emb,
                                        components_to_highligth, df_comp_emb,
                                        hiertopomap_emb)
# The plotting helper already sets height=600, width=1200, constant legend item
# sizing and hidden tick labels on both panels, so only the title is set here.
# Title spelling fixed: "LLM", matching the treemap title for this dataset.
fig.update_layout(title='LLM MMLU Dataset')
fig.show()

## LLM datasets - Amazon

In [28]:
df_amazon = pd.read_csv('data/LLM/df_amazon.csv')
# Context manager so the file handle is closed as soon as the embeddings are read.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('data/LLM/embeddings_amazon.pickle', 'rb') as emb_file:
    emb_amazon_all_layers = pickle.load(emb_file)
# Take the entry stored under key 32 and stack its values into one array
# (presumably key 32 is a layer index, going by the variable name - confirm).
emb_amazon = np.array(list(emb_amazon_all_layers[32].values()))

# Displayed as the cell output.
emb_amazon.shape

(1800, 4096)

In [29]:
# Baseline TopoMap projection of the Amazon embeddings.
topomap_amazon = TopoMap(emb_amazon)
proj_topomap_amazon = topomap_amazon.run()

In [30]:
# TopoTree with a minimum box size of 1% of the points, reusing the MST and
# sorted edges held by the TopoMap object.
topotree_amazon = TopoTree(emb_amazon, min_box_size=0.01*emb_amazon.shape[0])
topotree_amazon.mst = topomap_amazon.mst
topotree_amazon.sorted_edges = topomap_amazon.sorted_edges
comp_info_amazon = topotree_amazon.run()

# Tabulate what TopoTree.run() returned.
df_comp_amazon = pd.DataFrame.from_dict(comp_info_amazon)

In [31]:
# Treemap view of the Amazon component table, coloured by the 'died_at' column.
fig = plot_hierarchical_treemap(df_comp_amazon, color='died_at')
fig.update_layout(title='TopoTree - LLM Amazon Dataset')
fig.show()

In [32]:
# Components to scale in the hierarchical projection and to colour in the plots.
components_to_highligth = [4,7]
# NOTE(review): edge_lentghs is computed here but not read anywhere in this cell.
edge_lentghs = [e[2]['weight'] for e in topomap_amazon.sorted_edges]
hiertopomap_amazon = HierarchicalTopoMap(emb_amazon, 
                                        components_to_scale=components_to_highligth)
# Reuse the TopoTree threshold and the TopoMap MST / sorted edges before run().
hiertopomap_amazon.min_points_component = topotree_amazon.min_box_size
hiertopomap_amazon.mst = topomap_amazon.mst
hiertopomap_amazon.sorted_edges = topomap_amazon.sorted_edges
proj_hier_amazon = hiertopomap_amazon.run()

Scalling component 7 - Scale: 8.286870837935895 scaling - initial area: 2449577.500... final area: 168217936.000...
Scalling component 4 - Scale: 7.205852908585328 scaling - initial area: 9346447.000... final area: 485307808.000...
[INFO] Number of edges hit. Edges processed: 1798


In [33]:
# Side-by-side TopoMap vs. Hierarchical TopoMap for the Amazon embeddings.
fig = plot_topomap_comparison_highlight(proj_topomap_amazon, proj_hier_amazon,
                                        components_to_highligth, df_comp_amazon,
                                        hiertopomap_amazon)
# The plotting helper already sets height=600, width=1200, constant legend item
# sizing and hidden tick labels on both panels, so only the title is set here.
# Title spelling fixed: "LLM", matching the treemap title for this dataset.
fig.update_layout(title='LLM Amazon Dataset')
fig.show()