In [1]:
from py2neo import *
from Schema import *
from neo4j import GraphDatabase
from pickle5 import pickle
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import random
import numpy as np

def getRandomColor(n):
    """Return a list of ``n`` random colours, each a '#RRGGBB' hex string.

    The digits are drawn with ``random.choice``, so the result depends on
    the state of the global ``random`` generator.
    """
    hex_digits = '0123456789ABCDEF'
    palette = []
    for _ in range(n):
        # Six independent hex digits make up one colour code.
        digits = [random.choice(hex_digits) for _ in range(6)]
        palette.append("#" + ''.join(digits))
    return palette

# Load the pickled schema objects used throughout the notebook.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# Each file is opened in a `with` block so the handle is closed after loading.

# At this point `pickle` is the module imported from pickle5 at the top of the cell.
with open('snb_schema_emailLanguage_modifiedEdgeTypeSet','rb',-1) as schema_file:
    sc = pickle.load(schema_file)

with open('schema_snb-1_all_nodes_rels_ground_truth.pickle','rb',-1) as schema_file:
    snb_gt_sc = pickle.load(schema_file)

# From here on `pickle` is rebound to the standard-library module, which is
# what loads the two MB6 schemas below.
import pickle
with open('schema_mb6_allNodesRels.pickle','rb',-1) as schema_file:
    mb6_sc = pickle.load(schema_file)
with open('schema_mb6_ground_truth.pickle','rb',-1) as schema_file:
    mb6_gt_sc = pickle.load(schema_file)
# shop_sc = pickle.load(open('shop_attr_based_schema.pkl','rb',-1))

In [2]:

def plot_snb(sim_matrix,title,metric=None,perplexity=30):
    """Embed the SNB node types in 2-D with t-SNE and show them as a scatter plot.

    Parameters
    ----------
    sim_matrix : array-like or None
        Pairwise node-type similarity matrix. Only used when ``metric`` is
        None, in which case ``1 - sim_matrix`` is passed to t-SNE as a
        precomputed distance matrix.
    title : str
        Title of the plotly figure.
    metric : str or callable, optional
        When given, t-SNE is fit on the vectors returned by
        ``sc.nodetypeset.plotVectorizedNodes()`` with this metric and
        ``sim_matrix`` is ignored.
    perplexity : int or float, default 30
        Forwarded to ``TSNE``.

    Raises
    ------
    AssertionError
        If some label contains none of the keys of the colour table.
    """
    x,labels = sc.nodetypeset.plotVectorizedNodes()
    # Keys are matched as substrings of each label. 'Tag;' includes the
    # trailing ';' -- presumably so that it does not also match 'TagClass'.
    snb_color_dict = {'Company':'#a6cee3','Comment':'#1f78b4','University':'#b2df8a','Tag;':'#33a02c',
                     'TagClass':'#fb9a99','Post':'#e31a1c','Country':'#fdbf6f','City':'#ff7f00',
                      'Continent':'#cab2d6','Forum':'#6a3d9a','Person':'#ffff99'}

    # Exactly one colour per label (first matching key wins) so that colours
    # stay aligned with the plotted points even if a label matches several keys.
    snb_color = []
    unmatched = []
    for l in labels:
        color = next((c for k, c in snb_color_dict.items() if k in l), None)
        if color is None:
            unmatched.append(l)
        snb_color.append(color)
    assert not unmatched, 'no SNB colour defined for labels: {}'.format(unmatched)

    if metric is not None:
        x = np.array(x)
        tsne = TSNE(n_components=2, random_state=0,metric=metric,perplexity=perplexity)
        X_2d = tsne.fit_transform(x)
    else:
        # t-SNE expects distances, so convert similarity to 1 - similarity.
        # A plain ndarray is used because recent scikit-learn rejects np.matrix.
        distance = 1 - np.asarray(sim_matrix)
        # init='random' is spelled out because the 'pca' initialisation (the
        # default since scikit-learn 1.2) cannot be used with precomputed distances.
        tsne = TSNE(n_components=2, random_state=0,metric='precomputed',
                    init='random',perplexity=perplexity)
        X_2d = tsne.fit_transform(distance)

    layout = go.Layout(
        plot_bgcolor='rgba(0,0,0,0)'
    )
    fig = go.Figure(data=go.Scatter(x=X_2d[:, 0],
                                    y=X_2d[:, 1],
                                    mode='markers',
                                    marker_color=snb_color,
                                    text=labels),  # hover text goes here
                                    layout=layout)

    fig.update_layout(title=title)
    fig.show()

In [6]:
def plot_mb6(sim_matrix,title,metric=None,perplexity=30):
    """Embed the MB6 node types in 2-D with t-SNE and show them as a scatter plot.

    Parameters
    ----------
    sim_matrix : array-like or None
        Pairwise node-type similarity matrix. Only used when ``metric`` is
        None, in which case ``1 - sim_matrix`` is passed to t-SNE as a
        precomputed distance matrix.
    title : str
        Title of the plotly figure.
    metric : str or callable, optional
        When given, t-SNE is fit on the vectors returned by
        ``mb6_sc.nodetypeset.plotVectorizedNodes()`` with this metric and
        ``sim_matrix`` is ignored.
    perplexity : int or float, default 30
        Forwarded to ``TSNE``.
    """
    x,labels = mb6_sc.nodetypeset.plotVectorizedNodes()
    # Keys are matched as substrings of each label. 'Synapse;' includes the
    # trailing ';' -- presumably so that it does not also match 'SynapseSet'.
    mb6_color_dict = {'Meta':'blue','Synapse;':'red','Segment':'black','SynapseSet':'yellow'}
    # Colour used for labels that match no key; without it the colour list
    # would be shorter than the label list and colours would shift.
    fallback_color = 'lightgray'
    # Exactly one colour per label: the first matching key wins.
    mb6_color = [
        next((c for k, c in mb6_color_dict.items() if k in l), fallback_color)
        for l in labels
    ]

    if metric is not None:
        x = np.array(x)
        tsne = TSNE(n_components=2, random_state=0,metric=metric,perplexity=perplexity)
        X_2d = tsne.fit_transform(x)
    else:
        # t-SNE expects distances, so convert similarity to 1 - similarity.
        # A plain ndarray is used because recent scikit-learn rejects np.matrix.
        distance = 1 - np.asarray(sim_matrix)
        # init='random' is spelled out because the 'pca' initialisation (the
        # default since scikit-learn 1.2) cannot be used with precomputed distances.
        tsne = TSNE(n_components=2, random_state=0,metric='precomputed',
                    init='random',perplexity=perplexity)
        X_2d = tsne.fit_transform(distance)

    layout = go.Layout(
        plot_bgcolor='rgba(0,0,0,0)'
    )
    fig = go.Figure(data=go.Scatter(x=X_2d[:, 0],
                                    y=X_2d[:, 1],
                                    mode='markers',
                                    marker_color=mb6_color,
                                    text=labels),  # hover text goes here
                                    layout=layout)
    fig.update_layout(title=title)
    fig.show()

In [90]:
# SNB node types: similarity matrix computed with idf=True, plotted without
# any thresholding. The title now matches the idf=True call (it previously
# said "no idf").
sim_matrix = sc.nodetypeset.mergeByJaccardSim(return_sim_matrix=True,idf=True)
sim_matrix = np.asarray(sim_matrix)
plot_snb(sim_matrix, 'with idf, no threshold',perplexity=10)





In [100]:
# MB6 node types: similarity matrix computed with idf=True, plotted as-is
# (no thresholding) at perplexity 20.
sim_matrix = mb6_sc.nodetypeset.mergeByJaccardSim(
    idf=True,
    return_sim_matrix=True,
)
plot_mb6(sim_matrix=sim_matrix, title='with idf, no threshold', perplexity=20)





In [39]:
# MB6 node types with idf=True, binarised at 0.5 before plotting:
# entries <= 0.5 are set to 0 and entries > 0.5 are set to 1.
sim_matrix = np.matrix(
    mb6_sc.nodetypeset.mergeByJaccardSim(idf=True, return_sim_matrix=True)
)
sim_matrix[sim_matrix <= 0.5] = 0
sim_matrix[sim_matrix > 0.5] = 1

plot_mb6(sim_matrix=sim_matrix, title='with idf, with 0.5 as threshold')






## We can observe that with threshold=0.5 the correct clusters still form, which means 0.5 can be used as a threshold for merging clusters. If the threshold is higher (e.g. 0.9), correct clusters no longer form.

In [36]:
# MB6 node types with idf=False, binarised at 0.5 before plotting:
# entries <= 0.5 are set to 0 and entries > 0.5 are set to 1.
sim_matrix = np.matrix(
    mb6_sc.nodetypeset.mergeByJaccardSim(idf=False, return_sim_matrix=True)
)
sim_matrix[sim_matrix <= 0.5] = 0
sim_matrix[sim_matrix > 0.5] = 1

plot_mb6(sim_matrix=sim_matrix, title='no idf, with 0.5 as threshold')






In [44]:
# NOTE(review): the similarity matrix computed here is not passed to the plot
# below (plot_mb6 receives sim_matrix=None and embeds the vectorised nodes
# with the cosine metric instead). Confirm whether this call is still needed.
sim_matrix = np.matrix(
    mb6_sc.nodetypeset.mergeByJaccardSim(idf=False, return_sim_matrix=True)
)

plot_mb6(None, 'cosine', metric='cosine')





In [11]:
# MB6: node-type similarity together with the two edge-based matrices
# returned by mergeBySimAndEdge, plus their element-wise average.
# NOTE(review): src/tar are presumably "edges leaving" / "edges entering" a
# node type -- inferred from the variable names, confirm against Schema.
srcN_edges_map, tarN_edges_map = mb6_sc.getNodeEdgeMap()
sim_matrix, src_mat, tar_mat  = \
    mb6_sc.nodetypeset.mergeBySimAndEdge(srcN_edges_map,tarN_edges_map,return_sim_matrix=True)

node_matrix = np.array(sim_matrix)
src_mat = np.array(src_mat)
tar_mat = np.array(tar_mat)
# Unweighted mean of the three similarity matrices.
avg_matrix = (node_matrix+src_mat+tar_mat) / 3

plot_mb6(sim_matrix=src_mat,title='edge src')
plot_mb6(sim_matrix=tar_mat,title='edge tar')
plot_mb6(sim_matrix=node_matrix,title='node matrix')
# Title corrected: this figure shows the average of all three matrices,
# not only the target-edge one.
plot_mb6(sim_matrix=avg_matrix,title='avg of node, edge src and edge tar')

















In [23]:
# SNB: node-type similarity together with the two edge-based matrices
# returned by mergeBySimAndEdge, plus their element-wise average.
# NOTE(review): src/tar are presumably "edges leaving" / "edges entering" a
# node type -- inferred from the variable names, confirm against Schema.
srcN_edges_map, tarN_edges_map = sc.getNodeEdgeMap()
sim_matrix, src_mat, tar_mat  = \
    sc.nodetypeset.mergeBySimAndEdge(srcN_edges_map,tarN_edges_map,return_sim_matrix=True)

node_matrix = np.array(sim_matrix)
src_mat = np.array(src_mat)
tar_mat = np.array(tar_mat)
# Unweighted mean of the three similarity matrices.
avg_matrix = (node_matrix+src_mat+tar_mat) / 3

plot_snb(sim_matrix=src_mat,title='edge src')
plot_snb(sim_matrix=tar_mat,title='edge tar')
plot_snb(sim_matrix=node_matrix,title='node matrix')
# Title corrected: this figure shows the average of all three matrices,
# not only the target-edge one.
plot_snb(sim_matrix=avg_matrix,title='avg of node, edge src and edge tar')

















In [9]:
def plotMb6Edge(sim_matrix,title='edge similarity'):
    """Embed the MB6 edge patterns in 2-D with t-SNE and show a scatter plot.

    Parameters
    ----------
    sim_matrix : object with a ``toarray()`` method
        Pairwise edge-pattern similarity matrix; ``1 - sim_matrix.toarray()``
        is passed to t-SNE as a precomputed distance matrix.
    title : str, default 'edge similarity'
        Title of the plotly figure.
    """
    # init='random' is spelled out because the 'pca' initialisation (the
    # default since scikit-learn 1.2) cannot be used with precomputed distances.
    tsne = TSNE(n_components=2, random_state=0,metric='precomputed',init='random')
    X_2d = tsne.fit_transform(1-sim_matrix.toarray())
    labels = mb6_sc.edgetypeset.getListOfEdgeInfo()
    layout = go.Layout(
        plot_bgcolor='rgba(0,0,0,0)'
    )
    # Each entry is a set of tokens that must ALL occur in an edge pattern
    # (split on ',') for the pattern to receive that entry's colour.
    edge_types = [['Segment','Contains'],['SynapsesTo'],['Contains','Synapse'],
                  ['SynapseSet','ConnectsTo'],
                 ['ConnectsTo','mushroombody_Segment']]
    colomap = getRandomColor(len(edge_types))
    # Colour for patterns matching no entry, so that color_list always has
    # exactly one colour per label and stays aligned with the points.
    fallback_color = 'lightgray'
    color_list = []
    for edge_pattern in labels:
        lbl_li = edge_pattern.split(',')
        color = fallback_color
        for ind,lblset in enumerate(edge_types):
            if all(indi_lbl in lbl_li for indi_lbl in lblset):
                # First matching entry wins; stop so only one colour is used.
                color = colomap[ind]
                break
        color_list.append(color)
    fig = go.Figure(data=go.Scatter(x=X_2d[:, 0],
                                    y=X_2d[:, 1],
                                    mode='markers',
                                    marker_color=color_list,
                                    text=labels),  # hover text goes here
                                    layout=layout)

    fig.update_layout(title=title)
    fig.show()
# mergeByEdgeSim() yields three edge-pattern similarity matrices; plot each
# one separately and then their unweighted mean.
src_matrix, tar_matrix, edge_matrix = mb6_sc.edgetypeset.mergeByEdgeSim()
plotMb6Edge(sim_matrix=src_matrix, title='edge patterns with src similarity')
plotMb6Edge(sim_matrix=tar_matrix, title='edge patterns with target similarity')
plotMb6Edge(sim_matrix=edge_matrix, title='edge patterns with edge similarity')
plotMb6Edge(
    sim_matrix=(src_matrix+tar_matrix+edge_matrix)/3,
    title='avg similarity edge patterns',
)

















In [3]:
# SNB edge patterns: t-SNE on the averaged edge similarity.
# Elsewhere in this notebook mergeByEdgeSim() is unpacked into three matrices
# (src, target, edge). Accept that shape as well as a single, already summed
# matrix so the cell runs with either return form.
edge_sims = sc.edgetypeset.mergeByEdgeSim()
if isinstance(edge_sims, (tuple, list)):
    src_sim, tar_sim, edge_sim = edge_sims
    edge_sims = src_sim + tar_sim + edge_sim
sim_matrix = edge_sims.toarray()

# init='random' is spelled out because the 'pca' initialisation (the default
# since scikit-learn 1.2) cannot be used with precomputed distances.
tsne = TSNE(n_components=2, random_state=0,metric='precomputed',init='random')
# sim_matrix is a sum of three similarities, so divide by 3 before turning
# it into a distance.
X_2d = tsne.fit_transform(1-sim_matrix/3)
labels = sc.edgetypeset.getListOfEdgeInfo()
layout = go.Layout(
    plot_bgcolor='rgba(0,0,0,0)'
)
fig = go.Figure(data=go.Scatter(x=X_2d[:, 0],
                                y=X_2d[:, 1],
                                mode='markers',
                                text=labels),  # hover text goes here
                                layout=layout)

fig.update_layout(title='snb edge jaccard similarity, each point is an edge pattern')
fig.show()





In [15]:
# Load the shop schema and compute its node-type vectors and similarity matrix.
# `pickle` is rebound to the pickle5 module here (an earlier cell rebinds it
# to the standard-library module).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
from pickle5 import pickle
# `with` closes the file handle once the schema has been loaded.
with open('shop_attr_based_schema.pkl','rb',-1) as schema_file:
    shop_sc = pickle.load(schema_file)
x,labels = shop_sc.nodetypeset.plotVectorizedNodes()
sim_matrix = shop_sc.nodetypeset.mergeByJaccardSim(return_sim_matrix=True,idf=False)

In [16]:
from lxml import etree
from matplotlib.cm import cool
# Assign one colour to every shop node label, based on the type aliases
# listed in shop.xml.
xml_tree = etree.parse('shop_data/shop.xml')
node_labels = [i.text for i in xml_tree.findall(".//types/alias")]
# One random colour per alias (same helper the MB6 edge plot uses).
colors = getRandomColor(len(node_labels))
# Colour for labels matching no alias, so shop_color always holds exactly one
# entry per label and stays aligned with the plotted points.
shop_fallback_color = 'lightgray'
shop_color = []
for l in labels:
    lb_li = l.split(';')
    matches = [i for i,nl in enumerate(node_labels) if nl in lb_li]
    if len(matches) > 1:
        # Diagnostic: this label carries more than one known alias;
        # the first matching alias decides the colour.
        print(l)
    shop_color.append(colors[matches[0]] if matches else shop_fallback_color)
#             break
        

In [17]:
# Shop node types: t-SNE on 1 - similarity (computed with idf=False), coloured
# by shop_color.
# init='random' is spelled out because the 'pca' initialisation (the default
# since scikit-learn 1.2) cannot be used with precomputed distances.
tsne = TSNE(n_components=2, random_state=0,metric='precomputed',init='random',perplexity=50)
X_2d = tsne.fit_transform(1-sim_matrix.toarray())
layout = go.Layout(
    plot_bgcolor='rgba(0,0,0,0)'
)
fig = go.Figure(data=go.Scatter(x=X_2d[:, 0],
                                y=X_2d[:, 1],
                                mode='markers',
                                marker_color=shop_color,
                                opacity=0.5,
                                text=labels),  # hover text goes here
                                layout=layout)

# Title corrected: the points are node patterns (labels come from the node
# type set), not edge patterns.
fig.update_layout(title='shop node jaccard similarity, each point is a node pattern,no weighting')
fig.show()



