In [1]:
from manager import DatasetManager
from notebook_helpers import setup_plotly

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import scipy.spatial.distance as dist
import math
import plotly.plotly as py
from plotly.graph_objs import *

In [50]:
# Load the word table and keep a single column, labelled with the integer 0,
# that holds the words. 'Unnamed: 0' is dropped (presumably a saved index
# column -- confirm against how words_df.csv was written).
df = pd.read_csv('words_df.csv')
df[0] = df.pop("0")  # move the string-labelled "0" column to integer label 0
del df['Unnamed: 0']

# Numeric data loaded as an array; get_local_1_skeleton indexes it by the row
# position of a word in df.
vertices = np.loadtxt('word2vec_data.txt')

setup_plotly()

In [26]:
def get_local_1_skeleton(word, epsilon, dist_f):
    """Build the local 1-skeleton graph around ``word``.

    Reads the notebook-level globals ``df`` (column ``0`` holds the words) and
    ``vertices`` (indexed by the same ids as ``df``'s index -- assumes ``df``
    has a default 0..n-1 index; confirm if the table is ever re-indexed).

    Args:
        word: the word whose neighbourhood is wanted; must occur in ``df[0]``.
        epsilon: distance threshold; a vertex is a neighbour when its
            ``dist_f`` distance to the word's vector is ``<= epsilon``.
        dist_f: callable ``dist_f(vector_a, vector_b)`` returning a distance.

    Returns:
        A ``networkx.Graph`` whose nodes are the neighbour ids, each carrying
        ``word`` and ``distance`` attributes. Edges join the word's vertex to
        every neighbour (including itself, as in the original behaviour) and,
        for every 3-element simplex reported by
        ``DatasetManager.report_on_vertex``, the two vertices other than the
        word's own vertex.

    Raises:
        IndexError: if ``word`` does not occur in ``df[0]``.
    """
    matches = df.index[df[0] == word]
    if len(matches) == 0:
        # Same exception type the bare ``[0]`` lookup raised, clearer message.
        raise IndexError('word {0!r} not found in df[0]'.format(word))
    vertex_id = matches[0]
    vertex_vector = vertices[vertex_id]

    # Measure each distance once and reuse it for the node attribute instead
    # of calling dist_f a second time per neighbour.
    nbr_distances = []
    for i, vect in enumerate(vertices):
        distance = dist_f(vertex_vector, vect)
        if distance <= epsilon:
            nbr_distances.append((i, distance))

    words = df[0]
    skeleton_1 = nx.Graph()
    for nbr, distance in nbr_distances:
        skeleton_1.add_node(nbr, word=words[nbr], distance=distance)
    skeleton_1.add_edges_from([(vertex_id, nbr) for nbr, _ in nbr_distances])

    manager = DatasetManager(vertices=vertices,
                             centers_num=lambda x: int(math.sqrt(x)),
                             distance_funct=dist_f,
                             epsilon=epsilon)
    manager.get_centers_ready()
    _, _, local_vr = manager.report_on_vertex(vertex_id)

    # Each 3-element simplex contributes the edge opposite the word's vertex.
    # NOTE(review): this relies on every such simplex containing vertex_id;
    # otherwise the tuple would have three entries -- confirm with
    # DatasetManager.report_on_vertex.
    triangles = [simplex for simplex in local_vr if len(simplex) == 3]
    edges = [tuple(simplex.difference({vertex_id})) for simplex in triangles]
    skeleton_1.add_edges_from(edges)
    return skeleton_1

In [5]:
def arccosdist(vect1, vect2):
    """Return the angle between two vectors, in degrees.

    Args:
        vect1, vect2: numpy arrays of equal shape.

    Returns:
        ``0`` when the arrays are element-wise identical, otherwise the
        arccosine of their cosine similarity converted to degrees.
    """
    if (vect1 == vect2).all():
        return 0
    cos_similarity = 1 - dist.cosine(vect1, vect2)
    # Rounding error can leave the similarity a hair outside [-1, 1] for
    # (anti-)parallel vectors, which would make arccos return NaN; clamp first.
    cos_similarity = np.clip(cos_similarity, -1.0, 1.0)
    return math.degrees(np.arccos(cos_similarity))

In [6]:
def visualise_1_skeleton(skeleton, title, fname):
    """Draw a 1-skeleton graph as a plotly scatter figure.

    Args:
        skeleton: a networkx graph whose nodes all carry ``word`` and
            ``distance`` attributes.
        title: figure title.
        fname: file name handed to ``py.plot`` for the resulting figure.

    Nodes are laid out with ``nx.spring_layout`` and coloured by their
    ``distance`` attribute. A node whose distance is falsy (i.e. 0) is given
    the colour value ``max_dist + 1`` instead, so it sits past the far end of
    the scale. Hover text shows the word and its distance rounded to 3 places.
    """
    pos = nx.spring_layout(skeleton)

    # Edge coordinates are gathered in plain lists and passed to the trace
    # constructor, rather than mutating trace properties in place.
    # Each edge is an (x0, x1, None) / (y0, y1, None) run; the None breaks the
    # line so separate edges are not joined together.
    edge_x, edge_y = [], []
    for start, end in skeleton.edges():
        x0, y0 = pos[start]
        x1, y1 = pos[end]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    edge_trace = Scatter(
        x=edge_x,
        y=edge_y,
        line=Line(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    nodes = list(skeleton.nodes(data=True))
    distances = [data['distance'] for _, data in nodes]
    # Guard the empty graph: max() of an empty list raises ValueError.
    max_dist = max(distances) if distances else 0

    node_x, node_y, node_color, node_text = [], [], [], []
    for node, data in nodes:
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_color.append(data['distance'] or max_dist + 1)
        node_text.append("{0}, distance: {1}".format(data['word'], round(data['distance'], 3)))

    node_trace = Scatter(
        x=node_x,
        y=node_y,
        text=node_text,
        mode='markers',
        hoverinfo='text',
        marker=Marker(
            showscale=True,
            # colorscale options
            # 'Greys' | 'Greens' | 'Bluered' | 'Hot' | 'Picnic' | 'Portland' |
            # 'Jet' | 'RdBu' | 'Blackbody' | 'Earth' | 'Electric' | 'YlOrRd' | 'YlGnBu'
            colorscale='YlGnBu',
            reversescale=True,
            color=node_color,
            size=10,
            colorbar=dict(
                thickness=15,
                title='Distance from the word',
                xanchor='left',
                titleside='right'
            ),
            line=dict(width=2)))

    fig = Figure(data=Data([edge_trace, node_trace]),
                 layout=Layout(
                    title=title,
                    titlefont=dict(size=16),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40),
                    xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False)))

    py.plot(fig, filename=fname)

In [77]:
# Build the local 1-skeleton for one word, drop the word's own vertex so only
# its neighbours remain, then plot the result.
word, epsilon = 'corporation', 76
vertex_id = df.index[df[0] == word][0]

net = get_local_1_skeleton(word, epsilon, arccosdist)
net.remove_node(vertex_id)

plot_title = 'Local 1-skeleton of word "{0}" (word2vec, {1})'.format(word, epsilon)
plot_fname = '{0}_word2vec_{1}'.format(word, epsilon)
visualise_1_skeleton(net, title=plot_title, fname=plot_fname)

In [35]:
# Notebook-level DatasetManager over all vertices, using the angular distance
# and whatever value `epsilon` currently holds in the kernel. The number of
# centers is the integer part of the square root of the argument it is given.
manager = DatasetManager(
    vertices=vertices,
    epsilon=epsilon,
    distance_funct=arccosdist,
    centers_num=lambda x: int(math.sqrt(x)),
)
manager.get_centers_ready()

In [36]:
# NOTE(review): 76 is a hard-coded vertex id (it is not looked up from a
# word); confirm which word it refers to. It coincides numerically with the
# epsilon value used in another cell, which is easy to confuse.
vertex_report = manager.report_on_vertex(76)
simplex_counter, operator_counter, local_vr = vertex_report

In [37]:
# Display the first value returned by manager.report_on_vertex.
# NOTE(review): the keys appear to be simplex dimensions (key 2 has count 21,
# matching the 21 three-word simplices listed further down) -- confirm against
# DatasetManager.
simplex_counter

Counter({0: 1, 1: 14, 2: 21, 3: 7})

In [38]:
# Translate every simplex from vertex ids into the set of corresponding words.
local_vr_words = [
    set(df[0][node] for node in simplex)
    for simplex in local_vr
]

In [53]:
# Show only the simplices made of exactly three words.
list(filter(lambda simplex_words: len(simplex_words) == 3, local_vr_words))

[{'bank', 'corporation', 'firm'},
 {'bank', 'corporation', 'fund'},
 {'bank', 'corporation', 'institution'},
 {'bank', 'branch', 'institution'},
 {'bank', 'corporation', 'depository'},
 {'bank', 'branch', 'depository'},
 {'bank', 'depository', 'transaction'},
 {'bank', 'depository', 'institution'},
 {'bank', 'branch', 'store'},
 {'bank', 'branch', 'supermarket'},
 {'bank', 'store', 'supermarket'},
 {'bank', 'institution', 'thrift'},
 {'bank', 'fund', 'treasury'},
 {'bank', 'depository', 'treasury'},
 {'bank', 'fund', 'savings'},
 {'bank', 'savings', 'thrift'},
 {'bank', 'savings', 'treasury'},
 {'bank', 'branch', 'deposit'},
 {'bank', 'deposit', 'depository'},
 {'bank', 'deposit', 'treasury'},
 {'bank', 'deposit', 'savings'}]