In [None]:
# Persist the vectors in gensim's native format so they can be loaded fast again.
# NOTE: this cell uses `wv` and `time`, which are only created by the
# vector-loading cell further down, so that cell has to be executed first.
de_gensim_fpath = "model/cc.de.300.word_vectors.gensim"

tic = time()
wv.save(de_gensim_fpath)
elapsed = time() - tic
print("Saved in {} sec.".format(elapsed))

In [None]:
from gensim.models import Word2Vec

# Reload the vectors from the native gensim file written by `wv.save(...)`.
# `wv` is a KeyedVectors instance, so the file holds bare word vectors rather
# than a full Word2Vec model and has to be read back with KeyedVectors.load;
# Word2Vec.load expects a complete trained model and is the wrong loader here.
from time import time  # imported locally: this cell precedes the main import cell

from gensim.models import KeyedVectors

tic = time()
wvg = KeyedVectors.load(de_gensim_fpath)
print("Loaded in {} sec.".format(time()-tic))

In [None]:
from word_sense_induction import minimize 
from chinese_whispers import chinese_whispers, aggregate_clusters
from networkx import Graph
from gensim.models import KeyedVectors
from time import time 


# Notebook-wide settings.
TOPN = 50        # default number of nearest neighbours requested per query
verbose = True   # when truthy, wsi() prints the time spent per word

# `wv` is loaded once here and used as a global by every helper below.
# The vectors come from a text-format (binary=False) word2vec file;
# unicode_errors="ignore" is forwarded to gensim's loader as-is.
word_vectors_fpath = "model/cc.de.300.word_vectors"
tic = time()
wv = KeyedVectors.load_word2vec_format(
    word_vectors_fpath,
    binary=False,
    unicode_errors="ignore",
)
print("Loaded in {} sec.".format(time() - tic))


def get_nns(target, topn=TOPN):
    """Return the nearest neighbours of ``target`` as ``(word, score)`` pairs.

    Queries the global ``wv`` for the ``topn`` most similar words and drops
    every neighbour whose ``minimize``-d form equals the ``minimize``-d form
    of ``target`` (presumably spelling/case variants of the same word --
    ``minimize`` is defined in ``word_sense_induction``; confirm there).
    The result can therefore contain fewer than ``topn`` entries.
    """
    candidates = wv.most_similar(positive=[target], negative=[], topn=topn)

    kept = []
    for word, score in candidates:
        if minimize(word) != minimize(target):
            kept.append((word, score))
    return kept


def in_nns(nns, word):
    """Tell whether ``word`` occurs in the neighbour list ``nns``.

    ``nns`` is an iterable of ``(word, score)`` pairs. Words are compared
    after passing both through ``minimize``; the scores are ignored.
    Returns ``True`` on the first match, ``False`` when nothing matches.
    """
    return any(minimize(word) == minimize(w) for w, _ in nns)


def get_pair(first, second):
    """Return the two arguments as a 2-tuple in ascending order.

    Gives an order-independent key for an unordered pair, so that
    ``get_pair(a, b) == get_pair(b, a)``.
    """
    # Swap only when the second item sorts strictly before the first.
    if second < first:
        return (second, first)
    return (first, second)


def get_disc_pairs(ego, topn=TOPN):
    """Collect the neighbour pairs of ``ego`` that must not be linked.

    For every neighbour ``topi`` of ``ego`` the global ``wv`` is asked for the
    word most similar to "``ego`` minus ``topi``" (``topi`` passed as a
    negative example). When that word (``untopi``) is itself one of ``ego``'s
    neighbours, the ordered pair ``(topi, untopi)`` is recorded.

    Args:
        ego: the word whose neighbourhood is analysed.
        topn: number of neighbours requested from ``wv`` in each query.

    Returns:
        A set of 2-tuples as produced by ``get_pair``.
    """
    pairs = set()
    nns = get_nns(ego, topn)

    # The neighbours of each ``topi`` used to be fetched here as well, but the
    # result was never read; dropping it saves one similarity query per
    # neighbour without changing the returned pairs.
    for topi, _ in nns:
        nns_untopi = wv.most_similar(positive=[ego], negative=[topi], topn=topn)
        if not nns_untopi:
            # Nothing came back (e.g. a tiny vocabulary): no candidate to pair.
            continue

        untopi = nns_untopi[0][0]
        if in_nns(nns, untopi):
            pairs.add(get_pair(topi, untopi))

    return pairs


def get_nodes(pairs):
    """Return the set of all words that appear in any of the given pairs.

    ``pairs`` is an iterable of 2-element ``(src, dst)`` items.
    """
    return {node for src, dst in pairs for node in (src, dst)}


def foo(ego, topn=TOPN):
    """Return ``(pairs, nodes)`` for ``ego``.

    ``pairs`` is the result of ``get_disc_pairs(ego, topn)`` and ``nodes`` is
    the set of words occurring in those pairs (``get_nodes``).
    """
    disc_pairs = get_disc_pairs(ego, topn)
    return disc_pairs, get_nodes(disc_pairs)


def list2dict(lst):
    """Build a dict from a sequence of items, mapping ``item[0]`` to ``item[1]``.

    When a key occurs more than once the last occurrence wins.
    """
    mapping = {}
    for item in lst:
        mapping[item[0]] = item[1]
    return mapping


def wsi(ego, topn=TOPN):
    """Induce word senses for ``ego`` by clustering its ego network.

    Steps:
      1. ``get_disc_pairs`` yields the neighbour pairs that must stay
         unconnected; the words in those pairs become the graph nodes.
      2. Every node is linked to those of its own nearest neighbours that are
         also nodes, weighted by the similarity score, except where the pair
         is one of the excluded pairs.
      3. ``chinese_whispers`` clusters the graph in place; the plotting cells
         read the result from each node's ``'label'`` attribute.

    Args:
        ego: the word to analyse.
        topn: neighbourhood size, used for every neighbour query and as the
            cap on the number of edges added per node.

    Returns:
        The clustered ``networkx.Graph`` named after ``ego``.
    """
    tic = time()
    ego_network = Graph(name=ego)

    pairs = get_disc_pairs(ego, topn)
    nodes = get_nodes(pairs)
    ego_network.add_nodes_from(nodes)

    for r_node in ego_network:
        # Honour the caller's ``topn`` here as well; previously this query
        # silently fell back to the global default.
        related_related_nodes = list2dict(get_nns(r_node, topn))

        # (score, word) tuples so that sorting orders by similarity first.
        related_related_nodes_ego = sorted(
            [(related_related_nodes[rr_node], rr_node)
             for rr_node in related_related_nodes if rr_node in ego_network],
            reverse=True)[:topn]

        related_edges = []
        for w, rr_node in related_related_nodes_ego:
            if get_pair(r_node, rr_node) not in pairs:
                related_edges.append((r_node, rr_node, {"weight": w}))
            elif verbose:
                # Diagnostic output follows the same switch as the timing line.
                print("Skipping:", r_node, rr_node)
        # Both endpoints are already nodes, so the node set being iterated
        # does not change while edges are added.
        ego_network.add_edges_from(related_edges)

    chinese_whispers(ego_network, weighting="top", iterations=20)
    if verbose:
        print("{}\t{:f} sec.".format(ego, time()-tic))

    return ego_network


# Build the clustered ego networks for the two example words; the plotting
# and cluster-listing cells below read them through the names `m` and `s`.
m = wsi("Maus")
s = wsi("Schlange")

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Draw the clustered ego network of "Maus" (`m` comes from the wsi cell).
G = m

# Each node carries its cluster id under 'label'; 1/label is used as the
# value that the colour map turns into a node colour.
# `G.nodes[...]` replaces the `G.node[...]` accessor, which was removed in
# networkx 2.4 and raises AttributeError on current releases.
colors = [1. / G.nodes[node]['label'] for node in G.nodes()]

fig = plt.gcf()
fig.set_size_inches(10, 10)

nx.draw_networkx(G, cmap=plt.get_cmap('gist_rainbow'), node_color=colors, font_color='black')

plt.show()

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Draw the clustered ego network of "Schlange" (`s` comes from the wsi cell).
G = s

# Each node carries its cluster id under 'label'; 1/label is used as the
# value that the colour map turns into a node colour.
# `G.nodes[...]` replaces the `G.node[...]` accessor, which was removed in
# networkx 2.4 and raises AttributeError on current releases.
colors = [1. / G.nodes[node]['label'] for node in G.nodes()]

fig = plt.gcf()
fig.set_size_inches(10, 10)

nx.draw_networkx(G, cmap=plt.get_cmap('gist_rainbow'), node_color=colors,
                 font_color='black')

plt.show()

In [None]:
# List the clusters found for "Maus", largest cluster first.
G = m
print('Cluster ID\tCluster elements\n')
clusters_by_size = sorted(
    aggregate_clusters(G).items(),
    key=lambda item: len(item[1]),  # item is (label, members)
    reverse=True,
)
for label, cluster in clusters_by_size:
    print('{}\t{}\n'.format(label, cluster))


In [None]:
# List the clusters found for "Schlange", largest cluster first.
G = s
print('Cluster ID\tCluster elements\n')
clusters_by_size = sorted(
    aggregate_clusters(G).items(),
    key=lambda item: len(item[1]),  # item is (label, members)
    reverse=True,
)
for label, cluster in clusters_by_size:
    print('{}\t{}\n'.format(label, cluster))



In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
# NOTE(review): this cell sits at the end of the notebook; it only helps if it
# is run before the cells that import the local modules -- consider moving it up.
%load_ext autoreload
%autoreload 2