In [None]:
%matplotlib inline
import re, scipy
import numpy as np
from os import mkdir
import os
from time import process_time

import networkx as nx
import community

from collections import Counter
from operator import itemgetter

from gensim.models import Word2Vec

import matplotlib.pyplot as plt
from matplotlib import rc
 
font = {'family': 'Verdana',
        'weight': 'normal'}
rc('font', **font)

from scipy.spatial.distance import pdist, squareform
from scipy.cluster import hierarchy

import warnings
warnings.filterwarnings('ignore')

In [None]:
from collections import namedtuple
# Per-model record with five fields: filename, threshold, topk, binary, verbs.
# make_combined_graph reads .verbs (membership tests), .binary (its .similarity method)
# and .threshold (cut-off for adding an edge).
Model = namedtuple('Model', ['filename', 'threshold', 'topk', 'binary', 'verbs'])

def make_combined_graph(model_designation_list):
    """Build one unweighted verb graph from several word2vec models.

    Two verbs are connected when at least one model that contains both of
    them scores their similarity above that model's own threshold.

    Parameters
    ----------
    model_designation_list : iterable of (filename, threshold)
        One entry per model; ``filename`` is passed to ``load_model`` and
        ``threshold`` is the similarity cut-off used for that model.

    Returns
    -------
    networkx.Graph
        Nodes are the union of the verb lists of all models.
    """
    model_list, verb_list_combined = [], []
    for model_designation in model_designation_list:
        print('Processing model {} with threshold {}.'.format(*model_designation))
        filename, threshold = model_designation[0], model_designation[1]
        model_bin = load_model(filename)
        verb_list = get_verbs(model_bin)
        verb_list_combined.extend(verb_list)
        # Model has five fields; build it by keyword so every value lands in the
        # right slot. No top-k setting applies to a threshold graph, hence None.
        model_list.append(Model(filename=filename, threshold=threshold, topk=None,
                                binary=model_bin, verbs=set(verb_list)))
        print()
    verb_list_combined = list(set(verb_list_combined))
    print('Combined verb list volume {}.\n'.format(len(verb_list_combined)))

    print('Creating new graph...    ', end = '')
    G = nx.Graph()
    print('Done!\nAdding nodes...          ', end = '')
    G.add_nodes_from(verb_list_combined)
    print('Done!\nAdding edges...          ', end = '')

    for i, verb1 in enumerate(verb_list_combined):
        for verb2 in verb_list_combined[i+1:]:
            for model in model_list:
                if verb1 in model.verbs and verb2 in model.verbs:
                    if model.binary.similarity(verb1, verb2) > model.threshold:
                        G.add_edge(verb1, verb2)
                        # The edge is unweighted, so one qualifying model is
                        # enough; skip the remaining similarity look-ups.
                        break

    print('Done!\n')
    print(nx.info(G))
    return G

In [None]:
def load_model(fname):
    """Load a binary word2vec-format model from ``fname``.

    Prints the size of the model's vocabulary and returns the loaded model.
    """
    w2v = Word2Vec.load_word2vec_format(fname, binary=True)
    vocab_size = len(w2v.vocab)
    print('Model vocabulary volume', vocab_size)
    return w2v

def get_mverbs(model):
    """Return the set of vocabulary entries of ``model`` that look like verbs.

    An entry is kept when ``re.match('.*_V', entry)`` succeeds, i.e. when the
    entry contains ``_V`` after any run of non-newline characters (the match
    is not anchored at the end of the entry). The size of the resulting set
    is printed.
    """
    verb_pattern = '.*_V'
    tagged = {entry for entry in model.vocab if re.match(verb_pattern, entry)}
    print('Model verb list volume', len(tagged))
    return tagged

def get_fverbs():
    """Read the verb list from ``aux/all_verbs.txt``.

    The file is read as UTF-8, one entry per line; the text before the first
    tab of each line is taken and suffixed with ``_V``. The number of distinct
    entries is printed and the set is returned.
    """
    with open('aux/all_verbs.txt', 'r', encoding='utf-8') as ifile:
        tagged = {row.strip('\n').split('\t')[0] + '_V' for row in ifile}
    print('Frequency dictionary volume', len(tagged))
    return tagged

def get_verbs(model):
    """Return, as a list, the verbs present both in ``model`` and in the verb file.

    Combines ``get_mverbs(model)`` with ``get_fverbs()`` by set intersection
    and prints the size of the result.
    """
    from_model = get_mverbs(model)
    from_file = get_fverbs()
    shared = list(from_model & from_file)
    print('Intersected list volume', len(shared))
    return shared

def make_graph(model, verb_list, threshold, name=''):
    """Build a weighted similarity graph over ``verb_list``.

    Every unordered pair of verbs is scored with ``model.similarity``; pairs
    scoring strictly above ``threshold`` get an edge whose ``weight`` is the
    score. ``threshold`` is also stored as a graph attribute. ``name`` is
    accepted but not used.

    Returns the resulting ``networkx.Graph`` after printing its summary.
    """
    G = nx.Graph(threshold = threshold)
    print("Created graph")
    G.add_nodes_from(verb_list)
    print("Added nodes")
    total = len(verb_list)
    for left in range(total):
        first = verb_list[left]
        for right in range(left + 1, total):
            second = verb_list[right]
            score = model.similarity(first, second)
            if score > threshold:
                G.add_edge(first, second, weight=score)
    print(nx.info(G))
    return G

def make_graph_topK(model, verb_list, topK=10, name=''):
    """Build a nearest-neighbour graph over ``verb_list``.

    For every verb the ``topK`` most similar vocabulary entries are requested
    from ``model.most_similar``; an edge is added from the verb to each of
    those entries that is itself in ``verb_list``.

    Parameters
    ----------
    model : object exposing ``most_similar(positive=[...], topn=...)``
    verb_list : list of str
        Graph nodes.
    topK : int
        Number of neighbours requested per verb; stored as a graph attribute.
    name : str
        Accepted but not used.

    Returns
    -------
    networkx.Graph
    """
    print('Creating new graph...    ', end = '')
    G = nx.Graph(topK = topK)
    print('Done!\nAdding nodes...          ', end = '')
    G.add_nodes_from(verb_list)
    print('Done!\nCalculating edges...          ', end = '')
    btime = process_time()
    # Keep every (verb, neighbour) pair. A dict keyed by the neighbour would
    # retain only the last verb that listed it and drop all other edges.
    candidate_edges = []
    for verb in verb_list:
        sim_words = model.most_similar(positive=[verb], topn=topK)
        candidate_edges.extend((verb, sim_word[0]) for sim_word in sim_words)
    print("Done in {}".format(process_time() - btime))

    print('Filtering edges...          ', end = '')
    btime = process_time()
    # Only neighbours that are themselves nodes of the graph may form an edge.
    verb_set = set(verb_list)
    filtered = [(verb, word) for verb, word in candidate_edges if word in verb_set]
    print("Done in {}".format(process_time() - btime))

    print('Adding edges...          ', end = '')
    btime = process_time()
    G.add_edges_from(filtered)
    print("Done in {}".format(process_time() - btime))

    print(nx.info(G))
    return G

def plot_subs(G, dirname):
    """Save one PNG per connected component of ``G`` into ``dirname``.

    Files are named ``0.png``, ``1.png``, ... in the order in which
    ``nx.connected_component_subgraphs`` yields the components. The directory
    is created when missing, so the function can be re-run.
    """
    os.makedirs(dirname, exist_ok=True)
    fig = plt.figure(figsize=(20,10), dpi=80)
    # Iterate the generator directly: calling next() in an endless loop ends
    # with an unhandled StopIteration once the components run out.
    for i, sub in enumerate(nx.connected_component_subgraphs(G)):
        plt.clf()
        # clf() discards the axes, so the axis must be hidden after it.
        plt.axis('off')
        nx.draw_networkx(sub, with_labels=True, node_size=1000,
                         font_size=12, node_shape='o', alpha=0.8, node_color='green')
        plt.savefig('{}/{}.png'.format(dirname, i))
    # Everything is on disk; release the figure.
    plt.close(fig)
        
def draw(G):
    """Show ``G`` on a 20x20 inch figure with large yellow labelled nodes."""
    plt.figure(figsize=(20,20), dpi=80)
    plt.axis('off')
    options = dict(with_labels=True, node_size=12000, font_size=18,
                   node_shape='o', alpha=0.8, node_color='yellow',
                   font_family='Verdana')
    nx.draw_networkx(G, **options)
    plt.show()
    
def draw_good(G, filename=None):
    """Draw ``G`` with a spring layout and labels on a 30x30 inch figure.

    When ``filename`` is given the figure is also saved to that path.
    """
    plt.figure(figsize=(30,30), dpi=80)
    plt.axis('off')
    nx.draw_spring(G, with_labels=True, node_size=2000, font_size=20, font_family='Verdana')
    if filename is None:
        return
    plt.savefig(filename)
    
def draw_colored(G, func, cmap_name=None, colors=None):
    """Draw ``G`` with labels on a 30x30 inch figure using drawing function ``func``.

    ``func`` is called as ``func(G, **options)``. Node colours are applied
    only when both ``cmap_name`` and ``colors`` are supplied.
    """
    plt.figure(figsize=(30,30), dpi=80)
    plt.axis('off')
    options = dict(with_labels=True, node_size=2000, font_size=20, font_family='Verdana')
    if not (cmap_name is None or colors is None):
        options.update(cmap=plt.get_cmap(cmap_name), node_color=colors)
    func(G, **options)
    
def draw_colored_small(G, func, cmap_name=None, colors=None):
    """Draw ``G`` with small (size 50) nodes on a 20x20 inch figure using ``func``.

    ``func`` is called as ``func(G, **options)``. Node colours are applied
    only when both ``cmap_name`` and ``colors`` are supplied.
    """
    plt.figure(figsize=(20,20), dpi=80)
    plt.axis('off')
    options = dict(node_size=50, font_family='Verdana')
    if not (cmap_name is None or colors is None):
        options.update(cmap=plt.get_cmap(cmap_name), node_color=colors)
    func(G, **options)
        
def varinfo(G):
    """Print summary statistics of ``G``.

    Printed in order: density, diameter, average shortest path length, the
    sum of per-node triangle counts divided by 3, average clustering and
    transitivity. The notebook calls this on a single connected component.
    """
    print('Density:', nx.density(G))
    print()
    diameter = nx.diameter(G)
    print('Diameter =', diameter)
    mean_path = nx.average_shortest_path_length(G)
    print('Avg shortest path =', mean_path)
    print()
    print('Clustering')
    # Each triangle is counted once at each of its three nodes.
    triangle_total = sum(nx.triangles(G).values())
    print(triangle_total/3)
    print(nx.average_clustering(G))
    print(nx.transitivity(G))
    
def component_hist(components):
    """Print and plot how many components exist for each component size.

    The (size, count) pairs are printed sorted by size, largest first. The
    bar chart omits the first pair, i.e. the largest size.
    """
    size_counts = Counter(nx.number_of_nodes(C) for C in components)
    hist = sorted(size_counts.items(), reverse=True, key = itemgetter(0))
    print(hist)
    tail = hist[1:]
    c = [size for size, _ in tail]
    d = [count for _, count in tail]
    plt.figure(figsize=(25,15))
    plt.bar(c,d,align='center')
    plt.xticks(c)
    plt.grid(True, which='both')
    plt.xlabel("бины")
    plt.show()

## Let's roll

In [None]:
# Load the ruscorpora model, select its verbs with get_verbs, and build a graph
# with make_graph_topK using topK=10.
# NOTE(review): the following cell rebinds G to the combined-model graph, so on a
# top-to-bottom run this graph is replaced before any analysis uses it.
model = load_model('models/ruscorpora.model.bin')
verb_list = get_verbs(model)
G = make_graph_topK(model, verb_list, topK=10)

In [None]:
# Build one graph from two models, each given as (model file, similarity threshold).
# NOTE(review): these file names have no 'models/' prefix, unlike the path used in
# the previous cell — confirm where the files actually live.
model_designation_list = [('web.model.bin', 0.6), ('ruscorpora.model.bin', 0.7)]
G = make_combined_graph(model_designation_list)

In [None]:
# Connected components of G as subgraphs, ordered from largest to smallest.
sub_gen = nx.connected_component_subgraphs(G)
components = sorted(sub_gen, key=nx.number_of_nodes, reverse=True)

In [None]:
# Print the component-size counts and plot them as a bar chart
# (component_hist leaves the largest size out of the plot).
component_hist(components)

In [None]:
# Draw every component with more than 10 nodes, skipping components[0]
# (the largest one, since components is sorted by size in descending order).
for C in components[1:]:
    if nx.number_of_nodes(C) > 10:
        draw_good(C)

In [None]:
# Write one line per component: 1-based rank, node count, then the node names,
# all joined by ', '.
with open('clusters.csv', 'w', encoding='utf-8') as ofile:
    # A dedicated loop variable keeps the notebook-level `I` (the component
    # chosen for analysis) intact when this cell is re-run.
    for rank, component in enumerate(components, start=1):
        # list() makes the concatenation work when nodes() returns a view.
        members = list(component.nodes())
        ofile.write(', '.join([str(rank), str(len(members))] + members) + '\n')

In [None]:
# Draw the whole graph G on a 20x20 inch figure with the axes hidden and
# small (size 50) nodes.
plt.figure(figsize=(20,20), dpi=80)
plt.axis('off')
nx.draw(G, node_size=50)

## Choose subgraph

In [None]:
# Select the largest connected component (components is sorted by size,
# descending) and print its summary statistics.
I = components[0]
print(nx.info(I))
varinfo(I)

In [None]:
# Plot the chosen component twice: a plain nx.draw on the figure created here,
# then draw_good, which opens its own 30x30 figure with a labelled spring layout.
plt.figure(figsize=(20,20), dpi=80)
plt.axis('off')
nx.draw(I, node_size=50)
draw_good(I)

### Various info

In [None]:
# NOTE(review): repeats the varinfo(I) call already made in the cell that selects I.
varinfo(I)

In [None]:
# Degree sequence of the chosen component. dict() makes this work whether
# nx.degree returns a plain dict (networkx 1.x) or a view of (node, degree)
# pairs (networkx 2.x).
degrees = np.asarray(list(dict(nx.degree(I)).values()))
k_mean = np.mean(degrees)
k_square_mean = np.mean(degrees * degrees)
# Reference value: (<k^2> - <k>)^2 / (<k>^3 * n), with n the number of nodes.
random_trans = (k_square_mean - k_mean) * (k_square_mean - k_mean) / (k_mean * k_mean * k_mean) / degrees.shape[0]
print(random_trans)
# Ratio of the observed transitivity to the reference value.
print(nx.transitivity(I) / random_trans)

In [None]:
# Nodes with their degrees, highest degree first. dict() keeps this working
# whether nx.degree returns a dict (networkx 1.x) or a degree view (2.x).
print(sorted(dict(nx.degree(I)).items(), reverse=True, key = itemgetter(1)))

In [None]:
# Bar chart of the degree histogram: d[k] is the number of nodes with degree k.
d = nx.degree_histogram(I)
c = range(0,len(d),1)
plt.figure(figsize=(25,15))
plt.bar(c,d,align='center')
#plt.plot(d,'.', ms =10)
plt.xticks(c)
#plt.yscale('log')
#plt.xscale('log')
plt.grid(True,which='both')
plt.show()

In [None]:
# Raw degree-histogram counts (d comes from the previous cell).
print(d)

### Structural equivalence

In [None]:
# Adjacency matrix of the chosen component; spy() marks its non-zero entries.
A = nx.adjacency_matrix(I)
print(A.shape)

plt.spy(A, precision=0, marker='.', markersize=5)
plt.show()

In [None]:
# Pairwise cosine distances between the rows of the dense adjacency matrix,
# converted to a square matrix M = 1 - distance and shown as an image.
SD=pdist(A.toarray(),'cosine')
M = 1-squareform(SD)
plt.imshow(M,cmap='winter',interpolation='nearest')
plt.show()

In [None]:
# Average-linkage hierarchical clustering of the condensed distances SD from the
# previous cell, rendered as a dendrogram.
Z = hierarchy.average(SD)
hh=hierarchy.dendrogram(Z)

### Cliques

In [None]:
def draw_circ(G):
    """Draw ``G`` in a circular layout with yellow labelled nodes on a 20x10 inch figure."""
    plt.figure(figsize=(20,10), dpi=80)
    plt.axis('off')
    options = dict(with_labels=True, node_size=500, font_size=16, node_color='yellow')
    nx.draw_circular(G, **options)

In [None]:
# Size of the largest clique, number of maximal cliques, then the list of
# cliques returned by nx.find_cliques.
# NOTE(review): printing every clique can produce very long output.
print(nx.graph_clique_number(I))
print(nx.graph_number_of_cliques(I))
cliques = list(nx.find_cliques(I)) 
print(cliques)

In [None]:
# Draw every clique with more than two nodes, largest first, in a circular layout.
for clique in sorted(cliques, key=len, reverse=True):
    if len(clique) > 2:
        iclique = I.subgraph(clique)
        draw_circ(iclique)

### Cores

In [None]:
# Core number of every node, as (node, core number) pairs sorted ascending by
# core number, so the highest core number comes last.
cores=nx.core_number(I)
#print(cores)
sorted_cores = sorted(cores.items(), key=itemgetter(1))
#print(sorted_cores)

In [None]:
# For every level from the highest core number down to 1, take the nodes whose
# core number is at least that level and draw their subgraph twice in a
# circular layout: once with small nodes, once large with labels.
max_core = sorted_cores[-1][1]
for core in range(max_core, 0, -1):
    core_nodes = [coreitem[0] for coreitem in sorted_cores if coreitem[1] >= core]
    CG = I.subgraph(core_nodes)
    draw_colored_small(CG, nx.draw_circular)
    draw_colored(CG, nx.draw_circular)

In [None]:
# Colour every node of the component by its core number using the 'autumn' colormap.
colors = [cores[node] for node in I.nodes()]
draw_colored_small(I, nx.draw, 'autumn', colors)

### Communities

In [None]:
def get_communities(G):
    """Partition ``G`` with ``community.best_partition`` and group nodes by community.

    Returns the list produced by ``flatten_partition``: (community, nodes)
    pairs, largest community first.
    """
    return flatten_partition(community.best_partition(G))

def flatten_partition(partition):
    """Group the keys of a node -> community mapping by community.

    Parameters
    ----------
    partition : dict
        Maps each node to its community identifier.

    Returns
    -------
    list of (community, list of nodes)
        Sorted by community size, largest first; communities of equal size
        keep the order in which they were first encountered.
    """
    grouped = {}
    for node, comm in partition.items():
        # Append in place instead of rebuilding the member list on every node.
        grouped.setdefault(comm, []).append(node)
    return sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)

In [None]:
# Build the community dendrogram of the component and colour each node by its
# community in the level-0 partition.
# presumably level 0 is the finest partition in python-louvain — TODO confirm
dendo = community.generate_dendogram(I)
comms = community.partition_at_level(dendo, 0)
colors = [comms[node] for node in I.nodes()]
draw_colored(I, nx.draw_spring, 'Accent', colors)

In [None]:
# Split the largest component into its level-0 communities, then for every
# community with more than 10 nodes run community detection again on its
# subgraph and draw it coloured by sub-community.
# NOTE(review): comms is a list of (community, nodes) pairs here, whereas the
# previous cell bound the same name to a node -> community dict.
dendo = community.generate_dendogram(components[0])
comms = flatten_partition(community.partition_at_level(dendo, 0))

for comm in comms:
    if len(comm[1]) > 10:
        ComGr = components[0].subgraph(comm[1])
        comdendo = community.generate_dendogram(ComGr)
        compart = community.partition_at_level(comdendo, 0)
        
        colors = [compart[node] for node in ComGr.nodes()]
        draw_colored(ComGr, nx.draw, 'Accent', colors)

### Centralities

In [None]:
def draw_cent(G, cent_dict, lbls=False):
    """Draw ``G`` with node size and colour driven by a centrality score.

    Parameters
    ----------
    G : networkx graph
    cent_dict : dict
        Maps every node of ``G`` to its centrality value.
    lbls : bool
        Whether to draw node labels.

    Node sizes are scaled so that the highest-scoring node gets size 10000.
    When the highest score is 0 (for example betweenness on a complete
    graph) all nodes are tied and are drawn at that same size instead of
    dividing by zero.
    """
    cents = [cent_dict[x] for x in G.nodes()]
    max_cent = max(cents)
    if max_cent == 0:
        sizes = [10000] * len(cents)
    else:
        sizes = [cent / max_cent * 10000 for cent in cents]
    colors = [float(cent) for cent in cents]
    plt.figure(figsize=(20,20), dpi=80)
    plt.axis('off')
    nx.draw(G, node_size=sizes, with_labels=lbls, node_color=colors, cmap = plt.cm.winter, font_size=20)

In [None]:
# For each centrality measure: print the ten highest-scoring nodes and draw the
# component with node size and colour scaled by that measure.
for cent_func in [
    nx.degree_centrality,
    nx.betweenness_centrality,  # was misspelled "betweennes", which raised AttributeError
    nx.closeness_centrality,
    nx.eigenvector_centrality]:

    cent = cent_func(I)
    print(sorted(cent.items(),reverse=True,key = itemgetter(1))[:10])
    draw_cent(I, cent, lbls=True)

In [None]:
# Degree centrality of every node, printed from highest to lowest.
dcent = nx.degree_centrality(I)
print(sorted(dcent.items(),reverse=True,key = itemgetter(1)))

In [None]:
# Draw the component with node size and colour scaled by degree centrality (no labels).
draw_cent(I, dcent)

In [None]:
# Betweenness centrality of every node, printed from highest to lowest.
bcent=nx.betweenness_centrality(I)
print(sorted(bcent.items(),reverse=True,key = itemgetter(1)))

In [None]:
# Draw the component with node size and colour scaled by betweenness centrality (no labels).
draw_cent(I, bcent)

In [None]:
# Closeness centrality of every node, printed from highest to lowest.
ccent=nx.closeness_centrality(I)
print(sorted(ccent.items(),reverse=True,key = itemgetter(1)))

In [None]:
# Draw the component with node size and colour scaled by closeness centrality (no labels).
draw_cent(I, ccent)

In [None]:
# Eigenvector centrality (via nx.eigenvector_centrality_numpy) of every node,
# printed from highest to lowest.
ecent=nx.eigenvector_centrality_numpy(I)
print(sorted(ecent.items(),reverse=True,key = itemgetter(1)))

In [None]:
# Draw the component with node size and colour scaled by eigenvector centrality (no labels).
draw_cent(I, ecent)