In [15]:
from bidict import bidict
import numpy as np
from time import time
from traceback import format_exc
import codecs
from scipy.sparse import csr_matrix


class CRSGraph(object):
    """A word graph stored as a SciPy CSR sparse matrix.

    ``index`` maps every word to its row/column number and, through
    ``index.inv``, every number back to its word. Edge weights are stored as
    ``int16`` values equal to ``floor(weight * 10000)``.
    """

    def __init__(self, neighbors_fpath):
        """Build the graph from the tab-separated file at ``neighbors_fpath``."""
        self._graph, self.index = self._load(neighbors_fpath)

    def _get_or_add(self, dictionary, value):
        """Return the index registered for ``value``, adding it when missing.

        A value that is not yet a key of ``dictionary`` is inserted with the
        current length of ``dictionary`` as its index.
        """
        if value not in dictionary:
            dictionary[value] = len(dictionary)
        return dictionary[value]

    def _load(self, neighbors_fpath):
        """Parse ``src<TAB>dst<TAB>weight`` lines into a sparse matrix.

        Lines that do not have exactly three tab-separated fields, or whose
        weight is not a number, are reported on stdout and skipped.

        Args:
            neighbors_fpath: path of the UTF-8 encoded edge list.

        Returns:
            A ``(graph, index)`` tuple: the square ``csr_matrix`` of scaled
            ``int16`` weights and the ``bidict`` mapping word -> row/column.
        """
        tic = time()
        src_lst = []
        dst_lst = []
        data_lst = []
        index = bidict()

        with codecs.open(neighbors_fpath, "r", "utf-8") as lines:
            for i, line in enumerate(lines):
                if i % 10000000 == 0 and i != 0: print(i)
                try:
                    src, dst, weight = line.split("\t")
                    # Parse the weight before touching any list or the index,
                    # so a bad line can never leave the three lists with
                    # different lengths.
                    scaled = np.int16(np.floor(float(weight) * 10000.))
                except (ValueError, OverflowError):
                    print(format_exc())
                    print("Bad line:", line)
                    continue

                src_lst.append(self._get_or_add(index, src.strip()))
                dst_lst.append(self._get_or_add(index, dst.strip()))
                data_lst.append(scaled)

        # Explicit integer dtype keeps the coordinate arrays valid even when
        # no line could be parsed (an empty list would default to float64).
        rows = np.array(src_lst, dtype=np.int64)
        cols = np.array(dst_lst, dtype=np.int64)
        data = np.array(data_lst, dtype=np.int16)
        matrix = csr_matrix((data, (rows, cols)), shape=(len(index), len(index)), dtype=np.int16)
        print("Loaded in {:f} sec.".format(time() - tic))

        return matrix, index

    def get_neighbors(self, word):
        """Return ``{neighbor_word: stored_weight}`` for ``word``.

        Raises:
            KeyError: if ``word`` is not in ``index``.
        """
        # Slice the row once instead of once (twice) per neighbour.
        row = self._graph[self.index[word]]
        words_by_idx = self.index.inv
        return {words_by_idx[idx_j]: weight for idx_j, weight in zip(row.indices, row.data)}

    def get_weight(self, word_i, word_j):
        """Placeholder: currently always returns ``0.0``."""
        return 0.0
    
# Build a CRSGraph from the file at "model/300" and display the neighbours of "the".
neighbors_fpath = "model/300"
g = CRSGraph(neighbors_fpath)
g.get_neighbors("the")

Loaded in 0.003502 sec.


{'a': 9019,
 'aba': 3363,
 'about': 8359,
 'adults': 4563,
 'age': 3865,
 'albedo': 8933,
 'all': 5281,
 'also': 6652,
 'although': 5261,
 'an': 7645,
 'anarchism': 8013,
 'anarchist': 5943,
 'anarchists': 6828,
 'and': 9257,
 'another': 3830,
 'are': 8579,
 'areas': 4803,
 'as': 8517,
 'asperger_s': 7207,
 'at': 8146,
 'auditor': 3458,
 'autism': 9690,
 'autism_spectrum': 6627,
 'autistic': 9159,
 'autistic_people': 5404,
 'autistics': 7663,
 'bani': 3456,
 'be': 6364,
 'because': 6299,
 'behavior': 3711,
 'books': 3647,
 'but': 5611,
 'buying': 3494,
 'by': 7960,
 'c': 4130,
 'can': 6888,
 'can_be': 5101,
 'child': 4602,
 'childhood': 3740,
 'children': 6285,
 'chrominance': 3349,
 'clouds': 6159,
 'collectivity': 3360,
 'com': 4921,
 'common': 4254,
 'communication': 5853,
 'community': 4139,
 'considered': 3343,
 'created': 3392,
 'crystalline': 4260,
 'culture': 4348,
 'cure': 4756,
 'curriculum': 3363,
 'dark': 4468,
 'desire': 3688,
 'development': 3346,
 'dharmas': 3356,
 'diag

In [41]:
# NOTE(review): the CRSGraph class in this notebook assigns `index`, not `_row_names`;
# this cell presumably ran against an earlier version of the class — confirm or switch to `g.index`.
g._row_names

bidict({'the': 0, 'of': 1})

TypeError: foo() takes 1 positional argument but 2 were given

In [44]:
from itertools import zip_longest
import argparse, sys, subprocess
from utils.common import exists
from os.path import basename
import gensim
import gzip
from gensim.models.keyedvectors import KeyedVectors
from gensim.utils import tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from time import time
import numpy as np
from chinese_whispers import chinese_whispers, aggregate_clusters
import codecs
import networkx as nx
from multiprocessing import Pool
from os.path import join
from collections import defaultdict
import codecs 
from time import time

import filter_clusters
import vector_representations.build_sense_vectors
from utils.common import ensure_dir
import pcz


# def get_ego_network(ego):
#     tic = time()
#     ego_network = nx.Graph(name=ego)

#     # Add related and substring nodes
#     substring_nodes = []
#     for j, node in enumerate(index.keys()):
#         if ego.lower() == node.lower():
#             ###ego_nodes = [(rn, {"weight": G[node][rn]["weight"]}) for rn in G[node].keys()]
#             ego_network.add_nodes_from(ego_nodes)
#         else:
#             if "_" not in node: continue
#             if node.startswith(ego + "_") or node.endswith("_" + ego):

#                 if ego in index and node in ###G[ego]: w = G[ego][node]["weight"]
#                 else: w = 0.99

#                 substring_nodes.append( (node, {"weight": 0.99}) )
#     ego_network.add_nodes_from(substring_nodes)

#     # Find edges of the ego network
#     for r_node in ego_network:
#         related_related_nodes = ###G[r_node]
#         related_related_nodes_ego = sorted(
#             [(related_related_nodes[rr_node]["weight"], rr_node) for rr_node in related_related_nodes if rr_node in ego_network],
#             reverse=True)[:n]
#         related_edges = [(r_node, rr_node, {"weight": w}) for w, rr_node in  related_related_nodes_ego]
#         ego_network.add_edges_from(related_edges)

#     chinese_whispers(ego_network, weighting="top", iterations=20)
#     if verbose: print("{}\t{:f} sec.".format(ego, time()-tic))

#     return ego_network


#G = None
#index = None
# Module-level slot that ego_network_clustering declares `global`; the commented-out
# get_ego_network draft above slices its candidate list with `[:n]`.
n = None


def ego_network_clustering(neighbors_fpath, clusters_fpath, max_related=300, num_cores=32):
    """Cluster the ego network of every word and write the clusters as TSV.

    For each key of the module-level ``index``, ``get_ego_network`` is run in
    a worker pool; the clusters returned by ``aggregate_clusters`` are written
    largest first, one row per cluster, numbered from 1.

    Args:
        neighbors_fpath: not read by this function body; the graph is taken
            from the module-level ``graph`` instead.
        clusters_fpath: path of the UTF-8 TSV file to write
            (header ``word, cid, cluster, isas``).
        max_related: published as the module-level ``n``.
        num_cores: number of worker processes in the pool.

    NOTE(review): ``graph``, ``index`` and ``get_ego_network`` must already
    exist at module level; the only ``get_ego_network`` visible in this
    notebook is commented out — confirm before running.
    """
    global G
    global index
    global n
    G = graph
    # Previously `max_related` was accepted but ignored and the declared
    # global `n` was never assigned; publish the value for the workers.
    n = max_related

    with codecs.open(clusters_fpath, "w", "utf-8") as output, Pool(num_cores) as pool:
        output.write("word\tcid\tcluster\tisas\n")

        for i, ego_network in enumerate(pool.imap_unordered(get_ego_network, index.keys())):
            if i % 100 == 0: print(i, "ego networks processed")
            # Largest clusters first, so sense number 1 is the biggest cluster.
            clusters = sorted(aggregate_clusters(ego_network).items(), key=lambda e: len(e[1]), reverse=True)
            for sense_num, (label, cluster) in enumerate(clusters, start=1):
                # NOTE(review): `Graph.node` was removed in networkx 2.4 in
                # favour of `Graph.nodes` — confirm the installed version.
                output.write("{}\t{}\t{}\t\n".format(
                    ego_network.name,
                    sense_num,
                    ", ".join( ["{}:{:.4f}".format(c_node, ego_network.node[c_node]["weight"]) for c_node in cluster] )
                ))
    print("Clusters:", clusters_fpath)
    

# Load the graph and its row/column name mappings from an absolute, machine-specific path.
# NOTE(review): get_sparse_graph is not defined in the visible cells — confirm where it comes from.
neighbors_fpath = "/home/panchenko/sensegram/model/wiki.txt.graph"
graph, row_names, col_names = get_sparse_graph(neighbors_fpath)

10000000
20000000
30000000
40000000
Loaded in 197.574517 sec.


In [43]:
# Print each neighbour of one word next to the weight stored for that edge.
node_i = "the"
nns_i = graph[row_names[node_i]]
for idx_j, weight_j in zip(nns_i.indices, nns_i.data):
    print(col_names.inv[idx_j], weight_j)
    

a 0.901999
aba 0.336302
about 0.835913
adults 0.456392
age 0.386564
albedo 0.893342
all 0.528141
also 0.665255
although 0.526195
an 0.764593
anarchism 0.801308
anarchist 0.594362
anarchists 0.682834
and 0.925761
another 0.383097
are 0.857928
areas 0.480383
as 0.85173
asperger_s 0.720731
at 0.814688
auditor 0.345872
autism 0.969057
autism_spectrum 0.662745
autistic 0.915966
autistic_people 0.540493
autistics 0.766357
bani 0.345694
be 0.636418
because 0.629942
behavior 0.371195
books 0.364765
but 0.561157
buying 0.349421
by 0.796067
c 0.413061
can 0.688808
can_be 0.510148
child 0.460222
childhood 0.37403
children 0.628527
chrominance 0.334988
clouds 0.615906
collectivity 0.336066
com 0.492151
common 0.425418
communication 0.585384
community 0.413907
considered 0.334309
created 0.33926
crystalline 0.426053
culture 0.434831
cure 0.475607
curriculum 0.336359
dark 0.446822
desire 0.368871
development 0.334654
dharmas 0.335657
diagnosis 0.446196
different 0.572532
disorder 0.749866
disorders 

In [46]:
graph?

In [31]:
# Look up the entry stored under "the".
# NOTE(review): no visible cell assigns a module-level `index` — presumably left over from an earlier run; confirm.
index["the"]

171