In [13]:
from gensim.models import KeyedVectors
import logging
from time import time
from os.path import exists


def try_print(w2v, test_word):
    """Print the nearest neighbours of ``test_word`` together with their scores.

    Args:
        w2v: Object exposing ``most_similar(key)`` that yields ``(word, score)``
            pairs (a gensim ``KeyedVectors`` instance in this notebook).
        test_word: Key to look up.

    A missing key is reported with a warning line instead of raising. Only
    ``KeyError`` is treated as "not found"; any other failure propagates so
    that real problems are not mislabelled as a missing word.
    """
    try:
        neighbours = w2v.most_similar(test_word)
    except KeyError:
        print("Warning: word '{}' not found.".format(test_word))
        return

    for word, score in neighbours:
        print(word, score)
        
    
def load_and_pickle(w2v_fpath, binary=False):
    """Load word vectors, using a saved copy stored next to the source file.

    The saved copy lives at ``w2v_fpath + ".pkl"``. When that file exists it is
    loaded with ``KeyedVectors.load``. Otherwise the word2vec-format file at
    ``w2v_fpath`` is parsed (undecodable bytes are ignored), ``init_sims(replace=True)``
    is applied, the neighbours of two probe words are printed as a sanity check,
    and the model is saved to the ``.pkl`` path.

    Args:
        w2v_fpath: Path to the word2vec-format file.
        binary: Forwarded to ``KeyedVectors.load_word2vec_format``.

    Returns:
        Tuple ``(w2v, w2v_pkl_fpath)``: the loaded model and the path of its saved copy.

    Side effects: calls ``logging.basicConfig`` at INFO level on every invocation
    and prints the elapsed wall-clock time in seconds.
    """
    started_at = time()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    w2v_pkl_fpath = w2v_fpath + ".pkl"

    if not exists(w2v_pkl_fpath):
        # First run for this file: parse the original format, then persist it.
        w2v = KeyedVectors.load_word2vec_format(
            w2v_fpath,
            binary=binary,
            unicode_errors='ignore',
        )
        # NOTE(review): presumably normalises the vectors in place -- confirm
        # against the installed gensim version.
        w2v.init_sims(replace=True)
        for probe_word in ("for", "для"):
            try_print(w2v, probe_word)
        w2v.save(w2v_pkl_fpath)
    else:
        w2v = KeyedVectors.load(w2v_pkl_fpath)

    elapsed = time() - started_at
    print(elapsed, "sec.")

    return w2v, w2v_pkl_fpath

# Load both word-vector models through load_and_pickle: the first file is read
# with the default binary=False, the second with binary=True.
# NOTE(review): both paths are absolute and machine-specific; the cell only runs
# where these files exist.
w2v_en, w2v_en_fpath = load_and_pickle("/home/panchenko/tmp/GoogleNews-vectors-negative300.txt")
w2v_ru, w2v_ru_fpath = load_and_pickle("/home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v", binary=True)


2018-06-07 13:33:57,332 : INFO : loading EuclideanKeyedVectors object from /home/panchenko/tmp/GoogleNews-vectors-negative300.txt.pkl
2018-06-07 13:34:04,914 : INFO : loading syn0 from /home/panchenko/tmp/GoogleNews-vectors-negative300.txt.pkl.syn0.npy with mmap=None
2018-06-07 13:34:06,914 : INFO : setting ignored attribute syn0norm to None
2018-06-07 13:34:06,915 : INFO : loaded /home/panchenko/tmp/GoogleNews-vectors-negative300.txt.pkl


9.583322048187256 

2018-06-07 13:34:07,744 : INFO : loading EuclideanKeyedVectors object from /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl
2018-06-07 13:34:25,904 : INFO : loading syn0 from /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl.syn0.npy with mmap=None
2018-06-07 13:34:34,336 : INFO : setting ignored attribute syn0norm to None
2018-06-07 13:34:34,338 : INFO : loaded /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl


sec.
26.596656799316406 sec.


In [27]:
# NOTE(review): the statement that would define `s2v` is commented out below, so
# this cell only works when `s2v` is already present in the kernel.
#s2v, s2v_pkl = load_and_pickle("/home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors")

# Print the nearest neighbours of the entry "sV43891#0" with their scores.
try_print(s2v, "sV43891#0")

sV44307#0 0.9392018914222717
sV42992#0 0.8116317987442017
sN28534#0 0.7958892583847046
sV48390#0 0.7906975746154785
sN32026#0 0.7818558216094971
sN41182#0 0.7586578130722046
sN20787#0 0.7583646774291992
sN30294#0 0.7494447231292725
sN36673#0 0.7471664547920227
sV46653#0 0.7462217807769775


In [4]:
# Display the "cluster" field of the first record stored under the id "hV44013".
# NOTE(review): `dsv` is not created in this cell; it must already exist in the kernel.
dsv.pcz.data["hV44013"][0]["cluster"]

Counter({'вредить#1': 0.501205,
         'говорить#1': 0.26525,
         'красить#2': 0.386612,
         'навредить#1': 0.501205,
         'опустить#1': 0.256847,
         'ругать#1': 0.253083})

In [1]:
import codecs
import operator
from multiprocessing import Pool
from vector_representations.dense_sense_vectors import DenseSenseVectors


def _write_relation(out, hypo_sense, hyper_sense):
    """Write one ``hypo_word<TAB>hyper_word`` row; return the number of rows written (0 or 1).

    A word is the part of a sense string before the first "#". Pairs whose two
    words are identical are skipped.
    """
    hypo_word = hypo_sense.split("#")[0]
    hyper_word = hyper_sense.split("#")[0]
    if hypo_word == hyper_word:
        return 0
    out.write("{}\t{}\n".format(hypo_word, hyper_word))
    return 1


def generate_binary_hypers(output_dir, max_synsets=1, hyper_synset_max_size=10, hc_max=0):
    """Write word-level hyponym/hypernym pairs derived from the global ``dsv`` model.

    For every id in ``dsv.pcz.data`` that contains "h", the entry with the same id
    but a leading "s" supplies the hyponym senses. Hypernym candidates come from:

    1. the "h" entry's own cluster, limited to the first ``hc_max`` senses, and
    2. the clusters of the ids containing "s" returned by
       ``dsv.sense_vectors.most_similar(h_id + "#0")``, limited by
       ``max_synsets`` and ``hyper_synset_max_size``.

    Each pair is written as ``hypo_word<TAB>hyper_word``; pairs with identical
    words are skipped. A trace of the processed entries is written to
    ``<output file>.log``.

    Args:
        output_dir: Prefix prepended verbatim to the output file name, so it
            must end with a path separator.
        max_synsets: Stop after this many neighbouring "s" entries were used.
        hyper_synset_max_size: Neighbouring entries whose cluster is larger
            than this are skipped.
        hc_max: Number of senses taken from the "h" entry's own cluster. ``0``
            takes none; a negative value never hits the limit and takes all.

    Returns:
        Tuple ``(bin_count, output_fpath)`` where ``bin_count`` is the number of
        rows written to ``output_fpath``.
    """
    output_fpath = output_dir + "vector-link-s%d-hmx%d-hc%d.csv" % (
        max_synsets, hyper_synset_max_size, hc_max)
    log_fpath = output_fpath + ".log"
    bin_count = 0

    # Context managers close both files even if a lookup below raises.
    with codecs.open(output_fpath, "w", "utf-8") as out, \
            codecs.open(log_fpath, "w", "utf-8") as log:
        for i, h_id in enumerate(dsv.pcz.data):
            if i % 10000 == 0:
                print(i)

            if "h" not in h_id:
                continue

            hypo_h_senses = dsv.pcz.data[h_id][0]["cluster"]
            scored_senses = sorted(
                hypo_h_senses.items(), key=operator.itemgetter(1), reverse=True)

            s_id = "s" + h_id[1:]
            hypo_senses = dsv.pcz.data[s_id][0]["cluster"]

            # NOTE(review): the id is written directly followed by the first
            # sense, with no separator; kept as-is in case the log is parsed.
            log.write("\n{}{}\n".format(h_id, ", ".join(hypo_h_senses)))
            log.write("{}\n".format(
                ", ".join(["{}:{}".format(k, v) for k, v in scored_senses])))
            log.write("{}{}\n".format(s_id, ", ".join(hypo_senses)))

            # Relations from the hierarchical context.
            # NOTE(review): the hc_max limit follows the cluster's own iteration
            # order, not the score-sorted order logged above -- confirm intent.
            for hypo_sense in hypo_senses:
                for hc_num, hyper_sense in enumerate(hypo_h_senses):
                    if hc_num == hc_max:
                        break
                    bin_count += _write_relation(out, hypo_sense, hyper_sense)

            # Relations from the most similar "s" entries.
            s_synsets = 0
            for rh_id, s in dsv.sense_vectors.most_similar(h_id + "#0"):
                if "s" not in rh_id:
                    continue

                hyper_senses = dsv.pcz.data[rh_id.split("#")[0]][0]["cluster"]
                if len(hyper_senses) > hyper_synset_max_size:
                    continue

                log.write("\t{}:{:.3f} {}\n".format(rh_id, s, ", ".join(hyper_senses)))

                for hypo_sense in hypo_senses:
                    for hyper_sense in hyper_senses:
                        bin_count += _write_relation(out, hypo_sense, hyper_sense)

                s_synsets += 1
                if s_synsets >= max_synsets:
                    break

    print("# binary relations:", bin_count)
    print("binary relations:", output_fpath)
    print("log of binary relations:", log_fpath)

    return (bin_count, output_fpath)
    

# Input PCZ file and the directory prefix under which generated files are written.
pcz_fpath = "/home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv"
output_dir = "/home/panchenko/tmp/vector-link/konvens/ru/"

# Construct the model only when no `dsv` exists yet, so re-running this cell in
# the same kernel reuses the existing object.
reload = "dsv" not in globals()

if reload:
    dsv = DenseSenseVectors(
        pcz_fpath=pcz_fpath,
        word_vectors_obj=None,
        save_pkl=True,
        sense_dim_num=1000,
        norm_type="sum",
        weight_type="score",
        max_cluster_words=20,
    )
 
# todo = []
# for max_top_synsets in range(1,10):
#     for max_hyper_synset_size in [3, 5, 10, 15, 20]:
#         for hc_max in [1, 2, 3, 0]: 
#             p = (output_dir, max_top_synsets, max_hyper_synset_size, hc_max)
#             todo.append(p)
  
# with terminating(Pool(32)) as pool:
#     for res in pool.imap_unordered(runp, todo):
#         print res
     
# for max_top_synsets in range(1,10):
#     for max_hyper_synset_size in [3, 5, 10, 15, 20]:
#         for hc_max in [1, 2, 3, 0]: 
#             print "="*50
#             print "max number of synsets:", max_top_synsets
#             print "max hyper synset size:", max_hyper_synset_size
#             print "hc_max:", hc_max
#             run(output_dir, max_top_synsets, max_hyper_synset_size, hc_max)

Loading spacy model...
Loaded 72143 words from: /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv.pkl
Loaded a pre-computed model from: /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors
Loaded model from: /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv


In [None]:
# Preview entries of dsv.pcz.data: for each id containing `hs_type` whose cluster
# has at least `min_size` items, print the entry, its "s" counterpart, and the
# nearest neighbours of "<id>#0". Stops once more than 100 entries were shown.
# Uses print() calls: the former print statements do not parse under Python 3.
hs_type = "h"
min_size = 3
n = 0
for hs_id in dsv.pcz.data:
    synset_len = len(dsv.pcz.data[hs_id][0]["cluster"])
    if synset_len >= min_size and hs_type in hs_id:
        # "\n" is concatenated so no separator space follows the blank line,
        # matching what the old print statement produced.
        print("\n" + hs_id, ", ".join(dsv.pcz.data[hs_id][0]["cluster"]))
        s_id = "s" + hs_id[1:]
        print(s_id, ", ".join(dsv.pcz.data[s_id][0]["cluster"]))
        for rhs_id, s in dsv.sense_vectors.most_similar(hs_id + "#0"):
            rhs_str = ", ".join(dsv.pcz.data[rhs_id.split("#")[0]][0]["cluster"])
            print("\t%s:%.3f %s" % (rhs_id, s, rhs_str))
        n += 1
    if n > 100:
        break

<vector_representations.dense_sense_vectors.DenseSenseVectors at 0x7f2210263358>