In [8]:
from gensim.models import KeyedVectors
import logging
from time import time
from os.path import exists


def try_print(w2v, test_word):
    try:
        for word, score in w2v.most_similar(test_word):
            print(word, score)
    except:
        print("Warning: word '{}' not found.".format(test_word))
        
    
def load_and_pickle(w2v_fpath, binary=False):
    tic = time()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    w2v_pkl_fpath = w2v_fpath + ".pkl"

    if exists(w2v_pkl_fpath):
        w2v = KeyedVectors.load(w2v_pkl_fpath)
    else:
        w2v = KeyedVectors.load_word2vec_format(w2v_fpath, binary=binary, unicode_errors='ignore')
        w2v.init_sims(replace=True)
        try_print(w2v, "for")
        try_print(w2v, "для")
        w2v.save(w2v_pkl_fpath)
    
    print(time()- tic, "sec.")

    return w2v, w2v_pkl_fpath

w2v_en_original_fpath = "/home/panchenko/tmp/GoogleNews-vectors-negative300.txt"
w2v_ru_original_fpath = "/home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v"

w2v_en, w2v_en_fpath = load_and_pickle(w2v_en_original_fpath)
w2v_ru, w2v_ru_fpath = load_and_pickle(w2v_ru_original_fpath, binary=True)

from glob import glob 
from vector_representations.build_sense_vectors import run

for lang in ["ru", "en"]:
    sensegram_fpaths = "/home/panchenko/tmp/vector-link/konvens/{}/*-sensegram.tsv".format(lang)
    w2v_fpath = w2v_ru_original_fpath if "ru" else w2v_en_original_fpath 

    for inventory_fpath in glob(sensegram_fpaths):
        run(inventory_fpath, w2v_fpath) 

In [7]:
pcz_fpath="/home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv"
run(pcz_fpath, w2v_ru_original_fpath)

2018-06-08 23:18:14,671 : INFO : loading EuclideanKeyedVectors object from /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl


Input PCZ: /home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv
Input word vectors: /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v
Sparse: False
Type of vector normalization: sum
Weight type: score
Max. number of cluster words to use: 20
Sense dim. number (sparse only): 1000
Save pickle (sparse only): False


2018-06-08 23:18:31,721 : INFO : loading syn0 from /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl.syn0.npy with mmap=None
2018-06-08 23:18:41,262 : INFO : setting ignored attribute syn0norm to None
2018-06-08 23:18:41,263 : INFO : loaded /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl
2018-06-08 23:18:41,264 : INFO : precomputing L2-norms of word weight vectors


0 (0) senses loaded of 60088
25000 (25000) senses loaded of 60088
50000 (50000) senses loaded of 60088
0 cluster errors
60088 senses loaded out of 60088
60088 words loaded
Pickled sense clusters: /home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.pkl
Cannot load a pre-computed model from: /home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors
No pre-calculated model found at: /home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors
Building a new model from: /home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv

























































10000 senses processed













































20000 senses processed










































30000 senses processed








































40000 senses processed



































50000 senses processed







































2018-06-08 23:20:55,510 : INFO : storing 60088x500 projection weights into /home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors


60000 senses processed


Sense vectors: /home/panchenko/tmp/vector-link/konvens/ru/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors
Created 60088 sense vectors


In [4]:
dsv.pcz.data["hV44013"][0]["cluster"]

Counter({'вредить#1': 0.501205,
         'говорить#1': 0.26525,
         'красить#2': 0.386612,
         'навредить#1': 0.501205,
         'опустить#1': 0.256847,
         'ругать#1': 0.253083})

In [None]:
import codecs
import operator
from multiprocessing import Pool
from vector_representations.dense_sense_vectors import DenseSenseVectors
from traceback import format_exc
from glob import glob 


def generate_binary_hypers(output_dir, max_synsets=1, hyper_synset_max_size=10, hc_max=0):
    output_fpath = output_dir + ".vector-link-s%d-hmx%d-hc%d.csv" % (
        max_synsets, hyper_synset_max_size, hc_max)  
    bin_count = 0
    
    out = codecs.open(output_fpath, "w", "utf-8")
    log = codecs.open(output_fpath + ".log", "w", "utf-8")
    
    for i, h_id in enumerate(dsv.pcz.data):
        try:
            if i % 10000 == 0: print(i)

            if "h" in h_id:
                hypo_h_senses = dsv.pcz.data[h_id][0]["cluster"]
                tmp = sorted(dsv.pcz.data[h_id][0]["cluster"].items(), key=operator.itemgetter(1), reverse=True)

                s_id = "s" + h_id[1:]
                hypo_senses = dsv.pcz.data[s_id][0]["cluster"]
                log.write("\n{}\t{}\n".format(
                    h_id, ", ".join(hypo_h_senses)
                ))
                log.write("{}\n".format(
                    ", ".join(["{}:{}".format(k,v) for k,v in tmp])
                ))
                log.write("{}\t{}\n".format(
                    s_id, ", ".join(hypo_senses)
                ))

                # save relations from the hierarchical context 
                for hypo_sense in hypo_senses:
                    for hc_num, hyper_sense in enumerate(hypo_h_senses):
                        if hc_num == hc_max: break
                        hypo_word = hypo_sense.split("#")[0]
                        hyper_word = hyper_sense.split("#")[0]
                        if hypo_word != hyper_word:
                            out.write("{}\t{}\tfrom-original-labels\n".format(hypo_word, hyper_word))
                    bin_count += 1

                # save binary relations from a synset
                s_synsets = 0
                for rh_id, s in dsv.sense_vectors.most_similar(h_id + "#0"):
                    if "s" in rh_id:
                        hyper_senses = dsv.pcz.data[rh_id.split("#")[0]][0]["cluster"]
                        if len(hyper_senses) > hyper_synset_max_size: continue

                        rh_str = ", ".join(hyper_senses)
                        log.write("\t{}:{:.3f} {}\n".format(rh_id, s, rh_str))

                        for hypo_sense in hypo_senses:
                            for hyper_sense in hyper_senses:
                                hypo_word = hypo_sense.split("#")[0]
                                hyper_word = hyper_sense.split("#")[0]
                                if hypo_word != hyper_word:
                                    out.write("{}\t{}\tfrom-vector-linkage\n".format(hypo_word, hyper_word))
                                bin_count += 1
                        s_synsets += 1

                        if s_synsets >= max_synsets: break
        except KeyboardInterrupt:
            break
        except:
            print("Error", i, h_id)
            print(format_exc())
    out.close()
    log.close()
    
    print("# binary relations:", bin_count)
    print("binary relations:", output_fpath)
    print("log of binary relations:", output_fpath + ".log")
    
    return bin_count, output_fpath
    


pcz_fpath = "/home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv"
pcz_fpath = "/home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv"

for pcz_fpath in glob("/home/panchenko/tmp/vector-link/konvens/ru2/*tsv"):
    print(pcz_fpath)
    reload = True
    try: dsv
    except NameError: reload = True

    if reload:
        dsv = DenseSenseVectors(
            pcz_fpath=pcz_fpath,
            word_vectors_obj=None,
            save_pkl=True,
            sense_dim_num=1000,
            norm_type="sum",
            weight_type="score",
            max_cluster_words=20)

    for max_top_synsets in [1, 2, 3]:
        for max_hyper_synset_size in [3, 5, 10, 20]:
            for hc_max in [1, 3, 5]: 
                print("="*50)
                print("max number of synsets:", max_top_synsets)
                print("max hyper synset size:", max_hyper_synset_size)
                print("hc_max:", hc_max)
                generate_binary_hypers(pcz_fpath, max_top_synsets, max_hyper_synset_size, hc_max)
                        

/home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv
Loaded 72143 words from: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.pkl
Loaded a pre-computed model from: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors
Loaded model from: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv
max number of synsets: 1
max hyper synset size: 3
hc_max: 1
0


  self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)


10000
20000
30000
40000
50000
60000
70000
# binary relations: 161699
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc1.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc1.csv.log
max number of synsets: 1
max hyper synset size: 3
hc_max: 3
0
10000
20000
30000
40000
50000
60000
70000
# binary relations: 161692
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc3.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc3.csv.log
max number of synsets: 1
max hyper synset size: 3
hc_max: 5
0
10000
20000
30000
40000
50000
60000
70000
# binary relations: 161681
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc5.csv
log of binary relations: /home/panchenko/tmp/ve

10000
20000
30000
40000
50000
60000
70000
# binary relations: 421267
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc3.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc3.csv.log
max number of synsets: 2
max hyper synset size: 10
hc_max: 5
0
10000
20000
30000
40000
50000
60000
70000
# binary relations: 421270
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc5.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc5.csv.log
max number of synsets: 2
max hyper synset size: 20
hc_max: 1
0
10000
20000
30000
40000
50000
60000
70000
# binary relations: 492090
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-joint-tfidf-sensegram.tsv.vector-link-s2-hmx20-hc1.csv
log of binary relations: /home/panchenko

10000
20000
30000
40000
50000
60000
# binary relations: 25507
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc3.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc3.csv.log
max number of synsets: 1
max hyper synset size: 3
hc_max: 5
0
10000
20000
30000
40000
50000
60000
# binary relations: 25507
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc5.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc5.csv.log
max number of synsets: 1
max hyper synset size: 5
hc_max: 1
0
10000
20000
30000
40000
50000
60000
# binary relations: 27246
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patter

10000
20000
30000
40000
50000
60000
# binary relations: 33876
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc3.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc3.csv.log
max number of synsets: 2
max hyper synset size: 10
hc_max: 5
0
10000
20000
30000
40000
50000
60000
# binary relations: 33876
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc5.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-patterns-limit-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc5.csv.log
max number of synsets: 2
max hyper synset size: 20
hc_max: 1
0
10000
20000
30000
40000
50000
60000
# binary relations: 36598
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/watset-cw-nolog-mcl-

10000
20000
30000
40000
50000
60000
# binary relations: 91289
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc1.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc1.csv.log
max number of synsets: 1
max hyper synset size: 3
hc_max: 3
0
10000
20000
30000
40000
50000
60000
# binary relations: 91289
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc3.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc3.csv.log
max number of synsets: 1
max hyper synset size: 3
hc_max: 5
0
10000
20000
30000
40000
50000
60000
# binary relations: 91289
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s1-hmx3-hc5.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-t

10000
20000
30000
40000
50000
60000
# binary relations: 199822
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc5.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s2-hmx10-hc5.csv.log
max number of synsets: 2
max hyper synset size: 20
hc_max: 1
0
10000
20000
30000
40000
50000
60000
# binary relations: 218372
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s2-hmx20-hc1.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s2-hmx20-hc1.csv.log
max number of synsets: 2
max hyper synset size: 20
hc_max: 3
0
10000
20000
30000
40000
50000
60000
# binary relations: 218372
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/rwn-mas-tfidf-sensegram.tsv.vector-link-s2-hmx20-hc3.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2

10000
20000
30000
40000
50000
# binary relations: 310784
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/ruthes-joint-tfidf-sensegram.tsv.vector-link-s1-hmx5-hc1.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/ruthes-joint-tfidf-sensegram.tsv.vector-link-s1-hmx5-hc1.csv.log
max number of synsets: 1
max hyper synset size: 5
hc_max: 3
0
10000
20000
30000
40000
50000
# binary relations: 310784
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/ruthes-joint-tfidf-sensegram.tsv.vector-link-s1-hmx5-hc3.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/ruthes-joint-tfidf-sensegram.tsv.vector-link-s1-hmx5-hc3.csv.log
max number of synsets: 1
max hyper synset size: 5
hc_max: 5
0
10000
20000
30000
40000
50000
# binary relations: 310784
binary relations: /home/panchenko/tmp/vector-link/konvens/ru2/ruthes-joint-tfidf-sensegram.tsv.vector-link-s1-hmx5-hc5.csv
log of binary relations: /home/panchenko/tmp/vector-link/konvens/ru2

In [5]:
len(dsv.pcz.data)

60088

In [None]:
hs_type = "h"
min_size = 3
n = 0
for i, hs_id in enumerate(dsv.pcz.data):
    synset_len = len(dsv.pcz.data[hs_id][0]["cluster"])
    if synset_len >= min_size and hs_type in hs_id:
        print "\n", hs_id, ", ".join(dsv.pcz.data[hs_id][0]["cluster"])
        s_id = "s" + hs_id[1:]
        print s_id, ", ".join(dsv.pcz.data[s_id][0]["cluster"])
        for rhs_id, s in dsv.sense_vectors.most_similar(hs_id + "#0"):
            rhs_str = ", ".join(dsv.pcz.data[rhs_id.split("#")[0]][0]["cluster"])
            print "\t%s:%.3f %s" % (rhs_id, s, rhs_str)
        n += 1
    if n > 100: break