In [1]:
import logging
import time
from datetime import datetime
import os
import ast
import codecs
import json
import collections
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
import random
import fasttext
import string
from gensim.models import KeyedVectors
from tqdm import tqdm_notebook as tqdm
from ruwordnet import RuWordNet

# Log record layout: timestamp, level name, then the message.
FORMAT = "%(asctime)s %(levelname)s: %(message)s"
# Configure the root logger at DEBUG level, so DEBUG records are shown in cell output too.
logging.basicConfig(level=logging.DEBUG,format=FORMAT)

from utils import save_to_file, PUNCT_SYMBOLS, PREPOSITIONS, RESTRICT_TAGS, \
                  get_top, get_top_hyperomyns_counter, get_top_hyperomyns_counter_v, \
                  read_train_dataset, read_train_dataset, read_gold_dataset, \
                  normalize_ma, get_synset_str_and_vector, get_synset_words

In [2]:
import pymorphy2
morph_analyzer = pymorphy2.MorphAnalyzer()

2020-03-23 20:51:48,214 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:51:48,269 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


### GRID PROCESS

In [3]:
# Run tag: becomes part of the results directory name and of every output file prefix.
common_prefix = 'lower_true'

In [4]:
# Directory that receives the result files of this run (passed as out_dir to process_words).
results_dir = 'results/res_'+common_prefix

In [5]:
! mkdir -p {results_dir}

In [6]:
def params_grid():
    """Yield every hyper-parameter combination to evaluate.

    Each yielded item is a dict built by ``ParameterGrid`` from the axes
    below. It holds the hypernym-search settings (``p1``, ``p2``, ``p3``,
    ``k``, ``topn``, ``account_gold``), the normalization function
    (``normalize_func``) and the settings meant for that function
    (``lower``, ``sort``, ``unique``, ``punct_symbols``, ``ma``, ...).

    Every axis currently holds a single value, so exactly one combination
    is produced; add values to the lists to widen the search.
    """
    # Punctuation to strip: the shared defaults plus special symbols from ruwordnet.
    # A new set is built, so the imported PUNCT_SYMBOLS itself is never mutated.
    punct_symbols = set(PUNCT_SYMBOLS) | {'—', '«', '»', '·', '\xad', '\xa0', '°', '–', '§'}

    all_params = {
        # Search settings. Wider candidate grids, kept for reference:
        #   p1 / p2 / p3: [-1.0, 0.0, 0.1, 0.5, 1.0, 2.0] or [0.0, 0.1, 1.0]
        #   k / topn:     [1, 3, 5, 7, 10, 15, 20]
        'p1': [0.1],
        'p2': [1.0],
        'p3': [1.0],
        'k': [10],
        'topn': [10],
        'account_gold': [False],

        'normalize_func': [normalize_ma],

        # Settings for the normalization function.
        'out_of_vocab2synonym': [None],
        'sort': [False],
        'unique': [False],
        'lower': [True],
        'min_word_len': [1],
        'punct_symbols': [punct_symbols],
        'prepositions': [PREPOSITIONS],
        'restrict_tags': [RESTRICT_TAGS],
        # Candidate tag list for verbs, kept for reference:
        #   ['VERB', 'INFN', 'GRND', 'NOUN']
        'accept_tags': [None],
        'ma': [morph_analyzer],
    }

    yield from ParameterGrid(all_params)
        
# Number of parameter combinations in the grid.
sum(1 for _ in tqdm(params_grid()))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




1

In [7]:
# Keep the first combination of the grid as the notebook-level `params`;
# the exploratory cells and prepare_elmo_data below read it.
params = list(params_grid())[0]
params

{'accept_tags': None,
 'account_gold': False,
 'k': 10,
 'lower': True,
 'ma': <pymorphy2.analyzer.MorphAnalyzer at 0x7f7ebcb47710>,
 'min_word_len': 1,
 'normalize_func': <function utils.normalize_ma(sentence, out_of_vocab2synonym=None, sort=False, unique=False, punct_symbols=None, lower=True, ma=None, restrict_tags=None, accept_tags=None, min_word_len=0, *args, **kwargs)>,
 'out_of_vocab2synonym': None,
 'p1': 0.1,
 'p2': 1.0,
 'p3': 1.0,
 'prepositions': {'в',
  'да',
  'для',
  'до',
  'и',
  'из-за',
  'или',
  'к',
  'как',
  'ли',
  'либо',
  'на',
  'не',
  'но',
  'о',
  'от',
  'перед',
  'по',
  'под',
  'при',
  'с',
  'так',
  'то',
  'только'},
 'punct_symbols': {'!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  '{',
  '|',
  '}',
  '~',
  '\xa0',
  '§',
  '«',
  '\xad',
  '°',
  '·',
  '»',
  '–',
  '—'},
 'restrict_tags': {'C

In [8]:
### LOADING DATA

In [9]:
%%time
ruwordnet = RuWordNet('data/ruwordnet')

2020-03-23 20:51:48,431 DEBUG: Trying to load tree from data/ruwordnet/composed_of.xml
2020-03-23 20:51:49,264 DEBUG: Trying to load tree from data/ruwordnet/senses.A.xml
2020-03-23 20:51:49,377 DEBUG: Trying to load tree from data/ruwordnet/senses.N.xml
2020-03-23 20:51:49,884 DEBUG: Trying to load tree from data/ruwordnet/senses.V.xml
2020-03-23 20:51:50,095 DEBUG: Trying to load tree from data/ruwordnet/synsets.A.xml
2020-03-23 20:51:50,233 DEBUG: Trying to load tree from data/ruwordnet/synsets.N.xml
2020-03-23 20:51:50,460 DEBUG: Trying to load tree from data/ruwordnet/synsets.V.xml
2020-03-23 20:51:50,527 DEBUG: Trying to load tree from data/ruwordnet/synset_relations.A.xml
2020-03-23 20:51:50,690 DEBUG: Trying to load tree from data/ruwordnet/synset_relations.N.xml
2020-03-23 20:51:50,985 DEBUG: Trying to load tree from data/ruwordnet/synset_relations.V.xml
2020-03-23 20:51:51,914 INFO: RuWordNet loaded from data/ruwordnet


CPU times: user 2.76 s, sys: 237 ms, total: 2.99 s
Wall time: 3.48 s


In [10]:
%%time
ft_model_file='fasttext/cc.ru.300.bin'
ft_model = fasttext.load_model(ft_model_file)

CPU times: user 2.94 s, sys: 7.65 s, total: 10.6 s
Wall time: 36 s




In [11]:
%%time
tayga_none_fasttextcbow_300_10_2019='data/rusvectores_models/tayga_none_fasttextcbow_300_10_2019/model.model'
model_tayga = KeyedVectors.load(tayga_none_fasttextcbow_300_10_2019)

2020-03-23 20:52:27,977 INFO: loading Word2VecKeyedVectors object from data/rusvectores_models/tayga_none_fasttextcbow_300_10_2019/model.model
2020-03-23 20:52:27,978 DEBUG: {'uri': 'data/rusvectores_models/tayga_none_fasttextcbow_300_10_2019/model.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-03-23 20:52:28,141 INFO: loading vectors from data/rusvectores_models/tayga_none_fasttextcbow_300_10_2019/model.model.vectors.npy with mmap=None
2020-03-23 20:52:28,180 INFO: loading vectors_ngrams from data/rusvectores_models/tayga_none_fasttextcbow_300_10_2019/model.model.vectors_ngrams.npy with mmap=None
2020-03-23 20:52:28,563 INFO: loading vectors_vocab from data/rusvectores_models/tayga_none_fasttextcbow_300_10_2019/model.model.vectors_vocab.npy with mmap=None
2020-03-23 20:52:28,599 INFO: setting ignored attribute vectors_ngrams_norm to None
2020-03-23 20:52:28,6

CPU times: user 136 ms, sys: 494 ms, total: 630 ms
Wall time: 628 ms


In [12]:
%%time
araneum_none_fasttextcbow_300_5_2018 = 'data/rusvectores_models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model'
model_araneum = KeyedVectors.load(araneum_none_fasttextcbow_300_5_2018)

2020-03-23 20:52:28,609 INFO: loading Word2VecKeyedVectors object from data/rusvectores_models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model
2020-03-23 20:52:28,610 DEBUG: {'uri': 'data/rusvectores_models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-03-23 20:52:29,046 INFO: loading vectors from data/rusvectores_models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model.vectors.npy with mmap=None
2020-03-23 20:52:29,087 INFO: loading vectors_ngrams from data/rusvectores_models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model.vectors_ngrams.npy with mmap=None
2020-03-23 20:52:29,497 INFO: loading vectors_vocab from data/rusvectores_models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttext

CPU times: user 401 ms, sys: 506 ms, total: 907 ms
Wall time: 931 ms


In [13]:
# Embedding models to evaluate; each key becomes part of the output file prefix.
modelname2model = {'ft.cc.ru.300': ft_model,
                   'tayga_none_fasttextcbow_300_10_2019': model_tayga,
                   'araneum_none_fasttextcbow_300_5_2018': model_araneum,
                  }

In [14]:
%%time
# Sanity check on one verb synset with the fastText model: build its word list
# and vector with empty normalization params, then display the words.
words, vector = get_synset_str_and_vector(ruwordnet, 
                                          '116365-V', 
                                          dict(), 
                                          {'ruthes_name': True, 
                                           'senses_names': True}, 
                                          norm_function=normalize_ma, 
                                          model=ft_model)
words

2020-03-23 20:52:29,555 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,573 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2020-03-23 20:52:29,574 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,591 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2020-03-23 20:52:29,592 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,610 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2020-03-23 20:52:29,611 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,631 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2020-03-23 20:52:29,632 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,651 INFO: fo

CPU times: user 219 ms, sys: 52.6 ms, total: 271 ms
Wall time: 264 ms


['будить мысль',
 'чувство',
 'зажечь',
 'зажигать',
 'разжигать',
 'возжечь',
 'разжечь',
 'будить',
 'пробудить',
 'разжигаться',
 'пробуждать',
 'вдохнуть',
 'возжигать']

In [15]:
vector.shape

(300,)

In [16]:
# Same sanity check for one noun synset, this time with the Tayga model.
words, vector = get_synset_str_and_vector(
    ruwordnet,
    '9577-N',
    {},
    {'ruthes_name': True, 'senses_names': True},
    norm_function=normalize_ma,
    model=model_tayga,
)
words

2020-03-23 20:52:29,829 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,849 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2020-03-23 20:52:29,851 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,871 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2020-03-23 20:52:29,872 INFO: Loading dictionaries from /usr/local/lib/python3.7/dist-packages/pymorphy2_dicts/data
2020-03-23 20:52:29,892 INFO: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


['конькобежный спорт', 'конькобежный спорт', 'скоростной бег коньках']

In [17]:
%%time

# Part-of-speech toggle: the last assignment wins (currently verbs).
nouns=True  # NOUNS
nouns=False # VERBS

# Search settings of the selected grid point.
p1, p2, p3 = params['p1'], params['p2'], params['p3']
k, topn = params['k'], params['topn']
account_gold = params['account_gold']
norm_function = params['normalize_func']

pos = "V" if not nouns else "N"

# Names of the grid entries that are handed to the normalization function.
norm_fn_params_list = {
    'out_of_vocab2synonym', 'sort', 'unique', 'lower', 'min_word_len',
    'punct_symbols', 'prepositions', 'restrict_tags', 'accept_tags', 'ma',
}
norm_fn_params = {name: params[name] for name in norm_fn_params_list & params.keys()}

# Nouns use only the RuThes name and sense names; verbs also use the
# definition, the sense lemmas and the main words of the senses.
make_sentences_params = {
    'ruthes_name': True,
    'definition': not nouns,
    'senses_names': True,
    'senses_lemmas': not nouns,
    'senses_main_word': not nouns,
    'sep': ' ',
}

# Index every synset of the chosen part of speech by its joined word string.
synsetstr2id, synsetstr2vector = {}, {}
for synset in ruwordnet.synsets_list:
    if synset['part_of_speech'] == pos:
        synset_words, vector = get_synset_str_and_vector(
            ruwordnet,
            synset['id'],
            norm_params=norm_fn_params,
            make_sent_params=make_sentences_params,
            norm_function=norm_function,
            model=ft_model,
        )
        synsetstr = ' '.join(synset_words)
        if synsetstr in synsetstr2id:
            # Two synsets collapsed to the same string; the later one overwrites the earlier.
            logging.error(f"Duplicate synset_str.{synset['id'],synsetstr2id[synsetstr]}:'{synsetstr}'")
        synsetstr2id[synsetstr] = synset['id']
        synsetstr2vector[synsetstr] = vector

CPU times: user 12.4 s, sys: 0 ns, total: 12.4 s
Wall time: 12.4 s


In [18]:
synsetstr2id

{'ливень очень сильный проливной дождь дождь льет дождь лить лить лить ведра лить ведро лить льет дождь лить дождь лить': '4223-V',
 'быть известным слыть быть известным качестве считаться считаться считаться слыть слыть пользоваться известностью пользоваться известность пользоваться прослыть прослыть': '129120-V',
 'водоснабжение снабжать снабдить водой снабжать водой снабжать вода снабжать снабдить водой снабдить вода снабдить подать воду подать вода подать обеспечивать водой обеспечивать вода обеспечивать обеспечить водой обеспечить вода обеспечить подавать воду подавать вода подавать': '489-V',
 'сползти спуститься ползком спуститься ползком спуститься ползком спуститься сползать сползать сползти сползти': '114961-V',
 'печатать воспроизводить напечатать напечатать пропечатывать пропечатывать печатать печатать отпечатать отпечатать отпечатывать отпечатывать допечатывать допечатывать пропечатать пропечатать допечатать допечатать печататься печататься': '114108-V',
 'выложить открове

In [19]:
make_sentences_params

{'ruthes_name': True,
 'definition': True,
 'senses_names': True,
 'senses_lemmas': True,
 'senses_main_word': True,
 'sep': ' '}

In [20]:
# Inspect the normalized words of one noun synset.
# NOTE(review): when cells run in order, make_sentences_params still holds the verb
# settings (definition/lemmas/main word enabled) from the cell that set nouns=False.
get_synset_words(ruwordnet, '124987-N',
                       norm_params=norm_fn_params,
                       make_sent_params=make_sentences_params,
                       norm_function=normalize_ma,
                      )

['однокашник',
 'товарищ учебе',
 'товарищ обучению',
 'воспитанию',
 'соученица',
 'соученица',
 'товарищ учебе',
 'товарищ учеба',
 'товарищ',
 'однокашница',
 'однокашница',
 'соученик',
 'соученик',
 'однокашник',
 'однокашник']

In [21]:
# Inspect another noun synset with the same notebook-level
# norm_fn_params / make_sentences_params.
get_synset_words(ruwordnet, '103748-N',
                 norm_params=norm_fn_params,
                 make_sent_params=make_sentences_params,
                 norm_function=normalize_ma,
                )

['гамбург', 'гамбург', 'гамбург']

In [22]:
# Inspect a noun synset that has many senses, using the same notebook-level
# norm_fn_params / make_sentences_params.
get_synset_words(ruwordnet, '2642-N',
                 norm_params=norm_fn_params,
                 make_sent_params=make_sentences_params,
                 norm_function=normalize_ma,
                )

['санкт петербург',
 'санкт петербург',
 'санкт петербург',
 'северная столица',
 'северный столица',
 'столица',
 'ленинград',
 'ленинград',
 'питер',
 'питер',
 'петроград',
 'петроград',
 'северная пальмира',
 'северный пальмира',
 'пальмира',
 'спб',
 'спб',
 'петербург',
 'петербург']

### PROCESS WORDS FOR TEST SETS

In [23]:
def _make_sentences_params(nouns):
    """Return the flags selecting which synset fields make up its sentence.

    Nouns use only the RuThes name and the sense names; verbs additionally
    use the definition, the sense lemmas and the main words of the senses.
    """
    verbs = not nouns
    return {'ruthes_name': True,
            'definition': verbs,
            'senses_names': True,
            'senses_lemmas': verbs,
            'senses_main_word': verbs,
            'sep': ' '
           }


def _select_norm_fn_params(params):
    """Pick from ``params`` the entries that are handed to the normalization function."""
    norm_fn_param_names = ('out_of_vocab2synonym', 'sort', 'unique', 'lower',
                           'min_word_len', 'punct_symbols', 'prepositions',
                           'restrict_tags', 'accept_tags', 'ma')
    return {name: params[name] for name in norm_fn_param_names if name in params}


def _build_synset_index(pos, norm_fn_params, make_sentences_params, norm_function, model):
    """Index every synset of part of speech ``pos`` by its joined word string.

    Returns ``(synsetstr2id, synsetstr2vector)``. When two synsets produce the
    same string an error is logged and the later synset replaces the earlier one.
    """
    synsetstr2id = dict()
    synsetstr2vector = dict()
    for synset in ruwordnet.synsets_list:
        if synset['part_of_speech'] != pos:
            continue

        synset_words, vector = get_synset_str_and_vector(ruwordnet, synset['id'],
                                                         norm_params=norm_fn_params,
                                                         make_sent_params=make_sentences_params,
                                                         norm_function=norm_function,
                                                         model=model)
        synsetstr = ' '.join(synset_words)
        if synsetstr in synsetstr2id:
            logging.error(f"Duplicate synset_str.{synset['id'],synsetstr2id[synsetstr]}:'{synsetstr}'")
        synsetstr2id[synsetstr] = synset['id']
        synsetstr2vector[synsetstr] = vector
    return synsetstr2id, synsetstr2vector


def process_words(words,
                  params_grid,
                  prefix,
                  nouns,
                  model,
                  algo='default',
                  out_dir='/tmp'):
    """Rank candidate hypernym synsets for ``words`` and save them, once per grid point.

    For every parameter combination yielded by ``params_grid()`` the synsets of
    the selected part of speech are vectorized with ``model``, each word is
    passed to ``get_top_hyperomyns_counter`` and its ``topn`` highest-rated
    results are kept. Two files are written to ``out_dir``: ``<name>.json`` with
    the stringified parameters and ``<name>.tsv`` with the results, where
    ``<name>`` is built from ``prefix``, the search settings and a timestamp.

    Args:
        words: iterable of query words.
        params_grid: zero-argument callable returning an iterable of parameter dicts.
        prefix: leading part of the output file names.
        nouns: truthy to work on noun synsets ("N"), falsy for verb synsets ("V").
        model: embedding model handed to the vectorizing and ranking helpers.
        algo: currently unused; kept for interface compatibility.
        out_dir: directory for the output files; created when missing.

    Raises:
        ValueError: if ruwordnet has no synsets of the requested part of speech.
    """
    pos = "N" if nouns else "V"
    make_sentences_params = _make_sentences_params(nouns)
    gold_path = ('data/training_data/synsets_nouns.tsv' if nouns
                 else 'data/training_data/synsets_verbs.tsv')

    # Create the output directory up front so a long run cannot fail at save time.
    os.makedirs(out_dir, exist_ok=True)

    for params in tqdm(params_grid()):
        start = time.time()
        p1, p2, p3 = params['p1'], params['p2'], params['p3']
        k, topn = params['k'], params['topn']
        account_gold = params['account_gold']
        norm_function = params['normalize_func']
        norm_fn_params = _select_norm_fn_params(params)

        # Read inside the loop so every grid point gets its own fresh gold mapping.
        gold_synsetid2parents = read_gold_dataset(gold_path)

        synsetstr2id, synsetstr2vector = _build_synset_index(pos,
                                                             norm_fn_params,
                                                             make_sentences_params,
                                                             norm_function,
                                                             model)
        if not synsetstr2vector:
            raise ValueError(f"No synsets with part_of_speech '{pos}' found in ruwordnet.")

        # Stack the synset vectors in dict order: row i belongs to the i-th synset string.
        first_vector = next(iter(synsetstr2vector.values()))
        ruwordnet_matrix = np.zeros((len(synsetstr2vector), first_vector.shape[0]),
                                    first_vector.dtype)
        for i, v in enumerate(synsetstr2vector.values()):
            ruwordnet_matrix[i] = v

        result = collections.defaultdict(list)
        for w in words:
            hypernyms = get_top_hyperomyns_counter(w,
                                                   k=k,
                                                   p1=p1, p2=p2, p3=p3,
                                                   account_gold=account_gold,
                                                   ruwordnet_matrix=ruwordnet_matrix,
                                                   gold_synsetid2parents=gold_synsetid2parents,
                                                   synsetstr2id=synsetstr2id,
                                                   synsetstr2vector=synsetstr2vector,
                                                   model=model,
                                                   ruwordnet=ruwordnet
                                                  )
            # Keep the topn highest-rated candidates, best first.
            result[w] = [h for h, _ in hypernyms.most_common()][:topn]

        # Timestamp without spaces or colons so it is safe inside a file name.
        curr_time = '_'.join(str(datetime.now()).split()).replace(':', '')
        out_file = f'{prefix}_{k}_{topn}_{p1}_{p2}_{p3}_{account_gold}_{curr_time}'
        out_file = os.path.join(out_dir, out_file)

        # Parameters hold non-JSON objects (functions, sets), so they are stored as strings.
        with open(out_file + '.json', 'w', encoding='utf-8') as of_json:
            params_out = {name: str(value) for name, value in params.items()}
            json.dump(params_out, fp=of_json, indent=4)

        save_to_file(result, out_file + '.tsv',
                     ruwordnet)

        end = time.time()
        logging.info(f"Saved {out_file}.")
        logging.info(f"Spent time: {end - start} secs ({(end - start)/60.} minutes).")

In [24]:
public_words = list()
private_words = list()

def _read_lowercased_lines(path):
    """Return every line of ``path``, stripped of surrounding whitespace and lower-cased."""
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8 everywhere,
    # and the word lists are expected to contain Cyrillic text.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip().lower() for line in f]


def load_words(nouns):
    """Load the public and private test words of one part of speech.

    Reads ``data/public_test/<pos>_public.tsv`` and
    ``data/private_test/<pos>_private.tsv``, where ``<pos>`` is ``nouns`` when
    ``nouns`` is truthy and ``verbs`` otherwise, and stores the lines (stripped
    and lower-cased) in the module-level ``public_words`` and ``private_words``.

    Args:
        nouns: truthy to load the noun lists, falsy to load the verb lists.
    """
    global public_words
    global private_words
    pos_name = 'nouns' if nouns else 'verbs'
    public_words = _read_lowercased_lines(f'data/public_test/{pos_name}_public.tsv')
    private_words = _read_lowercased_lines(f'data/private_test/{pos_name}_private.tsv')

### PUBLIC NOUNS

In [25]:
# Public noun test set: one run per embedding model.
nouns = True
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(words=public_words,
                  params_grid=params_grid,
                  prefix='_'.join(['public_nouns', common_prefix, model_name]),
                  nouns=nouns,
                  model=model,
                  out_dir=results_dir)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2020-03-23 20:53:44,066 INFO: Saved results/res_lower_true/public_nouns_lower_true_ft.cc.ru.300_10_10_0.1_1.0_1.0_False_2020-03-23_205343.984475.
2020-03-23 20:53:44,067 INFO: Spent time: 61.66870307922363 secs (1.0278117179870605 minutes).





HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2020-03-23 20:54:47,453 INFO: Saved results/res_lower_true/public_nouns_lower_true_tayga_none_fasttextcbow_300_10_2019_10_10_0.1_1.0_1.0_False_2020-03-23_205447.372180.
2020-03-23 20:54:47,454 INFO: Spent time: 63.36373972892761 secs (1.0560623288154602 minutes).





HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2020-03-23 20:55:51,139 INFO: Saved results/res_lower_true/public_nouns_lower_true_araneum_none_fasttextcbow_300_5_2018_10_10_0.1_1.0_1.0_False_2020-03-23_205551.057730.
2020-03-23 20:55:51,140 INFO: Spent time: 63.661333084106445 secs (1.0610222180684408 minutes).





### PRIVATE NOUNS

In [None]:
# Private noun test set: one run per embedding model.
nouns = True
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(words=private_words,
                  params_grid=params_grid,
                  prefix='_'.join(['private_nouns', common_prefix, model_name]),
                  nouns=nouns,
                  model=model,
                  out_dir=results_dir)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2020-03-23 20:57:33,267 INFO: Saved results/res_lower_true/private_nouns_lower_true_ft.cc.ru.300_10_10_0.1_1.0_1.0_False_2020-03-23_205733.154668.
2020-03-23 20:57:33,268 INFO: Spent time: 102.0984275341034 secs (1.7016404589017233 minutes).





HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

2020-03-23 20:59:16,622 INFO: Saved results/res_lower_true/private_nouns_lower_true_tayga_none_fasttextcbow_300_10_2019_10_10_0.1_1.0_1.0_False_2020-03-23_205916.509092.
2020-03-23 20:59:16,622 INFO: Spent time: 103.32964515686035 secs (1.7221607526143392 minutes).





HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

### PUBLIC VERBS

In [None]:
# Public verb test set: one run per embedding model.
nouns = False
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(words=public_words,
                  params_grid=params_grid,
                  prefix='_'.join(['public_verbs', common_prefix, model_name]),
                  nouns=nouns,
                  model=model,
                  out_dir=results_dir)

### PRIVATE VERBS

In [None]:
nouns = False 
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(private_words,
                  params_grid,
                  '_'.join(('private_verbs',common_prefix, model_name)),
                  nouns,
                  model=model,
                  out_dir=results_dir
                 )

# ELMO

In [None]:
params

In [None]:
# Working directory for the ELMo input/output files of this run (tagged with common_prefix).
elmo_work_dir = '_'.join(('data/elmo_vectors',common_prefix))
# Directory of the ELMo model; passed to the vector-building shell script below.
elmo_model_dir = 'data/rusvectores_models/199'

In [None]:
! mkdir -p {elmo_work_dir}

In [None]:
%%time

def prepare_elmo_data(nouns):
    """Write the synset sentences and synset ids of one part of speech for ELMo.

    For every synset in ``ruwordnet.synsets_list`` whose id ends with ``'N'``
    (``nouns`` truthy) or ``'V'`` (otherwise), the normalized synset words are
    joined with spaces into one sentence. The sentences are written, one per
    line, to ``sentences_<POS>.txt`` and the ids, in the same order, to
    ``synsetids_<POS>.txt``; both files go to ``elmo_work_dir``.

    Reads the notebook-level ``params``, ``ruwordnet`` and ``elmo_work_dir``.

    Args:
        nouns: truthy to export noun synsets, falsy to export verb synsets.
    """
    pos = 'N' if nouns else 'V'

    # Nouns use only the RuThes name and sense names; verbs also use the
    # definition, the sense lemmas and the main words of the senses.
    verbs = not nouns
    make_sentences_params = {'ruthes_name': True,
                             'definition': verbs,
                             'senses_names': True,
                             'senses_lemmas': verbs,
                             'senses_main_word': verbs,
                             'sep': ' '
                            }

    # Entries of the grid point that are handed to the normalization function.
    norm_fn_param_names = ('out_of_vocab2synonym', 'sort', 'unique', 'lower',
                           'min_word_len', 'punct_symbols', 'prepositions',
                           'restrict_tags', 'accept_tags', 'ma')
    norm_fn_params = {name: params[name]
                      for name in norm_fn_param_names
                      if name in params}

    synsetid2sentence = dict()
    for synset in ruwordnet.synsets_list:
        synset_id = synset['id']
        if not synset_id.endswith(pos):
            continue

        words = get_synset_words(ruwordnet, synset_id,
                                 norm_params=norm_fn_params,
                                 make_sent_params=make_sentences_params,
                                 norm_function=normalize_ma
                                )
        synsetid2sentence[synset_id] = ' '.join(words).strip()

    fname = os.path.join(elmo_work_dir, f'sentences_{pos}.txt')
    fname_s = os.path.join(elmo_work_dir, f'synsetids_{pos}.txt')

    # Both files iterate the same dict, so line i of one matches line i of the other.
    # UTF-8 is set explicitly because the sentences are Cyrillic text and the
    # platform default encoding is not UTF-8 everywhere.
    with open(fname, 'w', encoding='utf-8') as f:
        f.writelines(sentence + '\n' for sentence in synsetid2sentence.values())
    with open(fname_s, 'w', encoding='utf-8') as f:
        f.writelines(synset_id + '\n' for synset_id in synsetid2sentence)

prepare_elmo_data(True)
prepare_elmo_data(False)

In [None]:
%%time
# Write the test words, one lower-cased word per line, into elmo_work_dir:
# public_nouns.txt, private_nouns.txt, public_verbs.txt and private_verbs.txt.
for nouns, pos_name in ((True, 'nouns'), (False, 'verbs')):
    # load_words rebinds the notebook-level public_words / private_words.
    load_words(nouns)
    for split_name, split_words in (('public', public_words), ('private', private_words)):
        # UTF-8 is set explicitly: the words are expected to be Cyrillic text and the
        # platform default encoding is not UTF-8 everywhere.
        with open(os.path.join(elmo_work_dir, f'{split_name}_{pos_name}.txt'), 'w', encoding='utf-8') as f:
            f.writelines(w.lower() + '\n' for w in split_words)

In [None]:
elmo_work_dir

In [None]:
elmo_model_dir

#### Get elmo vectors

In [None]:
! cd simple_elmo/ && ./make_elmo_vectors_ruwordnet.sh ../{elmo_work_dir} ../{elmo_model_dir} && cd ../

In [None]:
%%time
# Load the vectors stored for the noun synset sentences
# (presumably produced by the ELMo shell script above — TODO confirm).
ruwordnet_matrix = np.load(os.path.join(elmo_work_dir,
                                        'sentences_N_elmo_avg_vectors_199.npy')
                          )
ruwordnet_matrix.shape

In [None]:
# Load the vectors stored for the public noun test words and show the first row.
public_nouns_matrix = np.load(os.path.join(elmo_work_dir,
                                           'public_nouns_elmo_avg_vectors_199.npy')
                             )
public_nouns_matrix[0]

In [None]:
# Noun synset ids, one per line, in file order.
with open(os.path.join(elmo_work_dir, 'synsetids_N.txt'), 'r') as f:
    synset_ids_N = [raw_line.strip() for raw_line in f]

In [None]:
!head -2 {elmo_work_dir}/sentences_N.txt

In [None]:
!head -10 {elmo_work_dir}/public_nouns.txt

In [None]:
get_top(public_nouns_matrix[5], ruwordnet_matrix, synset_ids_N)

In [None]:
get_top(public_nouns_matrix[0], ruwordnet_matrix, synset_ids_N)

In [None]:
def _read_stripped_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    with open(path, 'r') as f:
        return [line.strip() for line in f]


def _load_elmo_inputs(public, nouns, elmo_work_dir):
    """Load all inputs of `process_words_elmo_199` that do not depend on the grid point.

    Args:
        public: truthy selects the ``public_*`` word files, falsy the ``private_*`` ones.
        nouns: truthy selects the noun files (``*_N*``, ``*_nouns*``), falsy the verb
            files (``*_V*``, ``*_verbs*``).
        elmo_work_dir: directory containing the ``.npy`` matrices and ``.txt`` label files.

    Returns:
        Tuple ``(gold_synsetid2parents, ruwordnet_matrix, synset_ids, sentences,
        words_matrix, words)``.
    """
    pos = "N" if nouns else "V"
    pos_name = "nouns" if nouns else "verbs"
    split_name = "public" if public else "private"

    gold_synsetid2parents = read_gold_dataset(f'data/training_data/synsets_{pos_name}.tsv')

    # Synset side: one matrix row per line of the ids / sentences files.
    ruwordnet_matrix = np.load(
        os.path.join(elmo_work_dir, f'sentences_{pos}_elmo_avg_vectors_199.npy'))
    synset_ids = _read_stripped_lines(os.path.join(elmo_work_dir, f'synsetids_{pos}.txt'))
    sentences = _read_stripped_lines(os.path.join(elmo_work_dir, f'sentences_{pos}.txt'))

    # Query side: one matrix row per line of the word list.
    words_matrix = np.load(
        os.path.join(elmo_work_dir, f'{split_name}_{pos_name}_elmo_avg_vectors_199.npy'))
    words = _read_stripped_lines(os.path.join(elmo_work_dir, f'{split_name}_{pos_name}.txt'))

    return gold_synsetid2parents, ruwordnet_matrix, synset_ids, sentences, words_matrix, words


def process_words_elmo_199(public, 
                       params_grid,
                       prefix, 
                       nouns,
                       elmo_work_dir,
                       out_dir='/tmp'):
    """Run the hypernym search over a parameter grid using precomputed ELMo vectors.

    For every parameter set the function scores each word vector with
    `get_top_hyperomyns_counter_v`, keeps the first `topn` keys in `most_common()`
    order, and writes two files into `out_dir`: ``<name>.json`` with the stringified
    parameters and ``<name>.tsv`` produced by `save_to_file`.  ``<name>`` is built
    from `prefix`, the parameter values and a timestamp.

    The input matrices, label files and gold data do not depend on the parameters,
    so they are loaded once before the grid loop.

    Relies on the notebook-level global `ruwordnet`.

    Args:
        public: truthy to process the public word list, falsy for the private one.
        params_grid: zero-argument callable returning an iterable of dicts with the
            keys ``p1``, ``p2``, ``p3``, ``k``, ``topn`` and ``account_gold``.
        prefix: leading part of the output file names.
        nouns: truthy for nouns, falsy for verbs.
        elmo_work_dir: directory holding the ``*_elmo_avg_vectors_199.npy`` matrices
            and the matching ``.txt`` files.
        out_dir: directory the result files are written to.

    Returns:
        None.
    """
    (gold_synsetid2parents, ruwordnet_matrix, synset_ids, sentences,
     words_matrix, words) = _load_elmo_inputs(public, nouns, elmo_work_dir)

    # zip() below silently truncates to the shortest input; surface misaligned data
    # instead of hiding it (results are unchanged, only a warning is logged).
    if not (len(sentences) == len(synset_ids) == len(ruwordnet_matrix)):
        logging.warning(
            f"Synset inputs are misaligned: {len(sentences)} sentences, "
            f"{len(synset_ids)} synset ids, {len(ruwordnet_matrix)} vectors.")
    if len(words) != len(words_matrix):
        logging.warning(
            f"Word inputs are misaligned: {len(words)} words, {len(words_matrix)} vectors.")

    # Lookup tables keyed by the synset sentence; if two synsets share the same
    # sentence string, the later one overwrites the earlier one.
    synsetstr2id = dict()
    synsetstr2vector = dict()
    for synsetstr, synsetid, synsetvector in zip(sentences, synset_ids, ruwordnet_matrix):
        synsetstr2id[synsetstr] = synsetid
        synsetstr2vector[synsetstr] = synsetvector

    for params in tqdm(params_grid()):
        start = time.time()
        p1, p2, p3 = params['p1'], params['p2'], params['p3']
        k, topn, account_gold = params['k'], params['topn'], params['account_gold']

        result = collections.defaultdict(list)
        for w, v in zip(words, words_matrix):
            hypernyms = get_top_hyperomyns_counter_v(v, 
                                                     k=k, 
                                                     p1=p1,p2=p2,p3=p3, 
                                                     account_gold=account_gold,
                                                     ruwordnet_matrix=ruwordnet_matrix,
                                                     gold_synsetid2parents=gold_synsetid2parents,
                                                     synsetstr2id=synsetstr2id,
                                                     synsetstr2vector=synsetstr2vector,
                                                     ruwordnet=ruwordnet
                                                    )
            result[w] = [h for h, rate in hypernyms.most_common()][:topn]

        # Timestamp keeps repeated runs with identical parameters from clobbering each other.
        curr_time = '_'.join(str(datetime.now()).split()).replace(':','')
        out_file  = f'{prefix}_{k}_{topn}_{p1}_{p2}_{p3}_{account_gold}_{curr_time}'
        out_file  = os.path.join(out_dir, out_file)

        # Parameters are stringified because some values may not be JSON-serialisable.
        with open(out_file+'.json', 'w') as of_json:
            params_out = {p: str(params[p]) for p in params.keys()}
            json.dump(params_out,fp=of_json, indent=4)

        save_to_file(result, out_file+'.tsv', 
                     ruwordnet)

        end = time.time()
        logging.info(f"Saved {out_file}.")
        logging.info(f"Spent time: {end - start} secs ({(end - start)/60.} minutes).")

In [None]:
%%time
# Grid run: public noun words.
public, nouns = True, True
process_words_elmo_199(
    public=public,
    params_grid=params_grid,
    prefix='_'.join(('public_nouns_elmo_199', common_prefix)),
    nouns=nouns,
    elmo_work_dir=elmo_work_dir,
    out_dir=results_dir,
)

In [None]:
%%time
# Grid run: public verb words.
public, nouns = True, False
process_words_elmo_199(
    public=public,
    params_grid=params_grid,
    prefix='_'.join(('public_verbs_elmo_199', common_prefix)),
    nouns=nouns,
    elmo_work_dir=elmo_work_dir,
    out_dir=results_dir,
)

In [None]:
%%time
# Grid run: private noun words.
public, nouns = False, True
process_words_elmo_199(
    public=public,
    params_grid=params_grid,
    prefix='_'.join(('private_nouns_elmo_199', common_prefix)),
    nouns=nouns,
    elmo_work_dir=elmo_work_dir,
    out_dir=results_dir,
)

In [None]:
%%time
# Grid run: private verb words.
public, nouns = False, False
process_words_elmo_199(
    public=public,
    params_grid=params_grid,
    prefix='_'.join(('private_verbs_elmo_199', common_prefix)),
    nouns=nouns,
    elmo_work_dir=elmo_work_dir,
    out_dir=results_dir,
)