In [None]:
import logging
import time
from datetime import datetime
import os
import ast
import codecs
import json
import collections
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
import random
import fasttext
import string
from gensim.models import KeyedVectors
from tqdm import tqdm_notebook as tqdm
from ruwordnet import RuWordNet

FORMAT = "%(asctime)s %(levelname)s: %(message)s"
logging.basicConfig(level=logging.DEBUG,format=FORMAT)

from utils import _join, PUNCT_SYMBOLS, PREPOSITIONS, RESTRICT_TAGS, \
                  get_synset_str_and_vector_lemma, get_synset_words_lemma, \
                  get_top, get_top_hyperomyns_counter, get_top_hyperomyns_counter_v, \
                  read_train_dataset, read_train_dataset, read_gold_dataset, \
                  normalize_ma_lemmatize, \
                  save_to_file

In [None]:
import pymorphy2
morph_analyzer = pymorphy2.MorphAnalyzer()

### GRID PROCESS

In [None]:
common_prefix = 'lemm_lower_true'

In [None]:
results_dir = 'results/res_'+common_prefix

In [None]:
! mkdir -p {results_dir}

In [None]:
def params_grid():
    """Yield every parameter combination to evaluate, one dict per grid point.

    Each yielded dict mixes two groups of keys:

    * scoring keys (``p1``, ``p2``, ``p3``, ``k``, ``topn``, ``account_gold``)
      that the processing functions in this notebook read from the dict;
    * normalization keys (``normalize_func`` plus the keyword arguments that
      are picked out of the dict and forwarded as ``norm_params``).

    Every option is given as a list of candidate values; ``ParameterGrid``
    expands the lists into their Cartesian product. With the single-element
    lists used below, exactly one combination is produced.
    """
    # Punctuation to strip: the shared defaults plus special symbols from ruwordnet.
    punct_symbols = set(PUNCT_SYMBOLS) | {'—', '«', '»', '·', '\xad', '\xa0', '°', '–', '§'}

    all_params = {
        # Wider sweeps tried previously for p1/p2/p3:
        #   [-1.0, 0.0, 0.1, 0.5, 1.0, 2.0] and [0.0, 0.1, 1.0]
        'p1': [0.1],
        'p2': [1.0],
        'p3': [1.0],
        # Wider sweep tried previously for k and topn: [1, 3, 5, 7, 10, 15, 20]
        'k': [10],
        'topn': [10],
        'account_gold': [False],

        'normalize_func': [normalize_ma_lemmatize],

        'out_of_vocab2synonym': [None],
        'sort': [False],
        'unique': [False],
        'lower': [True],
        'min_word_len': [1],
        'punct_symbols': [punct_symbols],
        'prepositions': [PREPOSITIONS],
        'restrict_tags': [RESTRICT_TAGS],
        # A tag whitelist such as ['VERB', 'INFN', 'GRND', 'NOUN'] was considered here.
        'accept_tags': [None],
        'ma': [morph_analyzer],
    }

    yield from ParameterGrid(all_params)
        
len([params for params in tqdm(params_grid())])

In [None]:
params = list(params_grid())[0]
params

In [None]:
### LOADING DATA

In [None]:
%%time
ruwordnet = RuWordNet('data/ruwordnet')

In [None]:
%%time
ft_model_file='fasttext/cc.ru.300.bin'
ft_model = fasttext.load_model(ft_model_file)

In [None]:
%%time
tayga_none_fasttextcbow_300_10_2019='data/rusvectores_models/tayga_none_fasttextcbow_300_10_2019/model.model'
model_tayga = KeyedVectors.load(tayga_none_fasttextcbow_300_10_2019)

In [None]:
%%time
araneum_none_fasttextcbow_300_5_2018 = 'data/rusvectores_models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model'
model_araneum = KeyedVectors.load(araneum_none_fasttextcbow_300_5_2018)

In [None]:
modelname2model = {'ft.cc.ru.300': ft_model,
                   'tayga_none_fasttextcbow_300_10_2019': model_tayga,
                   'araneum_none_fasttextcbow_300_5_2018': model_araneum,
                  }

In [None]:
%%time
words, vector = get_synset_str_and_vector_lemma(ruwordnet, 
                                          '116365-V', 
                                          dict(), 
                                          {'ruthes_name': True, 
                                           'senses_names': True}, 
                                          norm_function=normalize_ma_lemmatize, 
                                          model=ft_model)
words

In [None]:
vector.shape

In [None]:
words, vector = get_synset_str_and_vector_lemma(ruwordnet, 
                                                '9577-N', 
                                                dict(), 
                                              {'ruthes_name': True, 
                                               'senses_names': True}, 
                                                norm_function=normalize_ma_lemmatize, 
                                                model=model_tayga)
words

In [None]:
%%time

# Exploratory cell: for one part of speech, build two lookups keyed by the
# normalized synset string -- synset id and synset vector -- using the grid
# point stored in `params` and the `ft_model` embeddings.
# All names assigned here are notebook globals that later cells inspect.

# NOTE(review): the second assignment wins, so this cell always runs for verbs;
# comment out the `nouns=False` line to run it for nouns.
nouns=True  # NOUNS
nouns=False # VERBS

# Unpack the scoring parameters and the normalization function from the grid point.
p1,p2,p3,k,topn,account_gold,norm_function = params['p1'],params['p2'],params['p3'],\
                                             params['k'],params['topn'],params['account_gold'],\
                                             params['normalize_func']

# Part-of-speech code compared against synset['part_of_speech'] below.
pos = "N" if nouns else "V"

# Names of the grid keys that are forwarded to the normalization function.
norm_fn_params_list = {'out_of_vocab2synonym',
                       'sort',
                       'unique',
                       'lower',
                       'min_word_len',
                       'punct_symbols',
                       'prepositions',
                       'restrict_tags',
                       'accept_tags',
                       'ma'
                       }
# Keep only the keys that are actually present in this grid point.
norm_fn_params = {param_name: params[param_name] 
                  for param_name in norm_fn_params_list
                  if param_name in params}
# Flags passed as `make_sent_params`; the verb variant switches on the
# definition, sense lemmas and sense main word in addition to the names.
if nouns:
    make_sentences_params = {'ruthes_name': True,
                             'definition': False,
                             'senses_names': True,
                             'senses_lemmas': False,
                             'senses_main_word': False,
                             'sep': ' '
                            }
else:
    make_sentences_params = {'ruthes_name': True,
                             'definition': True,
                             'senses_names': True,
                             'senses_lemmas': True,
                             'senses_main_word': True,
                             'sep': ' '
                            }

synsetstr2id = dict()
synsetstr2vector = dict()
for synset in ruwordnet.synsets_list:
    # Skip synsets of the other part of speech.
    if synset['part_of_speech'] != pos:
        continue

    synset_words, vector = get_synset_str_and_vector_lemma(ruwordnet, synset['id'], 
                                                     norm_params=norm_fn_params,
                                                     make_sent_params=make_sentences_params,
                                                     norm_function=norm_function,
                                                     model=ft_model)
    synsetstr = ' '.join(synset_words)
    # Two synsets can normalize to the same string; this is only logged, and
    # the later synset then overwrites the earlier one in both lookups.
    if synsetstr in synsetstr2id:
        logging.error(f"Duplicate synset_str.{synset['id'],synsetstr2id[synsetstr]}:'{synsetstr}'")
    synsetstr2id[synsetstr] = synset['id']
    synsetstr2vector[synsetstr] = vector

In [None]:
synsetstr2id

In [None]:
make_sentences_params

In [None]:
get_synset_words_lemma(ruwordnet, '124987-N',
                       norm_params=norm_fn_params,
                       make_sent_params=make_sentences_params,
                       norm_function=normalize_ma_lemmatize,
                      )

### PROCESS WORDS FOR TEST SETS

In [None]:
def process_words(words, 
                  params_grid,
                  prefix, 
                  nouns, 
                  model,
                  algo='default',
                  out_dir='/tmp'):
    """Predict hypernym synsets for `words` and save one result set per grid point.

    For every parameter dict produced by `params_grid()` the function:

    1. embeds every synset of the requested part of speech from the
       notebook-level `ruwordnet` with `get_synset_str_and_vector_lemma`;
    2. stacks those vectors into a matrix, one row per distinct synset string;
    3. calls `get_top_hyperomyns_counter` for each word and keeps the first
       `topn` entries of its `.most_common()` ranking;
    4. writes `<out_dir>/<name>.json` (stringified parameters) and
       `<out_dir>/<name>.tsv` (via `save_to_file`), where `<name>` is built
       from `prefix`, the scoring parameters and the current timestamp.

    Args:
        words: iterable of query words.
        params_grid: zero-argument callable returning an iterable of
            parameter dicts (keys `p1`, `p2`, `p3`, `k`, `topn`,
            `account_gold`, `normalize_func` and optional normalization keys).
        prefix: leading part of the output file names.
        nouns: truthy to process nouns ("N" synsets), falsy for verbs ("V").
        model: embedding model forwarded to the `utils` helpers.
        algo: currently unused; kept for interface compatibility.
        out_dir: directory for the output files; created if missing.

    Raises:
        ValueError: if `ruwordnet` contains no synsets of the requested
            part of speech, so no vector matrix can be built.
    """
    pos = "N" if nouns else "V"
    gold_path = ('data/training_data/synsets_nouns.tsv' if nouns
                 else 'data/training_data/synsets_verbs.tsv')
    # Grid keys that are forwarded to the normalization function.
    norm_fn_param_names = ('out_of_vocab2synonym', 'sort', 'unique', 'lower',
                           'min_word_len', 'punct_symbols', 'prepositions',
                           'restrict_tags', 'accept_tags', 'ma')

    # Fail early (before the expensive embedding pass) if the output
    # directory cannot be created.
    os.makedirs(out_dir, exist_ok=True)

    for params in tqdm(params_grid()):
        start = time.time()
        p1, p2, p3 = params['p1'], params['p2'], params['p3']
        k, topn = params['k'], params['topn']
        account_gold = params['account_gold']
        norm_function = params['normalize_func']

        norm_fn_params = {name: params[name]
                          for name in norm_fn_param_names
                          if name in params}

        # Verbs additionally use the definition, sense lemmas and sense main
        # word when the synset text is assembled; nouns use names only.
        verbose_text = not nouns
        make_sentences_params = {'ruthes_name': True,
                                 'definition': verbose_text,
                                 'senses_names': True,
                                 'senses_lemmas': verbose_text,
                                 'senses_main_word': verbose_text,
                                 'sep': ' '
                                }

        # Only the gold mapping is consumed by the scoring call below.
        gold_synsetid2parents = read_gold_dataset(gold_path)

        synsetstr2id = dict()
        synsetstr2vector = dict()
        for synset in ruwordnet.synsets_list:
            if synset['part_of_speech'] != pos:
                continue

            synset_words, vector = get_synset_str_and_vector_lemma(ruwordnet, synset['id'], 
                                                                   norm_params=norm_fn_params,
                                                                   make_sent_params=make_sentences_params,
                                                                   norm_function=norm_function,
                                                                   model=model)
            synsetstr = ' '.join(synset_words)
            # Duplicate strings are only logged; the later synset overwrites
            # the earlier one in both lookups.
            if synsetstr in synsetstr2id:
                logging.error(f"Duplicate synset_str.{synset['id'],synsetstr2id[synsetstr]}:'{synsetstr}'")
            synsetstr2id[synsetstr] = synset['id']
            synsetstr2vector[synsetstr] = vector

        if not synsetstr2vector:
            raise ValueError(f"No synsets with part_of_speech == '{pos}' found in ruwordnet; "
                             "cannot build the synset vector matrix.")

        # One row per distinct synset string, in dict insertion order; the
        # dtype of the first vector is used for the whole matrix.
        synset_vectors = list(synsetstr2vector.values())
        ruwordnet_matrix = np.stack(synset_vectors).astype(synset_vectors[0].dtype, copy=False)

        result = collections.defaultdict(list)
        for w in words:
            hypernyms = get_top_hyperomyns_counter(w, 
                                                   k=k, 
                                                   p1=p1,p2=p2,p3=p3, 
                                                   account_gold=account_gold,
                                                   ruwordnet_matrix=ruwordnet_matrix,
                                                   gold_synsetid2parents=gold_synsetid2parents,
                                                   synsetstr2id=synsetstr2id,
                                                   synsetstr2vector=synsetstr2vector,
                                                   model=model,
                                                   ruwordnet=ruwordnet
                                                  )
            result[w] = [h for h, rate in hypernyms.most_common()][:topn]

        # Timestamp without spaces/colons so it is safe inside a file name.
        curr_time = '_'.join(str(datetime.now()).split()).replace(':','')
        out_file = f'{prefix}_{k}_{topn}_{p1}_{p2}_{p3}_{account_gold}_{curr_time}'
        out_file = os.path.join(out_dir, out_file)

        # Parameters are stringified because the grid holds non-JSON values
        # (functions, sets, the morphological analyzer).
        with open(out_file+'.json', 'w') as of_json:
            params_out = {p: str(params[p]) for p in params.keys()}
            json.dump(params_out, fp=of_json, indent=4)

        save_to_file(result, out_file+'.tsv', 
                     ruwordnet)

        end = time.time()
        logging.info(f"Saved {out_file}.")
        logging.info(f"Spent time: {end - start} secs ({(end - start)/60.} minutes).")

In [None]:
public_words = list()
private_words = list()

def load_words(nouns):
    """Load the public and private test word lists into the notebook globals.

    Rebinds the module-level `public_words` and `private_words` to the
    stripped, lower-cased lines of
    `data/public_test/<pos>_public.tsv` and
    `data/private_test/<pos>_private.tsv`, where `<pos>` is ``nouns`` when
    `nouns` is truthy and ``verbs`` otherwise.

    Args:
        nouns: truthy to load the noun lists, falsy to load the verb lists.

    Returns:
        None; the result is published through the two globals.
    """
    global public_words
    global private_words

    pos_name = 'nouns' if nouns else 'verbs'

    def _read_words(path):
        # Decode explicitly instead of relying on the locale default, so
        # non-ASCII words load identically on every platform.
        # NOTE(review): assumes the .tsv files are UTF-8 encoded -- confirm.
        with open(path, 'r', encoding='utf-8') as f:
            return [line.strip().lower() for line in f]

    public_words = _read_words(f'data/public_test/{pos_name}_public.tsv')
    private_words = _read_words(f'data/private_test/{pos_name}_private.tsv')

### PUBLIC NOUNS

In [None]:
nouns = True # False -- Verbs
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(public_words, 
                  params_grid, 
                  '_'.join(('public_nouns',common_prefix, model_name)),
                  nouns,
                  model=model, 
                  out_dir=results_dir
                  )

### PRIVATE NOUNS

In [None]:
nouns = True 
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(private_words, 
                  params_grid, 
                  '_'.join(('private_nouns',common_prefix, model_name)),
                  nouns,
                  model=model,
                  out_dir=results_dir
                 )

### PUBLIC VERBS

In [None]:
nouns = False 
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(public_words, 
                  params_grid, 
                  '_'.join(('public_verbs',common_prefix, model_name)),
                  nouns,
                  model=model,
                  out_dir=results_dir
                 )

### PRIVATE VERBS

In [None]:
nouns = False 
load_words(nouns)
for model_name, model in modelname2model.items():
    process_words(private_words,
                  params_grid,
                  '_'.join(('private_verbs',common_prefix, model_name)),
                  nouns,
                  model=model,
                  out_dir=results_dir
                 )

# ELMO

In [None]:
params

In [None]:
elmo_work_dir = '_'.join(('data/elmo_vectors',common_prefix))
elmo_model_dir = 'data/rusvectores_models/199'

In [None]:
! mkdir -p {elmo_work_dir}

In [None]:
%%time

def prepare_elmo_data(nouns):
    """Write the normalized synset sentences and their ids for one part of speech.

    Reads the notebook globals `ruwordnet`, `params` and `elmo_work_dir`.
    For every synset whose id ends with ``N`` (when `nouns` is truthy) or
    ``V`` (otherwise), the words returned by `get_synset_words_lemma` are
    joined with spaces into one sentence. Two line-aligned files are then
    written into `elmo_work_dir`: ``sentences_<POS>.txt`` and
    ``synsetids_<POS>.txt``.

    NOTE(review): both files are written with the platform default text
    encoding, the same default the readers in this notebook use.

    Args:
        nouns: truthy for noun synsets, falsy for verb synsets.
    """
    # Verbs use the richer synset text (definition, sense lemmas, main word).
    rich_text = not nouns
    make_sentences_params = {
        'ruthes_name': True,
        'definition': rich_text,
        'senses_names': True,
        'senses_lemmas': rich_text,
        'senses_main_word': rich_text,
        'sep': ' ',
    }

    # Subset of the global grid point that the normalization function accepts.
    forwarded_keys = (
        'out_of_vocab2synonym', 'sort', 'unique', 'lower', 'min_word_len',
        'punct_symbols', 'prepositions', 'restrict_tags', 'accept_tags', 'ma',
    )
    norm_fn_params = {key: params[key] for key in forwarded_keys if key in params}

    id_suffix = 'N' if nouns else 'V'
    synsetid2sentence = {}
    for entry in ruwordnet.synsets_list:
        sid = entry['id']
        if sid.endswith(id_suffix):
            tokens = get_synset_words_lemma(
                ruwordnet,
                sid,
                norm_params=norm_fn_params,
                make_sent_params=make_sentences_params,
                norm_function=normalize_ma_lemmatize,
            )
            synsetid2sentence[sid] = ' '.join(tokens).strip()

    if nouns:
        sentences_name, ids_name = 'sentences_N.txt', 'synsetids_N.txt'
    else:
        sentences_name, ids_name = 'sentences_V.txt', 'synsetids_V.txt'

    # Both files iterate the same dict, so line i of one matches line i of the other.
    with open(os.path.join(elmo_work_dir, sentences_name), 'w') as sentences_out:
        sentences_out.writelines(text + '\n' for text in synsetid2sentence.values())
    with open(os.path.join(elmo_work_dir, ids_name), 'w') as ids_out:
        ids_out.writelines(sid + '\n' for sid in synsetid2sentence.keys())

prepare_elmo_data(True)
prepare_elmo_data(False)

In [None]:
%%time
# Write the test words, one per line, into elmo_work_dir as
# public_nouns.txt / private_nouns.txt / public_verbs.txt / private_verbs.txt.
# load_words() rebinds the public_words / private_words globals for the
# requested part of speech, so it is called once for nouns and once for verbs.
# NOTE(review): the .lower() calls are redundant -- load_words() already
# lower-cases every word it reads.
# NOTE(review): files are written with the platform default text encoding.
nouns = True 
load_words(nouns)
with open(os.path.join(elmo_work_dir,'public_nouns.txt'), 'w') as f:
    f.writelines([w.lower()+'\n' for w in public_words])
with open(os.path.join(elmo_work_dir,'private_nouns.txt'), 'w') as f:
    f.writelines([w.lower()+'\n' for w in private_words])
# Same again for verbs; after this cell the globals hold the verb lists.
nouns = False 
load_words(nouns)
with open(os.path.join(elmo_work_dir,'public_verbs.txt'), 'w') as f:
    f.writelines([w.lower()+'\n' for w in public_words])
with open(os.path.join(elmo_work_dir,'private_verbs.txt'), 'w') as f:
    f.writelines([w.lower()+'\n' for w in private_words])

In [None]:
elmo_work_dir

In [None]:
elmo_model_dir

#### Get elmo vectors

In [None]:
! cd simple_elmo/ && ./make_elmo_vectors_ruwordnet.sh ../{elmo_work_dir} ../{elmo_model_dir} && cd ../

In [None]:
%%time
ruwordnet_matrix = np.load(os.path.join(elmo_work_dir,
                                        'sentences_N_elmo_avg_vectors_199.npy')
                          )
ruwordnet_matrix.shape

In [None]:
public_nouns_matrix = np.load(os.path.join(elmo_work_dir,
                                           'public_nouns_elmo_avg_vectors_199.npy')
                             )
public_nouns_matrix[0]

In [None]:
with open(os.path.join(elmo_work_dir,'synsetids_N.txt'), 'r') as f:
    synset_ids_N = [l.strip() for l in f.readlines()]

In [None]:
!head -2 {elmo_work_dir}/sentences_N.txt

In [None]:
!head -10 {elmo_work_dir}/public_nouns.txt

In [None]:
get_top(public_nouns_matrix[5], ruwordnet_matrix, synset_ids_N)

In [None]:
get_top(public_nouns_matrix[0], ruwordnet_matrix, synset_ids_N)

In [None]:
def process_words_elmo_199(public, 
                       params_grid,
                       prefix, 
                       nouns,
                       elmo_work_dir,
                       out_dir='/tmp'):
    """Predict hypernym synsets from precomputed ELMo vectors and save the results.

    For every parameter dict produced by `params_grid()` the function loads,
    from `elmo_work_dir`:

    * ``sentences_<POS>_elmo_avg_vectors_199.npy`` with the line-aligned
      ``synsetids_<POS>.txt`` and ``sentences_<POS>.txt`` (``<POS>`` is
      ``N`` or ``V``);
    * ``<split>_<pos>_elmo_avg_vectors_199.npy`` with the line-aligned
      ``<split>_<pos>.txt`` (``<split>`` is ``public``/``private``,
      ``<pos>`` is ``nouns``/``verbs``).

    Each word vector is scored with `get_top_hyperomyns_counter_v`; the first
    `topn` entries of its `.most_common()` ranking are kept. Per grid point a
    ``.json`` file (stringified parameters) and a ``.tsv`` file (via
    `save_to_file`) are written into `out_dir`.

    Text files are read with the platform default encoding, the same default
    used by the cells of this notebook that write them.

    Args:
        public: truthy for the public test split, falsy for the private one.
        params_grid: zero-argument callable returning an iterable of
            parameter dicts (keys `p1`, `p2`, `p3`, `k`, `topn`,
            `account_gold`).
        prefix: leading part of the output file names.
        nouns: truthy for nouns, falsy for verbs.
        elmo_work_dir: directory holding the text files and `.npy` matrices.
        out_dir: directory for the output files; created if missing.

    Raises:
        ValueError: if a text file and its vector matrix have different
            numbers of rows (pairing them would silently mislabel vectors).
    """
    pos = "N" if nouns else "V"
    pos_name = 'nouns' if nouns else 'verbs'
    split_name = 'public' if public else 'private'
    gold_path = f'data/training_data/synsets_{pos_name}.tsv'

    def _read_lines(file_name):
        # One stripped entry per line of a file inside elmo_work_dir.
        with open(os.path.join(elmo_work_dir, file_name), 'r') as f:
            return [line.strip() for line in f]

    os.makedirs(out_dir, exist_ok=True)

    for params in tqdm(params_grid()):
        start = time.time()
        p1, p2, p3 = params['p1'], params['p2'], params['p3']
        k, topn = params['k'], params['topn']
        account_gold = params['account_gold']

        # Only the gold mapping is consumed by the scoring call below.
        gold_synsetid2parents = read_gold_dataset(gold_path)

        ruwordnet_matrix = np.load(os.path.join(elmo_work_dir,
                                                f'sentences_{pos}_elmo_avg_vectors_199.npy'))
        synset_ids = _read_lines(f'synsetids_{pos}.txt')
        sentences = _read_lines(f'sentences_{pos}.txt')

        words_matrix = np.load(os.path.join(elmo_work_dir,
                                            f'{split_name}_{pos_name}_elmo_avg_vectors_199.npy'))
        words = _read_lines(f'{split_name}_{pos_name}.txt')

        # zip() below would silently truncate to the shortest input, pairing
        # ids and words with the wrong vectors if the files are out of sync.
        if not (len(sentences) == len(synset_ids) == len(ruwordnet_matrix)):
            raise ValueError(
                f"Synset files are misaligned in '{elmo_work_dir}': "
                f"{len(sentences)} sentences, {len(synset_ids)} ids, "
                f"{len(ruwordnet_matrix)} vectors.")
        if len(words) != len(words_matrix):
            raise ValueError(
                f"Word files are misaligned in '{elmo_work_dir}': "
                f"{len(words)} words, {len(words_matrix)} vectors.")

        synsetstr2id = dict()
        synsetstr2vector = dict()
        # Identical sentences share one key: the later synset overwrites the earlier one.
        for synsetstr, synsetid, synsetvector in zip(sentences, synset_ids, ruwordnet_matrix):
            synsetstr2id[synsetstr] = synsetid
            synsetstr2vector[synsetstr] = synsetvector

        result = collections.defaultdict(list)
        for w, v in zip(words, words_matrix):
            hypernyms = get_top_hyperomyns_counter_v(v, 
                                                     k=k, 
                                                     p1=p1,p2=p2,p3=p3, 
                                                     account_gold=account_gold,
                                                     ruwordnet_matrix=ruwordnet_matrix,
                                                     gold_synsetid2parents=gold_synsetid2parents,
                                                     synsetstr2id=synsetstr2id,
                                                     synsetstr2vector=synsetstr2vector,
                                                     ruwordnet=ruwordnet
                                                    )
            result[w] = [h for h, rate in hypernyms.most_common()][:topn]

        # Timestamp without spaces/colons so it is safe inside a file name.
        curr_time = '_'.join(str(datetime.now()).split()).replace(':','')
        out_file = f'{prefix}_{k}_{topn}_{p1}_{p2}_{p3}_{account_gold}_{curr_time}'
        out_file = os.path.join(out_dir, out_file)

        # Parameters are stringified because the grid holds non-JSON values.
        with open(out_file+'.json', 'w') as of_json:
            params_out = {p: str(params[p]) for p in params.keys()}
            json.dump(params_out, fp=of_json, indent=4)

        save_to_file(result, out_file+'.tsv', 
                     ruwordnet)

        end = time.time()
        logging.info(f"Saved {out_file}.")
        logging.info(f"Spent time: {end - start} secs ({(end - start)/60.} minutes).")

In [None]:
%%time
public=True
nouns=True
process_words_elmo_199(public, params_grid, 
                       '_'.join(('public_nouns_elmo_199',common_prefix)), 
                       nouns, elmo_work_dir, out_dir=results_dir)

In [None]:
%%time
public=True
nouns=False
process_words_elmo_199(public, params_grid, 
                       '_'.join(('public_verbs_elmo_199',common_prefix)),
                       nouns, elmo_work_dir, out_dir=results_dir)

In [None]:
%%time
public=False
nouns=True
process_words_elmo_199(public, params_grid, 
                       '_'.join(('private_nouns_elmo_199',common_prefix)),
                       nouns, elmo_work_dir, out_dir=results_dir)

In [None]:
%%time
public=False
nouns=False
process_words_elmo_199(public, params_grid, 
                       '_'.join(('private_verbs_elmo_199',common_prefix)),
                       nouns, elmo_work_dir, out_dir=results_dir)