In [1]:
import time
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.initializers import Zeros
from web.embedding import Embedding
from web.evaluate import evaluate_on_all
from bats_eval import *
import warnings


In [2]:
base_dir = f"{os.getcwd()}/embeddings"
file_list = ['init_glove_emb.txt', 'fin_glove_emb_False_01.txt',
             'init_syngcn_emb.txt', 'fin_syngcn_emb_False_01.txt']
verbose = True
warnings.filterwarnings("ignore")

## Standard Evaluation Metrics

In [3]:
for file in file_list:

    sem_info_path = f"{os.getcwd()}/semantic_info"
    emb_init_file = f"{os.getcwd()}/embeddings/{file}"
    voc2id = dict()
    i = 0
    with open(emb_init_file, 'r') as file:
        for line in file:
            voc2id[line.strip().split(' ')[0]] = i
            i += 1
    id2voc = {v: k for k, v in voc2id.items()}
    vocab_size = len(voc2id)
    
    emb = tf.Variable(Zeros()(shape=(vocab_size, 300), dtype=tf.float32), trainable=False)
    x = np.asarray(pd.read_csv(emb_init_file, sep=' ', header=None))
    word2row = {word: int(i) for i, word in enumerate(x[:, 0])}
    order = list()
    for i in range(len(id2voc)):
        try:
            order.append(word2row[id2voc[i]])
        except KeyError:
            order.append(word2row["UNK"])
    x = x[order, 1:]                
    emb.assign(x)
    
    embed_matrix = tf.math.l2_normalize(emb, axis=1)
    words = [id2voc[i] for i in range(len(id2voc))]
    voc2vec = dict(zip(words, iter(embed_matrix.numpy())))
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    
    print(file)
    print(results)


Missing 25 words. Will replace them with mean vector
Missing 17 words. Will replace them with mean vector
Missing 11 words. Will replace them with mean vector
Missing 5 words. Will replace them with mean vector
Missing 2 words. Will replace them with mean vector
Missing 1322 words. Will replace them with mean vector
Missing 35 words. Will replace them with mean vector
Missing 6823 words. Will replace them with mean vector
Missing 3905 words. Will replace them with mean vector


<_io.TextIOWrapper name='/home/ecbm4040/Project/COMS_W4995_Project/embeddings/init_glove_emb.txt' mode='r' encoding='UTF-8'>
         AP  BLESS    Battig  ESSLI_2c  ESSLI_2b  ESSLI_1a     WS353  \
0  0.634328  0.835  0.400688  0.644444      0.75  0.795455  0.545717   

     WS353R    WS353S       MEN  SimLex999        RW      RG65     MTurk  \
0  0.455616  0.670692  0.730401   0.388533  0.248189  0.760149  0.577266   

        MSR    Google  SemEval2012_2  
0  0.156875  0.546664       0.156465  


Missing 25 words. Will replace them with mean vector
Missing 17 words. Will replace them with mean vector
Missing 11 words. Will replace them with mean vector
Missing 5 words. Will replace them with mean vector
Missing 2 words. Will replace them with mean vector
Missing 1322 words. Will replace them with mean vector
Missing 35 words. Will replace them with mean vector
Missing 6823 words. Will replace them with mean vector
Missing 3905 words. Will replace them with mean vector


<_io.TextIOWrapper name='/home/ecbm4040/Project/COMS_W4995_Project/embeddings/fin_glove_emb_False_01.txt' mode='r' encoding='UTF-8'>
         AP  BLESS    Battig  ESSLI_2c  ESSLI_2b  ESSLI_1a     WS353  \
0  0.659204   0.85  0.416173       0.6      0.75  0.840909  0.565899   

     WS353R    WS353S       MEN  SimLex999        RW      RG65     MTurk  \
0  0.464069  0.703647  0.744425   0.456057  0.251035  0.819574  0.595415   

       MSR    Google  SemEval2012_2  
0  0.18225  0.557921       0.163997  


Missing 24 words. Will replace them with mean vector
Missing 16 words. Will replace them with mean vector
Missing 11 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector
Missing 978 words. Will replace them with mean vector
Missing 2 words. Will replace them with mean vector
Missing 1566 words. Will replace them with mean vector
Missing 148 words. Will replace them with mean vector


<_io.TextIOWrapper name='/home/ecbm4040/Project/COMS_W4995_Project/embeddings/init_syngcn_emb.txt' mode='r' encoding='UTF-8'>
         AP  BLESS    Battig  ESSLI_2c  ESSLI_2b  ESSLI_1a     WS353  \
0  0.679104  0.795  0.462818  0.622222     0.825  0.727273  0.609096   

     WS353R   WS353S       MEN  SimLex999        RW      RG65     MTurk  \
0  0.485657  0.75034  0.710288   0.447495  0.335924  0.796167  0.662468   

        MSR    Google  SemEval2012_2  
0  0.528375  0.507266       0.226789  


Missing 24 words. Will replace them with mean vector
Missing 16 words. Will replace them with mean vector
Missing 11 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector
Missing 978 words. Will replace them with mean vector
Missing 2 words. Will replace them with mean vector
Missing 1566 words. Will replace them with mean vector
Missing 148 words. Will replace them with mean vector


<_io.TextIOWrapper name='/home/ecbm4040/Project/COMS_W4995_Project/embeddings/fin_syngcn_emb_False_01.txt' mode='r' encoding='UTF-8'>
         AP  BLESS    Battig  ESSLI_2c  ESSLI_2b  ESSLI_1a     WS353   WS353R  \
0  0.674129  0.755  0.455745       0.6     0.825  0.795455  0.616665  0.49382   

     WS353S       MEN  SimLex999        RW      RG65     MTurk       MSR  \
0  0.768139  0.723453   0.479953  0.334516  0.811946  0.671519  0.546625   

     Google  SemEval2012_2  
0  0.534947       0.231467  


## BATS Evaluation

In [4]:
def collect_words(filepath):

    all_words = list()
    base = f"{os.getcwd()}/BATS"
    for dr in os.listdir(base):
        if os.path.isdir(os.path.join(base, dr)):
            dk = dr.split('_', 1)[1].lower()
            for f in os.listdir(os.path.join(base, dr)):
                in_filepath = os.path.join(base, dr, f)
                pairs = [line.strip().split() for line in open(in_filepath, 'r').readlines()]
                pairs = [[p[0], p[1].split('/')] for p in pairs]
                vocab = list()
                for p in pairs:
                    vocab.append(p[0])
                    for i in p[1]:
                        vocab.append(i)                
                all_words += vocab
    all_words = list(set(all_words))
    all_words.sort()
    all_words.remove("")
    all_words_dict = {k: i for i, k in enumerate(all_words)}

    file_words = dict()
    matrix = list()
    vocab = list()
    indices = dict()
    i = 0
    with open(filepath, 'r') as file:
        for line in file:
            x = line.strip('\n').split(' ')
            try:
                all_words_dict[x[0]]
                matrix.append(x[1:])
                vocab.append(x[0])
                indices[x[0]] = i
                i+=0
            except KeyError:
                pass        
    matrix = np.asarray(matrix, dtype="float32")
    
    return matrix, vocab, indices

In [5]:
results = list()
for f in file_list:
    filepath = f"{base_dir}/{f}"
    temp = load_model(filepath)    
    matrix, vocab, indices = collect_words(filepath)
    model = dict()
    for i in vocab:
        model[i] = temp[i]    
    r = {}
    if verbose: print('[evaluate_models] Evaluating on BATS...')
    res = eval_bats(model, matrix, vocab, indices)
    results.append([f, res])

[evaluate_models] Evaluating on BATS...
[evaluate_models] Evaluating on BATS...
[evaluate_models] Evaluating on BATS...
[evaluate_models] Evaluating on BATS...


In [7]:
for f, res in results:
    print(f"\n{f}")
    [print(k, v) for k, v in res.items()]


init_glove_emb.txt
inflectional_morphology 0.007662835249042145
I03 [adj - comparative] 0.0
I01 [noun - plural_reg] 0.0
I02 [noun - plural_irreg] 0.06896551724137931
I09 [verb_Ving - Ved] 0.0
I06 [verb_inf - Ving] 0.0
I10 [verb_3pSg - Ved] None
I04 [adj - superlative] 0.0
I05 [verb_inf - 3pSg] 0.0
I08 [verb_Ving - 3pSg] 0.0
I07 [verb_inf - Ved] 0.0
encyclopedic_semantics 0.12691765681127382
E05 [name - occupation] 0.14583333333333334
E02 [country - language] 0.14
E09 [things - color] 0.2857142857142857
E01 [country - capital] 0.3
E04 [name - nationality] 0.0425531914893617
E08 [animal - shelter] 0.12
E07 [animal - sound] 0.041666666666666664
E10 [male - female] 0.09090909090909091
E03 [UK_city - county] 0.04
E06 [animal - young] 0.0625
derivational_morphology 0.0023809523809523807
D07 [verb+able_reg] 0.0
D05 [adj+ness_reg] 0.0
D08 [verb+er_irreg] 0.0
D01 [noun+less_reg] 0.0
D09 [verb+tion_irreg] 0.0
D02 [un+adj_reg] 0.023809523809523808
D04 [over+adj_reg] 0.0
D06 [re+verb_reg] 0.0
D03