In [1]:
import pickle
import numpy as np
import os
import pandas as pd

In [2]:
emb_name = 'eflomal'

In [3]:
processed_dataset_path = f"./sentence_retrieval/{emb_name}"

with open(f"{processed_dataset_path}/test/test.pickle", 'rb') as handle:
    test_set_to_store = pickle.load(handle)

In [4]:
print(f"Number of languages in test: {len(test_set_to_store)}")

Number of languages in test: 1335


In [5]:
test_set_to_store = {lang: data for lang, data in test_set_to_store.items() if len(data) >= 400}
print(f"Number of languages in testset that has more than 400 verses: {len(test_set_to_store)}")

Number of languages in testset that has more than 400 verses: 1250


In [6]:
# load concept networks
from gensim.models import KeyedVectors, Word2Vec

if emb_name == 'clique_word' or emb_name == 'nt_word':
    embedding_path = f"./word_embeddings_{emb_name}.kv"
    loaded_n2v = KeyedVectors.load(embedding_path)
elif emb_name == 'sentence_id':
    epochs = 50
    emb_dim = 200
    word_vec = Word2Vec.load(f"./word2vec_{epochs}_{emb_dim}.model")
    loaded_n2v = word_vec.wv
elif emb_name == 'eflomal':
    epochs = 10
    emb_dim = 200
    loaded_n2v = KeyedVectors.load(f"/mounts/data/proj/yihong/newhome/ConceptNetwork/network_related"
                                   f"/eflomal_vectors_{emb_dim}_{epochs}.wv")

In [7]:
def create_representations(data_dict, embedding_m, lang):
    verse_numbers = []
    representations = []

    for pair in data_dict:


        verse_number = pair[0]
        verse_numbers.append(verse_number)
        
        strings = pair[1]
        representation = np.zeros(embedding_m.vector_size)
        for string in strings:
            representation += np.array(embedding_m[f"{lang}:{string}"])
        representations.append(representation)
    
    verse_representations = dict(zip(verse_numbers, representations))
    
    return verse_representations

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# creating english representations
representations_eng = create_representations(test_set_to_store['eng'], loaded_n2v, 'eng')

# specify tok k
top_k = 1

zero_shot_langs = list(test_set_to_store.keys())
print(f"Number of zero-shot languages: {len(zero_shot_langs) - 1}")  # filter english
print()

topk_performance = {}

file_name = f"./sentence_retrieval/{emb_name}/zero_shot_{emb_name}_top_{top_k}_results.txt"
    
with open(file_name, 'w', encoding='utf-8') as f:
    for test_lang in zero_shot_langs:
        
        if test_lang == 'eng':
            continue
        
        sent = f"{test_lang}:"
        print(sent)
        f.write(sent + '\n')
        
        representations_tgt_lang = create_representations(test_set_to_store[test_lang], loaded_n2v, test_lang)
        common_keys = list(set(representations_eng.keys()).intersection(set(representations_tgt_lang.keys())))
        
        representations_eng_temp = np.array([representations_eng[v] for v in common_keys])
        representations_tgt_lang = np.array([representations_tgt_lang[v] for v in common_keys])
        
        cos_sim = cosine_similarity(representations_eng_temp, representations_tgt_lang)
    
        idx = np.argsort(cos_sim, axis=1)[:,-top_k:]
        
        matches = 0
        
        for index in range(len(idx)):
            if index in idx[index]:
                matches += 1
        
        sent = f"Top {top_k} accuracy: {round(matches/len(idx), 2)}"
        
        print(sent)
        f.write(sent + '\n')
        
        print()
        f.write('\n')
        
        topk_performance[test_lang] = round(matches/len(idx), 2)

Number of zero-shot languages: 1249

aai:
Top 1 accuracy: 0.64

aak:
Top 1 accuracy: 0.42

aau:
Top 1 accuracy: 0.71

aaz:
Top 1 accuracy: 0.55

abt:
Top 1 accuracy: 0.38

abx:
Top 1 accuracy: 0.66

aby:
Top 1 accuracy: 0.33

acd:
Top 1 accuracy: 0.67

ace:
Top 1 accuracy: 0.76

acf:
Top 1 accuracy: 0.78

ach:
Top 1 accuracy: 0.79

acn:
Top 1 accuracy: 0.79

acr:
Top 1 accuracy: 0.69

acu:
Top 1 accuracy: 0.29

ade:
Top 1 accuracy: 0.73

adh:
Top 1 accuracy: 0.74

adi:
Top 1 accuracy: 0.61

adj:
Top 1 accuracy: 0.71

adl:
Top 1 accuracy: 0.7

aeb:
Top 1 accuracy: 0.65

aeu:
Top 1 accuracy: 0.66

aey:
Top 1 accuracy: 0.59

afr:
Top 1 accuracy: 0.95

agd:
Top 1 accuracy: 0.52

agg:
Top 1 accuracy: 0.46

agm:
Top 1 accuracy: 0.36

agn:
Top 1 accuracy: 0.74

agr:
Top 1 accuracy: 0.34

agt:
Top 1 accuracy: 0.54

agu:
Top 1 accuracy: 0.54

agw:
Top 1 accuracy: 0.7

ahk:
Top 1 accuracy: 0.6

aia:
Top 1 accuracy: 0.62

aii:
Top 1 accuracy: 0.83

aim:
Top 1 accuracy: 0.64

aji:
Top 1 accuracy: 

Top 1 accuracy: 0.62

dur:
Top 1 accuracy: 0.68

dwr:
Top 1 accuracy: 0.63

dww:
Top 1 accuracy: 0.71

dyi:
Top 1 accuracy: 0.73

dyo:
Top 1 accuracy: 0.61

dyu:
Top 1 accuracy: 0.72

ebk:
Top 1 accuracy: 0.68

efi:
Top 1 accuracy: 0.85

eka:
Top 1 accuracy: 0.76

ell:
Top 1 accuracy: 0.95

emp:
Top 1 accuracy: 0.47

enb:
Top 1 accuracy: 0.48

enl:
Top 1 accuracy: 0.46

enm:
Top 1 accuracy: 0.99

enx:
Top 1 accuracy: 0.62

epo:
Top 1 accuracy: 0.96

eri:
Top 1 accuracy: 0.73

ese:
Top 1 accuracy: 0.24

esi:
Top 1 accuracy: 0.22

esk:
Top 1 accuracy: 0.2

est:
Top 1 accuracy: 0.87

esu:
Top 1 accuracy: 0.22

etu:
Top 1 accuracy: 0.08

eus:
Top 1 accuracy: 0.8

ewe:
Top 1 accuracy: 0.84

eza:
Top 1 accuracy: 0.67

faa:
Top 1 accuracy: 0.35

fai:
Top 1 accuracy: 0.54

fal:
Top 1 accuracy: 0.82

fao:
Top 1 accuracy: 0.93

ffm:
Top 1 accuracy: 0.66

fij:
Top 1 accuracy: 0.86

fil:
Top 1 accuracy: 0.94

fin:
Top 1 accuracy: 0.87

fon:
Top 1 accuracy: 0.67

for:
Top 1 accuracy: 0.46

fra:
Top

Top 1 accuracy: 0.87

las:
Top 1 accuracy: 0.67

lat:
Top 1 accuracy: 0.89

lav:
Top 1 accuracy: 0.83

lbj:
Top 1 accuracy: 0.67

lbk:
Top 1 accuracy: 0.71

lcm:
Top 1 accuracy: 0.76

ldi:
Top 1 accuracy: 0.75

lee:
Top 1 accuracy: 0.63

lef:
Top 1 accuracy: 0.61

leh:
Top 1 accuracy: 0.58

lem:
Top 1 accuracy: 0.68

leu:
Top 1 accuracy: 0.77

lew:
Top 1 accuracy: 0.6

lex:
Top 1 accuracy: 0.5

lgm:
Top 1 accuracy: 0.73

lhi:
Top 1 accuracy: 0.62

lhm:
Top 1 accuracy: 0.65

lhu:
Top 1 accuracy: 0.68

lia:
Top 1 accuracy: 0.74

lid:
Top 1 accuracy: 0.64

lif:
Top 1 accuracy: 0.59

lin:
Top 1 accuracy: 0.9

lip:
Top 1 accuracy: 0.64

lit:
Top 1 accuracy: 0.83

ljp:
Top 1 accuracy: 0.77

lmk:
Top 1 accuracy: 0.8

lmp:
Top 1 accuracy: 0.71

lob:
Top 1 accuracy: 0.76

lol:
Top 1 accuracy: 0.72

lom:
Top 1 accuracy: 0.73

loz:
Top 1 accuracy: 0.8

lsi:
Top 1 accuracy: 0.64

lsm:
Top 1 accuracy: 0.64

lug:
Top 1 accuracy: 0.6

luo:
Top 1 accuracy: 0.75

lus:
Top 1 accuracy: 0.82

lwo:
Top 1 a

Top 1 accuracy: 0.75

pcm:
Top 1 accuracy: 0.79

pdc:
Top 1 accuracy: 0.9

pdt:
Top 1 accuracy: 0.69

pes:
Top 1 accuracy: 0.87

pib:
Top 1 accuracy: 0.42

pio:
Top 1 accuracy: 0.41

pir:
Top 1 accuracy: 0.34

pis:
Top 1 accuracy: 0.79

pkb:
Top 1 accuracy: 0.66

plg:
Top 1 accuracy: 0.67

pls:
Top 1 accuracy: 0.68

plu:
Top 1 accuracy: 0.52

plw:
Top 1 accuracy: 0.73

pmf:
Top 1 accuracy: 0.72

pne:
Top 1 accuracy: 0.75

poe:
Top 1 accuracy: 0.63

poh:
Top 1 accuracy: 0.6

poi:
Top 1 accuracy: 0.56

pol:
Top 1 accuracy: 0.91

pon:
Top 1 accuracy: 0.73

por:
Top 1 accuracy: 0.96

poy:
Top 1 accuracy: 0.67

ppk:
Top 1 accuracy: 0.58

ppo:
Top 1 accuracy: 0.46

prf:
Top 1 accuracy: 0.77

pri:
Top 1 accuracy: 0.68

prk:
Top 1 accuracy: 0.85

prs:
Top 1 accuracy: 0.8

pse:
Top 1 accuracy: 0.77

ptp:
Top 1 accuracy: 0.63

ptu:
Top 1 accuracy: 0.62

pua:
Top 1 accuracy: 0.46

pwg:
Top 1 accuracy: 0.64

pww:
Top 1 accuracy: 0.71

qub:
Top 1 accuracy: 0.31

quc:
Top 1 accuracy: 0.62

quf:
Top 

Top 1 accuracy: 0.65

zar:
Top 1 accuracy: 0.65

zas:
Top 1 accuracy: 0.57

zat:
Top 1 accuracy: 0.7

zav:
Top 1 accuracy: 0.51

zaw:
Top 1 accuracy: 0.54

zca:
Top 1 accuracy: 0.58

zho:
Top 1 accuracy: 0.85

zia:
Top 1 accuracy: 0.53

ziw:
Top 1 accuracy: 0.65

zom:
Top 1 accuracy: 0.77

zos:
Top 1 accuracy: 0.51

zpc:
Top 1 accuracy: 0.46

zpi:
Top 1 accuracy: 0.68

zpl:
Top 1 accuracy: 0.58

zpm:
Top 1 accuracy: 0.48

zpo:
Top 1 accuracy: 0.63

zpq:
Top 1 accuracy: 0.52

zpt:
Top 1 accuracy: 0.68

zpu:
Top 1 accuracy: 0.58

zpv:
Top 1 accuracy: 0.64

zpz:
Top 1 accuracy: 0.63

zsm:
Top 1 accuracy: 0.76

zsr:
Top 1 accuracy: 0.64

ztq:
Top 1 accuracy: 0.61

zty:
Top 1 accuracy: 0.64

zul:
Top 1 accuracy: 0.68

zyp:
Top 1 accuracy: 0.78



In [20]:
perf_sum = 0
for _, perf in topk_performance.items():
    perf_sum += perf

print(f"The average top {top_k} accuracies over {len(topk_performance)} languages is {perf_sum/len(topk_performance)}")

The average top 1 accuracies over 1249 languages is 0.6155164131305044


In [21]:
# emb_name = 'nt_word'

# top 1 average score: 0.2821107544141256
# top 5 average score: 0.4655056179775288
# top 10 average score: 0.5500240770465489

In [22]:
# emb_name = 'clique_word'

# top 1 average score: 0.4125902165196475
# top 5 average score: 0.616495589414595
# top 10 average score: 0.6965116279069767

In [23]:
# emb_name = 'sentence_id'

# top 1 average score: 0.16943154523618906
# top 5 average score: 0.2851961569255401
# top 10 average score: 0.3491993594875901

In [13]:
# emb_name = 'eflomal_node2vec'

# epoch = 10
# top 1 average score: 0.6155164131305044
# top 5 average score: 0.7685908726981596
# top 10 average score: 0.8198398718975167