In [1]:
import pickle
import numpy as np

In [2]:
# if we use the updated ngrams: ngrams that do not include white spaces
updated_ngrams = True
number_of_languages = 50

In [3]:
processed_dataset_path = '/mounts/data/proj/yihong/newhome/ConceptNetwork/eva/sentence_retrieval'

if updated_ngrams:
    processed_dataset_path = '/mounts/data/proj/yihong/newhome/ConceptNetwork/eva/sentence_retrieval/updated'

with open(f"{processed_dataset_path}/{number_of_languages}/test/test.pickle", 'rb') as handle:
    test_set_to_store = pickle.load(handle)

In [4]:
print(f"Number of languages in test: {len(test_set_to_store)}")

Number of languages in test: 1335


In [5]:
test_set_to_store = {lang: data for lang, data in test_set_to_store.items() if len(data) >= 400}
print(f"Number of languages in testset that has more than 400 verses: {len(test_set_to_store)}")

Number of languages in testset that has more than 400 verses: 1250


In [6]:
# load concept networks
from gensim.models import KeyedVectors

emb_dim = 200
num_epochs = 10

embedding_path = f"/mounts/data/proj/yihong/newhome/ConceptNetwork/network_related/" + \
                 f"expandednet_vectors_{emb_dim}_{num_epochs}.wv"

if updated_ngrams:
    embedding_path = f"/mounts/data/proj/yihong/newhome/ConceptNetwork/network_related/" + \
                     f"expandednet_vectors_minlang_{number_of_languages}_{emb_dim}_{num_epochs}_updated.wv"

loaded_n2v = KeyedVectors.load(embedding_path)

In [7]:
def create_representations(data_dict, embedding_m, lang):
    verse_numbers = []
    representations = []

    for pair in data_dict:


        verse_number = pair[0]
        verse_numbers.append(verse_number)
        
        strings = pair[1]
        representation = np.zeros(embedding_m.vector_size)
        for string in strings:
            representation += np.array(embedding_m[string]) if lang == 'eng' \
            else np.array(embedding_m[f"{lang}:{string}"])
        representations.append(representation)
    
    verse_representations = dict(zip(verse_numbers, representations))
    
    return verse_representations

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

src_lang = 'rus'
# creating english representations
representations_src = create_representations(test_set_to_store[src_lang], loaded_n2v, src_lang)

# specify tok k
top_k = 10

zero_shot_langs = list(test_set_to_store.keys())
print(f"Number of zero-shot languages: {len(zero_shot_langs) - 1}")  # filter english
print()

topk_performance = {}

file_name = f"./zero_shot_{src_lang}_top_{top_k}_results_minlang_{number_of_languages}.txt"

if updated_ngrams:
    file_name = f"./zero_shot_{src_lang}_top_{top_k}_results_updated_minlang_{number_of_languages}.txt"
    
with open(file_name, 'w', encoding='utf-8') as f:
    for test_lang in zero_shot_langs:
        
        if test_lang == src_lang:
            continue
        
        sent = f"{test_lang}:"
        print(sent)
        f.write(sent + '\n')
        
        representations_tgt_lang = create_representations(test_set_to_store[test_lang], loaded_n2v, test_lang)
        common_keys = list(set(representations_src.keys()).intersection(set(representations_tgt_lang.keys())))
        
        representations_src_temp = np.array([representations_src[v] for v in common_keys])
        representations_tgt_lang = np.array([representations_tgt_lang[v] for v in common_keys])
        
        cos_sim = cosine_similarity(representations_src_temp, representations_tgt_lang)
    
        idx = np.argsort(cos_sim, axis=1)[:,-top_k:]
        
        matches = 0
        
        for index in range(len(idx)):
            if index in idx[index]:
                matches += 1
        
        sent = f"Top {top_k} accuracy: {round(matches/len(idx), 2)}"
        
        print(sent)
        f.write(sent + '\n')
        
        print()
        f.write('\n')
        
        topk_performance[test_lang] = round(matches/len(idx), 2)

Number of zero-shot languages: 1249

aai:
Top 10 accuracy: 0.8

aak:
Top 10 accuracy: 0.75

aau:
Top 10 accuracy: 0.8

aaz:
Top 10 accuracy: 0.74

abt:
Top 10 accuracy: 0.61

abx:
Top 10 accuracy: 0.85

aby:
Top 10 accuracy: 0.62

acd:
Top 10 accuracy: 0.74

ace:
Top 10 accuracy: 0.82

acf:
Top 10 accuracy: 0.79

ach:
Top 10 accuracy: 0.86

acn:
Top 10 accuracy: 0.83

acr:
Top 10 accuracy: 0.84

acu:
Top 10 accuracy: 0.72

ade:
Top 10 accuracy: 0.8

adh:
Top 10 accuracy: 0.86

adi:
Top 10 accuracy: 0.86

adj:
Top 10 accuracy: 0.76

adl:
Top 10 accuracy: 0.81

aeb:
Top 10 accuracy: 0.87

aeu:
Top 10 accuracy: 0.79

aey:
Top 10 accuracy: 0.78

afr:
Top 10 accuracy: 0.95

agd:
Top 10 accuracy: 0.75

agg:
Top 10 accuracy: 0.76

agm:
Top 10 accuracy: 0.68

agn:
Top 10 accuracy: 0.79

agr:
Top 10 accuracy: 0.75

agt:
Top 10 accuracy: 0.67

agu:
Top 10 accuracy: 0.71

agw:
Top 10 accuracy: 0.8

ahk:
Top 10 accuracy: 0.74

aia:
Top 10 accuracy: 0.76

aii:
Top 10 accuracy: 0.91

aim:
Top 10 acc

Top 10 accuracy: 0.62

djr:
Top 10 accuracy: 0.74

dnj:
Top 10 accuracy: 0.68

dob:
Top 10 accuracy: 0.78

dop:
Top 10 accuracy: 0.79

dow:
Top 10 accuracy: 0.83

dtp:
Top 10 accuracy: 0.81

dts:
Top 10 accuracy: 0.83

due:
Top 10 accuracy: 0.78

dug:
Top 10 accuracy: 0.79

duo:
Top 10 accuracy: 0.73

dur:
Top 10 accuracy: 0.74

dwr:
Top 10 accuracy: 0.89

dww:
Top 10 accuracy: 0.79

dyi:
Top 10 accuracy: 0.78

dyo:
Top 10 accuracy: 0.87

dyu:
Top 10 accuracy: 0.81

ebk:
Top 10 accuracy: 0.82

efi:
Top 10 accuracy: 0.9

eka:
Top 10 accuracy: 0.84

ell:
Top 10 accuracy: 0.95

emp:
Top 10 accuracy: 0.74

enb:
Top 10 accuracy: 0.76

eng:
Top 10 accuracy: 0.94

enl:
Top 10 accuracy: 0.74

enm:
Top 10 accuracy: 0.91

enx:
Top 10 accuracy: 0.8

epo:
Top 10 accuracy: 0.95

eri:
Top 10 accuracy: 0.78

ese:
Top 10 accuracy: 0.58

esi:
Top 10 accuracy: 0.82

esk:
Top 10 accuracy: 0.8

est:
Top 10 accuracy: 0.93

esu:
Top 10 accuracy: 0.84

etu:
Top 10 accuracy: 0.75

eus:
Top 10 accuracy: 0.91



Top 10 accuracy: 0.66

kus:
Top 10 accuracy: 0.87

kvj:
Top 10 accuracy: 0.82

kvn:
Top 10 accuracy: 0.76

kwd:
Top 10 accuracy: 0.72

kwf:
Top 10 accuracy: 0.74

kwi:
Top 10 accuracy: 0.69

kwj:
Top 10 accuracy: 0.7

kxc:
Top 10 accuracy: 0.84

kxm:
Top 10 accuracy: 0.79

kxw:
Top 10 accuracy: 0.71

kyc:
Top 10 accuracy: 0.73

kyf:
Top 10 accuracy: 0.8

kyg:
Top 10 accuracy: 0.77

kyq:
Top 10 accuracy: 0.78

kyu:
Top 10 accuracy: 0.85

kyz:
Top 10 accuracy: 0.59

kze:
Top 10 accuracy: 0.68

lac:
Top 10 accuracy: 0.63

lai:
Top 10 accuracy: 0.88

laj:
Top 10 accuracy: 0.87

lam:
Top 10 accuracy: 0.89

lao:
Top 10 accuracy: 0.9

las:
Top 10 accuracy: 0.79

lat:
Top 10 accuracy: 0.91

lav:
Top 10 accuracy: 0.93

lbj:
Top 10 accuracy: 0.85

lbk:
Top 10 accuracy: 0.85

lcm:
Top 10 accuracy: 0.77

ldi:
Top 10 accuracy: 0.85

lee:
Top 10 accuracy: 0.78

lef:
Top 10 accuracy: 0.72

leh:
Top 10 accuracy: 0.86

lem:
Top 10 accuracy: 0.8

leu:
Top 10 accuracy: 0.8

lew:
Top 10 accuracy: 0.83

le

Top 10 accuracy: 0.92

nyf:
Top 10 accuracy: 0.85

nyn:
Top 10 accuracy: 0.89

nyo:
Top 10 accuracy: 0.88

nyy:
Top 10 accuracy: 0.87

obo:
Top 10 accuracy: 0.79

oji:
Top 10 accuracy: 0.81

ojs:
Top 10 accuracy: 0.83

okv:
Top 10 accuracy: 0.62

old:
Top 10 accuracy: 0.84

omw:
Top 10 accuracy: 0.71

ong:
Top 10 accuracy: 0.67

ons:
Top 10 accuracy: 0.77

ood:
Top 10 accuracy: 0.64

opm:
Top 10 accuracy: 0.7

ory:
Top 10 accuracy: 0.85

oss:
Top 10 accuracy: 0.9

ote:
Top 10 accuracy: 0.75

otm:
Top 10 accuracy: 0.72

otn:
Top 10 accuracy: 0.7

otq:
Top 10 accuracy: 0.78

ots:
Top 10 accuracy: 0.64

ozm:
Top 10 accuracy: 0.82

pab:
Top 10 accuracy: 0.72

pad:
Top 10 accuracy: 0.75

pag:
Top 10 accuracy: 0.94

pah:
Top 10 accuracy: 0.68

pam:
Top 10 accuracy: 0.88

pan:
Top 10 accuracy: 0.89

pao:
Top 10 accuracy: 0.49

pap:
Top 10 accuracy: 0.93

pbb:
Top 10 accuracy: 0.72

pbc:
Top 10 accuracy: 0.82

pbi:
Top 10 accuracy: 0.76

pbl:
Top 10 accuracy: 0.87

pcm:
Top 10 accuracy: 0.79



Top 10 accuracy: 0.85

wwa:
Top 10 accuracy: 0.8

xal:
Top 10 accuracy: 0.85

xav:
Top 10 accuracy: 0.64

xbr:
Top 10 accuracy: 0.88

xed:
Top 10 accuracy: 0.79

xho:
Top 10 accuracy: 0.93

xla:
Top 10 accuracy: 0.7

xon:
Top 10 accuracy: 0.74

xrb:
Top 10 accuracy: 0.69

xsi:
Top 10 accuracy: 0.73

xsm:
Top 10 accuracy: 0.75

xsu:
Top 10 accuracy: 0.59

xtd:
Top 10 accuracy: 0.71

xtm:
Top 10 accuracy: 0.69

xuo:
Top 10 accuracy: 0.74

yaa:
Top 10 accuracy: 0.6

yad:
Top 10 accuracy: 0.76

yal:
Top 10 accuracy: 0.86

yam:
Top 10 accuracy: 0.73

yan:
Top 10 accuracy: 0.75

yaq:
Top 10 accuracy: 0.75

yby:
Top 10 accuracy: 0.71

ycn:
Top 10 accuracy: 0.65

yle:
Top 10 accuracy: 0.56

yli:
Top 10 accuracy: 0.76

yml:
Top 10 accuracy: 0.76

yon:
Top 10 accuracy: 0.83

yor:
Top 10 accuracy: 0.9

yrb:
Top 10 accuracy: 0.65

yre:
Top 10 accuracy: 0.7

yss:
Top 10 accuracy: 0.62

yua:
Top 10 accuracy: 0.89

yuj:
Top 10 accuracy: 0.76

yut:
Top 10 accuracy: 0.73

yuw:
Top 10 accuracy: 0.73

yu

In [19]:
perf_sum = 0
for _, perf in topk_performance.items():
    perf_sum += perf

print(f"The average top {top_k} accuracies over {len(topk_performance)} languages is {perf_sum/len(topk_performance)}")

The average top 10 accuracies over 1249 languages is 0.7805604483586873


In [12]:
# updated (200-dim)
# using arb:

# top 1 average score: 0.5595036028823052
# top 5 average score: 0.7230184147317847
# top 10 average score: 0.779295436349079

# using zho:

# top 1 average score: 0.5988070456365081
# top 5 average score: 0.7659567654123296
# top 10 average score: 0.8185028022417912

# using rus:

# top 1 average score: 0.5516413130504414
# top 5 average score: 0.7217694155324248
# top 10 average score: 0.7805604483586873

In [13]:
# updated (200-dim, minlang=1)
# top 1 average score: 0.5100240192153725
# top 5 average score: 0.6761008807045635
# top 10 average score: 0.7369575660528423

# updated (200-dim, minlang=5)
# top 1 average score: 0.584331465172137
# top 5 average score: 0.7419535628502795
# top 10 average score: 0.7949879903923136

# updated (200-dim, minlang=10)
# top 1 average score: 0.5945236188951162
# top 5 average score: 0.7525700560448345
# top 10 average score: 0.805252201761409

# updated (200-dim, minlang=20)
# top 1 average score: 0.6254603682946358
# top 5 average score: 0.7776220976781423
# top 10 average score: 0.8265252201761396

# updated (200-dim, minlang=50)
# top 1 average score: 0.6546036829463568
# top 5 average score: 0.7997758206565244
# top 10 average score: 0.8447798238590857

# updated (200-dim, minlang=100)
# top 1 average score: 0.6625620496397109
# top 5 average score: 0.8069415532425938
# top 10 average score: 0.8505364291433135