In [1]:
import io
import numpy as np
from gensim.models import word2vec
from gensim.models import KeyedVectors
import gensim.downloader as api

In [2]:
# Load the two embedding models used throughout the notebook. Both paths are
# relative, so they resolve against the kernel's current working directory.
# `kaw2v` keeps only the `.wv` attribute of the saved Word2Vec model.
kaw2v = word2vec.Word2Vec.load('../resources/word2vec/w2vemb100wind4.model').wv
# `enw2v` is read from a word2vec-format file.
# NOTE(review): the path looks like a local copy of the "glove-twitter-100"
# download — confirm the file is present before running.
enw2v = KeyedVectors.load_word2vec_format('glove-twitter-100/glove-twitter-100.gz')

In [3]:
def get_vecs(model):
    """Extract a dense embedding matrix and word/index maps from a vector model.

    Parameters
    ----------
    model : object
        A keyed-vectors style object that supports ``model[word]`` and exposes
        its vocabulary either as ``key_to_index`` (gensim >= 4) or as ``vocab``
        (gensim < 4). Only the keys of that mapping are used.

    Returns
    -------
    embeddings : numpy.ndarray
        One row per word, stacked in vocabulary iteration order.
    word2id : dict
        Maps each word to its row index in ``embeddings``.
    id2word : dict
        Maps each row index in ``embeddings`` back to its word.

    Raises
    ------
    AttributeError
        If the model exposes neither ``key_to_index`` nor ``vocab``.
    ValueError
        If the vocabulary is empty (``numpy.vstack`` needs at least one row).
    """
    # gensim 4 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
    # and replaced it with `key_to_index`; fall back to `vocab` for older models.
    vocab_map = getattr(model, 'key_to_index', None)
    if vocab_map is None:
        vocab_map = model.vocab

    vocab = list(vocab_map)
    embeddings = np.vstack([model[word] for word in vocab])
    id2word = dict(enumerate(vocab))
    word2id = {word: ind for ind, word in id2word.items()}
    return embeddings, word2id, id2word

# Materialise, for each loaded model, the stacked embedding matrix together
# with its word -> row-index and row-index -> word dictionaries.
kavecs, kaword2id, kaid2word = get_vecs(kaw2v)
envecs, enword2id, enid2word = get_vecs(enw2v)

In [4]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    """Print the K target words whose vectors are most cosine-similar to `word`.

    Parameters
    ----------
    word : hashable
        Query word; must be one of the values of ``src_id2word``.
    src_emb : numpy.ndarray
        Source embedding matrix; row ``i`` is the vector of ``src_id2word[i]``.
    src_id2word : dict
        Row index -> word for ``src_emb``.
    tgt_emb : numpy.ndarray
        Target embedding matrix that is searched; its rows must have the same
        length as the rows of ``src_emb``.
    tgt_id2word : dict
        Row index -> word for ``tgt_emb``.
    K : int, optional
        Number of neighbours to print. Values <= 0 print no neighbours; values
        larger than the target vocabulary print every target word.

    Returns
    -------
    None
        Results are written to stdout: a header line, the shape of the score
        vector, then one ``score - word`` line per neighbour, best first.

    Raises
    ------
    KeyError
        If ``word`` is not among the values of ``src_id2word``.
    """
    print("Nearest neighbors of \"%s\":" % word)
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]

    # Cosine similarity = dot / (|t| * |q|). Dividing the dot products instead
    # of normalising the matrix first avoids a full temporary copy of tgt_emb.
    denom = np.linalg.norm(tgt_emb, 2, 1) * np.linalg.norm(word_emb)
    # A zero-norm vector would give 0/0 = NaN, and argsort puts NaN last, so it
    # would be reported as the best match. Its dot product is 0, so dividing
    # by 1 instead yields a score of 0.
    scores = tgt_emb.dot(word_emb) / np.where(denom > 0, denom, 1.0)

    # Reverse-then-truncate is the same selection as [-K:][::-1] for K >= 1,
    # but unlike [-0:] it yields nothing (rather than everything) for K == 0.
    k_best = scores.argsort()[::-1][:max(K, 0)]

    # Diagnostic line kept so the printed output format stays unchanged.
    print(scores.shape)
    for idx in k_best:
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))


In [5]:
# Monolingual sanity check: query the `ka` embeddings against themselves, so
# the query word itself is expected as the top hit with a score of 1.0.
get_nn('კატა', kavecs, kaid2word, kavecs, kaid2word)

Nearest neighbors of "კატა":
(90203,)
1.0000 - კატა
0.8192 - კურდღელი
0.8132 - სამურავი
0.8048 - გომბეშო
0.8048 - ლეოპარდი


In [9]:
# Cross-model query: take the word's vector from `kavecs` and rank every row of
# `envecs` against it.
# NOTE(review): kaw2v and enw2v are loaded as two independent models and no
# step mapping them into a shared vector space is visible in this notebook, so
# these cosine scores may not be meaningful — confirm an alignment is applied
# before interpreting the neighbours.
get_nn('წიგნი', kavecs, kaid2word, envecs, enid2word)

Nearest neighbors of "წიგნი":
(1193514,)
0.4741 - corregido
0.4545 - gigas
0.4514 - bajado
0.4444 - apagones
0.4443 - bbmu
