In [4]:
from gensim.models import Word2Vec

In [5]:
model = Word2Vec.load('word2vec_model.model')

In [3]:
model.wv.most_similar('computer', topn=10)

[('computers', 0.6619974374771118),
 ('software', 0.5994142889976501),
 ('digital', 0.47143417596817017),
 ('minicomputers', 0.44772008061408997),
 ('machines', 0.4456964433193207),
 ('supercomputer', 0.4447154998779297),
 ('toy', 0.444561630487442),
 ('ibm', 0.43565645813941956),
 ('equipment', 0.4339454472064972),
 ('electronic', 0.4326600134372711)]

In [21]:
word_vectors = model.wv

# saving word vectors in KeyVector format
word_vectors.save('word_vectors.bin')

In [24]:
import numpy as np

np_word_vectors = model.wv.vectors
np.save('np_word_vectors.npy', np_word_vectors)

In [22]:
computer_vector = word_vectors.get_vector('computer')
supercomputer_vector = word_vectors.get_vector('supercomputer')

<class 'gensim.models.keyedvectors.KeyedVectors'>


In [19]:
cos_sim = np.dot(computer_vector, supercomputer_vector) / (np.linalg.norm(computer_vector) * np.linalg.norm(supercomputer_vector))
print(cos_sim)

0.44471553


using tsne, in high dimensional cos similarity becomes problematic. perhaps $\text{dim} = 3$ would be better.

In [27]:
from sklearn.manifold import TSNE

word_vectors_reduced = TSNE(n_components=3,
                            verbose = 2).fit_transform(np_word_vectors)


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 7393 samples in 0.001s...
[t-SNE] Computed neighbors for 7393 samples in 0.293s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7393
[t-SNE] Computed conditional probabilities for sample 2000 / 7393
[t-SNE] Computed conditional probabilities for sample 3000 / 7393
[t-SNE] Computed conditional probabilities for sample 4000 / 7393
[t-SNE] Computed conditional probabilities for sample 5000 / 7393
[t-SNE] Computed conditional probabilities for sample 6000 / 7393
[t-SNE] Computed conditional probabilities for sample 7000 / 7393
[t-SNE] Computed conditional probabilities for sample 7393 / 7393
[t-SNE] Mean sigma: 7.943408
[t-SNE] Computed conditional probabilities in 0.102s
[t-SNE] Iteration 50: error = 138.7195740, gradient norm = 0.1113852 (50 iterations in 4.987s)
[t-SNE] Iteration 100: error = 169.5654755, gradient norm = 0.0215915 (50 iterations in 3.936s)
[t-SNE] Iteration 150: error = 169.5790863, gradient norm 

In [29]:
print(word_vectors_reduced.shape)

np.save('word_vectors_reduced.npy', word_vectors_reduced)

(7393, 3)


In [21]:
supercomputer_idx = word_vectors.get_index('supercomputer')
computer_idx = word_vectors.get_index('computer')

print(supercomputer_idx)
print(computer_idx)

4728
222


In [35]:
print(f"Cosine Similarity between supercomputer and computer: {np.dot(word_vectors_reduced[supercomputer_idx], word_vectors_reduced[computer_idx]) / (np.linalg.norm(word_vectors_reduced[supercomputer_idx]) * np.linalg.norm(word_vectors_reduced[computer_idx]))}")

Cosine Similarity between supercomputer and computer: 0.9849855899810791


as conjectured prior, cosine similarity performs well in lower $\mathbb{R}^D$ spaces.

In [36]:
emotional_idx = word_vectors.get_index('emotional')

print(f"Cosine Similarity between supercomputer and emotional: {np.dot(word_vectors_reduced[supercomputer_idx], word_vectors_reduced[emotional_idx]) / (np.linalg.norm(word_vectors_reduced[supercomputer_idx]) * np.linalg.norm(word_vectors_reduced[emotional_idx]))}")

Cosine Similarity between supercomputer and emotional: -0.7637559175491333


not similar at all!

In [25]:
toshiba_idx = word_vectors.get_index('toshiba')

print(f"Cosine Similarity between supercomputer and toshiba: {np.dot(word_vectors_reduced[supercomputer_idx], word_vectors_reduced[toshiba_idx]) / (np.linalg.norm(word_vectors_reduced[supercomputer_idx]) * np.linalg.norm(word_vectors_reduced[toshiba_idx]))}")

Cosine Similarity between supercomputer and toshiba: 0.999747633934021


i shall visualize using projector.tensorflow.org

In [13]:
import numpy as np

word_vectors_reduced = np.load('word_vectors_reduced.npy')


In [16]:
np.savetxt('word_vectors.tsv', word_vectors_reduced, delimiter='\t', fmt = '%.16f')

In [20]:
keys = list(word_vectors.key_to_index.keys())

with open('embeds/word_vector_keys.tsv', 'w') as f:
    
    for word in keys:
        
        f.write(f"{word}\n")