In [1]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
w2v = KeyedVectors.load_word2vec_format('../Datasets/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [5]:
# w2v['rupee']

In [4]:
v1 = w2v["survey"] 
v2 = w2v["generation"]

In [5]:
print(v1.shape)
print(v2.shape)

(300,)
(300,)


In [6]:
cosine_similarity([v1],[v2])

array([[0.10165814]], dtype=float32)

In [13]:
w2v.most_similar('people', topn=10)

[('peole', 0.6058608293533325),
 ('poeple', 0.5907129049301147),
 ('individuals', 0.5827618837356567),
 ('folks', 0.5794459581375122),
 ('peple', 0.578874409198761),
 ('peo_ple', 0.5768002271652222),
 ('peope', 0.5763059854507446),
 ('citizens', 0.5653228759765625),
 ('Americans', 0.562726616859436),
 ('People', 0.562098503112793)]

In [14]:
def find_odd_one_out(words):    

    word_vectors = [w2v[word] for word in words]
    mean_vector = np.mean(word_vectors, axis=0)
    print(mean_vector.shape)
    
    odd_one = None
    min_similarity = 1
    
    for word in words:
        sim = cosine_similarity( [w2v[word]], [mean_vector] )
        if sim < min_similarity:
            min_similarity = sim
            odd_one = word
    
        print("Similarity between mean vector and {} = {}".format(word, min_similarity))
            
    return odd_one

In [15]:
find_odd_one_out(["Apple", "Google", "Facebook", "Microsoft", "Zara"])

(300,)
Similarity between mean vector and Apple = [[0.75132143]]
Similarity between mean vector and Google = [[0.75132143]]
Similarity between mean vector and Facebook = [[0.6587838]]
Similarity between mean vector and Microsoft = [[0.6587838]]
Similarity between mean vector and Zara = [[0.44133237]]


'Zara'

In [16]:
len(w2v.vocab)

3000000

In [17]:
w2v["man"].shape

(300,)

In [18]:
# w2v.vocab.keys()

In [19]:
# Bad Implementation. Exhaustive Search
def find_analogy(words):
    # a:b :: c:?
    
    max_similarity = -1.01
    
    d = None
    
    vocabulary = w2v.vocab.keys()
    
    v_a, v_b, v_c = [w2v[w] for w in words]
    
    relation_1 = v_b - v_a
    
    # To find word d such that similarity(|v_b - v_a|, |v_d - v|c|) is maximum
    
    for word in vocabulary:
        if word in words:
            continue
        
        v_d = w2v[word]
        relation_2 = v_d - v_c
        sim = cosine_similarity([relation_1], [relation_2])
        
        if sim > max_similarity:
            max_similarity = sim
            d = word
            
    return d

In [20]:
from datetime import datetime
t0 = datetime.now()

print(find_analogy(["man", "woman", "king"]))

print("Time Taken:", datetime.now() - t0)

queen
Time Taken: 0:10:28.417208


In [164]:
# Using the built-in method
t0 = datetime.now()
print(w2v.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
print("Time Taken:", datetime.now() - t0)

[('queen', 0.7118192911148071)]
Time Taken: 0:00:01.558389


In [40]:
import re
f = open('../datasets/sherlock.txt')
text = f.read()
f.close()

data = text[3433:]

# data = data.lower()
data = re.sub('[^A-Za-z.]+', ' ', data)
data = data.split('.')

for i in range(len(data)):
    data[i] = data[i].split()

In [41]:
from gensim.models import Word2Vec
model = Word2Vec(data, size=300, window=10, min_count=1)

In [45]:
model.wv.most_similar('Watson')

[('rates', 0.9255977869033813),
 ('ruling', 0.9158950448036194),
 ('severely', 0.9104814529418945),
 ('Do', 0.9080990552902222),
 ('sir', 0.9064979553222656),
 ('mister', 0.9061363935470581),
 ('Mr', 0.9046266078948975),
 ('God', 0.9039232730865479),
 ('fret', 0.9032679200172424),
 ('Step', 0.8996236324310303)]

In [168]:
model.wv.save_word2vec_format("sherlock_w2v.bin")