In [1]:
import spacy

In [2]:
# Load the 'en_core_web_md' spaCy model; later cells build documents and read the vocab through `nlp`.
nlp = spacy.load('en_core_web_md')

In [3]:
# Build a one-word document and display its vector as the cell output.
axa_doc = nlp('AXA')
axa_doc.vector

array([-0.73734  , -0.21723  , -0.0057086, -0.70546  ,  0.91892  ,
        0.11962  ,  0.026047 , -0.20541  , -0.24512  , -0.82177  ,
       -0.34003  , -0.20455  , -0.34613  , -0.015079 ,  0.29575  ,
        0.22098  , -0.53345  , -0.48305  ,  0.61992  , -0.29829  ,
        0.18393  , -0.050819 ,  0.055808 , -0.32073  ,  0.34512  ,
        0.37406  , -0.0020645,  0.25965  ,  0.20038  ,  1.0377   ,
        0.16002  ,  0.0087781,  0.38864  ,  0.44497  , -0.1494   ,
       -0.041039 ,  0.092425 , -0.33187  , -0.3988   ,  0.36484  ,
        0.43804  ,  0.13806  , -0.12814  , -0.22111  ,  0.24563  ,
       -0.39295  ,  0.40334  ,  0.29363  , -0.59938  , -0.31034  ,
        0.57062  ,  0.19473  ,  0.40447  , -0.23618  , -0.51966  ,
        0.024898 , -0.17556  ,  0.43054  , -0.33062  , -0.38777  ,
       -0.39376  ,  0.33062  ,  0.25948  , -0.5017   ,  0.2642   ,
       -0.15745  ,  0.87683  ,  0.96322  , -0.034149 ,  0.15019  ,
       -0.32473  ,  0.24747  ,  0.34065  ,  0.13586  ,  0.2602

In [4]:
# Shape of the vector for a single-word document.
nlp('AXA').vector.shape

(300,)

In [5]:
# Shape of the vector for a full-sentence document.
nlp('AXA is the biggest insurance company in the world.').vector.shape

(300,)

Each of these vectors has 300 dimensions. For the single word **'AXA'**, the result is that word's own 300-dimensional vector. For the document **'AXA is the biggest insurance company in the world.'**, the result is the average of the vectors of all the words in that document, which is again 300-dimensional.


#### Finding Similarity between words

In [8]:
# Put three related nouns in one document so they can be compared token by token.
tokens = nlp('Car Bike Vehicle')

In [9]:
# Compare every ordered pair of tokens (including each token with itself)
# and print both texts followed by their similarity score.
for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(left.text, right.text, score)

Car Car 1.0
Car Bike 0.53577304
Car Vehicle 0.7667538
Bike Car 0.53577304
Bike Bike 1.0
Bike Vehicle 0.44815367
Vehicle Car 0.7667538
Vehicle Bike 0.44815367
Vehicle Vehicle 1.0


In [10]:
# A second document whose tokens are three sentiment-related words.
tokens2 = nlp('like love hate')

In [11]:
# Print the similarity score for every ordered pair of tokens in tokens2,
# including each token paired with itself.
for left in tokens2:
    for right in tokens2:
        score = left.similarity(right)
        print(left.text, right.text, score)

like like 1.0
like love 0.657904
like hate 0.65746516
love like 0.657904
love love 1.0
love hate 0.63930994
hate like 0.65746516
hate love 0.63930994
hate hate 1.0


In [12]:
# Check the size of the model's vector table: (number of rows, dimensions per vector)
nlp.vocab.vectors.shape

(20000, 300)

The medium model's vector table has 20000 entries, each with 300 dimensions. The large model has 684831 words in its vocab.

In [13]:
# Two common nouns plus a personal name, to see how each token is reported.
tokens = nlp('dog cat Yogesh')

In [14]:
# For each token, print its text, whether it has a vector, the vector norm,
# and the out-of-vocabulary flag.
for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
Yogesh False 0.0 True


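For each token this prints `token.text`, `token.has_vector` (whether the token has a vector), `token.vector_norm` (the norm of that vector) and `token.is_oov` (whether the token is out of vocabulary). For 'Yogesh' the values are `False`, `0.0` and `True`: the model has no vector for it.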

In [15]:
# Repeat the per-token report with a different personal name in the third slot.
tokens = nlp('dog cat Victor')

for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
Victor True 5.7749805 False


In [16]:
from scipy import spatial

In [17]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    ``spatial.distance.cosine`` computes the cosine *distance*, so the
    result is subtracted from 1 to turn it back into a similarity.

    Parameters
    ----------
    vec1, vec2 : array-like
        The two vectors to compare; passed straight to SciPy.

    Returns
    -------
    float
        ``1 - spatial.distance.cosine(vec1, vec2)``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [19]:
# Fetch the vocab vectors for the three words used in the analogy.
# NOTE(review): 'King' is capitalized while 'man' and 'woman' are lowercase, and the
# candidate loop further down only keeps lowercase words — confirm the capital K is intended.
king, man, woman = (nlp.vocab[entry].vector for entry in ('King', 'man', 'woman'))

In [20]:
#king - man + woman ----> New_vector similar to Queen, princess or highness

In [21]:
# Analogy arithmetic: start from the king vector, subtract man, then add woman.
new_vector = king - man + woman

In [22]:
# Collect (vocab entry, similarity to new_vector) pairs for the candidate words.
computed_similarities = []

for word in nlp.vocab:
    # Only lowercase, purely alphabetic entries that actually have a vector qualify.
    if not (word.has_vector and word.is_lower and word.is_alpha):
        continue
    similarity = cosine_similarity(new_vector, word.vector)
    computed_similarities.append((word, similarity))
    

In [23]:
# Rank candidates best-first: negating the score turns the ascending sort into a descending one.
computed_similarities = sorted(
    computed_similarities,
    key=lambda scored: -scored[1],
)

In [24]:
# Show the text of the ten highest-ranked vocabulary entries.
top_matches = computed_similarities[:10]
print([lexeme.text for lexeme, _score in top_matches])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']
