In [1]:
import spacy
nlp = spacy.load('en_core_web_lg')  

In [2]:
nlp(u'lion').vector

array([  1.2746  ,   0.46242 ,  -1.1829  ,  -5.2661  ,  -2.7128  ,
         1.8521  ,  -0.94273 ,   2.1865  ,   6.503   ,   0.6704  ,
         1.5361  ,   2.5992  ,  -0.36233 ,   4.3965  ,  -6.5644  ,
         1.6141  ,  -1.2897  ,   2.1184  ,  -0.63654 ,  -3.4572  ,
        -4.3771  ,   4.2074  ,  -3.6411  ,  -0.97214 ,   1.3253  ,
        -2.3125  ,  -3.6531  ,  -2.8398  ,   2.7913  ,  -1.53    ,
        -2.9984  ,  -2.6357  ,   0.50615 ,  -2.6925  ,   4.3401  ,
        -5.6017  ,   0.045691,   4.3832  ,  -0.19535 ,  -1.0751  ,
         0.32172 ,   2.4395  ,   4.6638  ,   3.4471  ,  -3.3847  ,
        -1.8238  ,   0.70212 ,   0.58557 ,   5.0032  ,  -3.1072  ,
         1.2364  ,   7.4595  ,   0.057368,   1.0111  ,  -1.0827  ,
         0.69113 ,   2.8009  ,  -3.4383  ,  -1.0599  ,  -2.2627  ,
        -5.149   ,  -5.0636  ,   3.1405  ,   1.0793  ,  -0.72892 ,
        -3.9939  ,  -0.69551 ,  -0.55767 ,   3.2555  ,  -2.9449  ,
         4.7114  ,   1.6388  ,   1.3828  ,   1.4255  ,  -3.233

What's interesting is that Doc and Span objects themselves have vectors, derived from the averages of individual token vectors. <br>This makes it possible to compare similarities between whole documents.

In [3]:
doc = nlp(u'I really love mangoes and apple products')

doc.vector

array([-0.64032286, -2.082937  , -3.203207  , -1.6169443 , -0.40962008,
       -0.9783429 ,  0.8656715 ,  3.6665127 , -4.72105   ,  1.6279743 ,
        5.327057  ,  0.8303994 , -3.6414168 ,  0.2078429 ,  3.3106844 ,
       -2.0268972 ,  1.8166057 , -2.8183496 , -2.0398142 , -2.5937572 ,
        2.0000157 ,  2.6643143 , -1.2932256 , -3.557186  , -0.69740576,
       -2.8837273 , -1.5617572 ,  1.0545056 , -1.5280263 ,  0.3444    ,
        1.1809614 , -2.7802346 , -1.1542844 , -0.04495568,  0.6391516 ,
       -1.1982458 ,  0.05180485,  1.51052   ,  1.8687885 , -0.23269026,
        0.01442422,  2.0931156 ,  1.1818829 , -0.53122437,  2.1103702 ,
        3.2447371 , -0.72176445, -3.2786944 ,  0.62256855,  1.1671041 ,
        0.9361643 ,  0.06186284,  0.5581914 , -4.5202856 , -1.2642289 ,
       -0.4799177 , -0.5918317 ,  1.6091173 ,  2.5329716 , -0.17789118,
        4.200649  , -0.03903861, -0.35921577, -1.0349535 , -0.24234438,
        0.61263573, -2.4287713 , -3.299747  ,  1.6252729 ,  1.57

In [7]:
tokens = nlp(u'apple mango fruits')

for token1 in tokens:
    for token2 in tokens:
        if token1 != token2:
         print(token1.text, token2.text, token1.similarity(token2))

apple mango 0.6305075883865356
apple fruits 0.5867226123809814
mango apple 0.6305075883865356
mango fruits 0.6582745909690857
fruits apple 0.5867226123809814
fruits mango 0.6582745909690857


In [7]:
# Create a three-token Doc object:
tokens = nlp(u'like love hate')

# Iterate through token combinations:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

like like 1.0
like love 0.657904
like hate 0.657465
love like 0.657904
love love 1.0
love hate 0.63931
hate like 0.657465
hate love 0.63931
hate hate 1.0


In [8]:
from scipy import spatial

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

['king', 'and', 'that', 'where', 'she', 'they', 'woman', 'there', 'should', 'these']
