In [3]:
import spacy

# Word vectors take up a lot of space, so the small model (en_core_web_sm)
# does not include them. To get word vectors you need to install the medium
# or large English model; this notebook uses the large one.
# Make sure you have run "python -m spacy download en_core_web_lg" beforehand
# so that the large English model is installed before it is loaded here.
nlp = spacy.load("en_core_web_lg")

In [2]:
# One-time setup: download the large English model used by
# spacy.load("en_core_web_lg").
# NOTE(review): this cell sits below the cell that loads the model, so on a
# fresh kernel it has to be run before that load cell.
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.6.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
# For each word, report whether the model has a vector for it and whether it
# is out-of-vocabulary (OOV).
doc = nlp("dog cat banana kem")

for tok in doc:
    print(f"{tok.text} Vector: {tok.has_vector} OOV: {tok.is_oov}")

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
kem Vector: False OOV: True


In [5]:
# Run the pipeline on a single word. Calling nlp(...) yields a Doc, so despite
# its name `base_token` is a one-word Doc rather than a Token.
base_token = nlp("bread")
# Last expression of the cell: displays the shape of the vector, (300,).
base_token.vector.shape

(300,)

In [6]:
# Score every word in the list against the reference word held in base_token.
doc = nlp("bread sandwich burger car tiger human wheat")

for tok in doc:
    score = tok.similarity(base_token)
    print(f"{tok.text} <-> {base_token.text}:", score)

bread <-> bread: 0.9999999744752309
sandwich <-> bread: 0.6341067010130894
burger <-> bread: 0.47520687769584247
car <-> bread: 0.06451532596945217
tiger <-> bread: 0.04764611272488976
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.615036141030184


In [7]:
def print_similarity(base_word, words_to_compare):
    """Print how similar each token of a text is to a reference word.

    Both arguments are run through the module-level ``nlp`` pipeline. One line
    is printed per token of ``words_to_compare`` in the form
    ``<token> <-> <base>:  <score>``.

    Args:
        base_word: Text used as the reference for every comparison.
        words_to_compare: Text whose tokens are each compared to the reference.

    Returns:
        None; the scores are only printed.
    """
    reference = nlp(base_word)
    for candidate in nlp(words_to_compare):
        score = candidate.similarity(reference)
        print(f"{candidate.text} <-> {reference.text}: ", score)

In [18]:
# Compare every token of a full sentence against the word "communication".
# The sentence is not filtered, so stop words ("is", "a", "and") and the final
# "." are scored as well.
print_similarity("communication", "Language is a bridge that connects humans and enhances understanding.")


Language <-> communication:  0.49262584751013627
is <-> communication:  0.2807799136363269
a <-> communication:  0.28392002278808687
bridge <-> communication:  0.28055051554804294
that <-> communication:  0.5221810231689782
connects <-> communication:  0.542629176122194
humans <-> communication:  0.37781693676233474
and <-> communication:  0.6639839397288615
enhances <-> communication:  0.5216576960211894
understanding <-> communication:  0.747913369375598
. <-> communication:  0.29203578065530134


In [9]:
# Fetch the vocabulary vectors for the four words of the analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic: king - man + woman, to be compared against "queen".
result = king - man + woman

In [11]:
# NOTE(review): this import lives mid-notebook; conventionally it would sit in
# the first cell with the other imports.
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is wrapped in a list so it is passed as a single-row 2D input;
# the result is a 1x1 array holding the similarity of `result` and `queen`.
cosine_similarity([result], [queen])

array([[0.6178015]], dtype=float32)