# Word Embeddings

### Execute this cell to install the required Python packages and the spaCy model

After you've installed this once, you can delete this cell.

In [None]:
!pip install gensim
!python -m spacy download en_core_web_lg

# Word Embeddings using Word2Vec
Download and unzip the Text8 corpus from http://mattmahoney.net/dc/text8.zip, then set the `path_to_text8` variable below to the location of the unzipped `text8` file.

In [None]:
import os

from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus

# Location of the unzipped Text8 file. "~" is expanded here so the path does not
# depend on the file opener understanding it.
path_to_text8 = os.path.expanduser('~/Downloads/text8')

# min_count=150 keeps only words that occur at least 150 times, which keeps the
# vocabulary small enough for doing a visualization later.
# The embedding dimensionality is left at gensim's default of 100 (the value this
# notebook has always used). The keyword that sets it is `size` in gensim 3.x but
# `vector_size` in gensim 4.x, so passing either name fails on the other series.
w2v_model2 = Word2Vec(Text8Corpus(path_to_text8), window=5, min_count=150, workers=4)

### What terms are most similar to `paris`?

In [None]:
# Ten nearest neighbours of "paris" in the trained embedding space
# (ten is most_similar's default, spelled out here).
w2v_model2.wv.most_similar(positive=['paris'], topn=10)

### What terms are closest to `woman` and `king`, but not `man`?

In [None]:
# Analogy query: add the vectors for "woman" and "king", subtract "man",
# and show the three closest words to the result.
w2v_model2.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=3,
)

### What terms are closest to `girl` and `father`, but not `boy`?

In [None]:
# Analogy query: add the vectors for "girl" and "father", subtract "boy",
# and show the three closest words to the result.
w2v_model2.wv.most_similar(
    positive=['girl', 'father'],
    negative=['boy'],
    topn=3,
)

# Calculating Cosine Similarity using spaCy's pre-trained word embeddings

In [None]:
import spacy
from scipy import spatial

nlp = spacy.load("en_core_web_lg")


def cosine_similarity(x, y):
    """Return the cosine similarity of x and y (1 minus scipy's cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector

# We now need to find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
maybe_king = man - woman + queen
computed_similarities = []

# Walk the keys of the vector table instead of `nlp.vocab` itself. Since spaCy 2.3
# the lexemes of models with vectors are no longer loaded up front, so iterating
# `nlp.vocab` only yields the few entries that have already been looked up.
for key in nlp.vocab.vectors.keys():
    word = nlp.vocab[key]

    # Ignore words without vectors
    if not word.has_vector:
        continue

    similarity = cosine_similarity(maybe_king, word.vector)
    computed_similarities.append((word, similarity))

# Highest similarity first; show the ten best matches.
computed_similarities = sorted(computed_similarities, key=lambda item: item[1], reverse=True)
print([w[0].text for w in computed_similarities[:10]])

### Word-level Similarities

In [None]:
# Look up the four vocabulary entries that are compared below.
dog, banana = nlp.vocab['dog'], nlp.vocab['banana']
animal, fruit = nlp.vocab['animal'], nlp.vocab['fruit']

# One printed line per word: first "dog" against animal then fruit,
# then "banana" against fruit then animal.
print(*(dog.similarity(other) for other in (animal, fruit)))
print(*(banana.similarity(other) for other in (fruit, animal)))

### Document-level Similarities

In [None]:
# Reference sentence that every other document is compared against.
target = nlp("Cats are beautiful animals.")

# Three candidate sentences to score against the reference.
doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

# Print the similarity of each candidate to the reference, one per line.
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))