# Imports

In [None]:
%pip install gensim

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# SpaCy OntoNotes embedding

In [None]:
# Sentence holding the four words of the analogy; token positions are
# 0 = king, 3 = queen, 5 = man, -1 = woman.
sentence = "king is to queen as man is to woman"
# Load the large English spaCy pipeline.
nlp = spacy.load("en_core_web_lg")
# Run the pipeline on the sentence; each token of the result is later
# read through its `.vector` attribute.
embedding = nlp(sentence)

In [None]:
# Word-analogy arithmetic: (king - man) + woman, which should land near "queen".
# Tokens are picked by position in the sentence: 0 = king, 5 = man, -1 = woman.
oop_word_embed = (
    (embedding[0].vector - embedding[5].vector)
    + embedding[-1].vector
)
# Show the dimensionality of the resulting vector.
oop_word_embed.shape

In [None]:
# Matrix with one 300-dimensional row per plotted point, in the order
# king, man, woman, queen, (king - man) + woman.
arr_embedding = np.zeros((5, 300))
# Fill all rows at once; values are cast into the float64 buffer created above.
arr_embedding[:] = [
    embedding[0].vector,   # king
    embedding[5].vector,   # man
    embedding[-1].vector,  # woman
    embedding[3].vector,   # queen
    oop_word_embed,        # (king - man) + woman
]

In [None]:
# Point labels for the plots, in the same row order as arr_embedding.
# The last entry is the analogy result, not a vocabulary word.
words = ["king", "man", "woman", "queen", "(king - man) + woman"]

In [None]:
# 2D view: reduce the embedding matrix to two PCA components and plot it.
pca = PCA(n_components=2)
embed_2d = pca.fit_transform(arr_embedding)

plt.figure(figsize=(9, 6))
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])
# Label every point with the word (or expression) it stands for.
for label, point in zip(words, embed_2d):
    plt.annotate(label, xy=(point[0], point[1]))
plt.title("SpaCy OntoNotes Model")
plt.xlabel("dimension 1", fontsize=16)
plt.ylabel("dimension 2", fontsize=16)
plt.show()

In [None]:
# visualize in 3d
%matplotlib notebook
# 3D view: reduce the embedding matrix to three PCA components.
pca = PCA(n_components=3)
embed_3d = pca.fit_transform(arr_embedding)

fig = plt.figure(figsize=(9, 6))
# Passing keyword arguments to Figure.gca() was deprecated in Matplotlib 3.4
# and raises a TypeError from 3.6 on; add_subplot() creates the 3D axes on
# both old and new releases.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(embed_3d[:, 0], embed_3d[:, 1], embed_3d[:, 2])
# Label every point with the word (or expression) it stands for.
for i, w in enumerate(words):
    ax.text(embed_3d[i, 0], embed_3d[i, 1], embed_3d[i, 2], w)
ax.set_xlabel("dimension 1", fontsize=16)
ax.set_ylabel("dimension 2", fontsize=16)
ax.set_zlabel("dimension 3", fontsize=16)
ax.set_title("SpaCy OntoNotes Model")
plt.show()


# Google's word2vec embedding

In [None]:
# Load the GoogleNews word2vec vectors stored in binary word2vec format.
# The path is relative, so the .bin file must be in the current working directory.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Analogy query: add "woman" and "king", subtract "man", and ask for the
# single most similar entry.
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)

In [None]:
# Look up the word2vec vector of each plain word. The last entry of `words`
# is the analogy expression, not a vocabulary word, so it is skipped.
google_embedding = np.array(
    [model[token] for token in words[:-1]]
)
# Show the shape of the stacked matrix.
google_embedding.shape

In [None]:
# 2D view: reduce the word2vec vectors to two PCA components and plot them.
pca = PCA(n_components=2)
embed_2d = pca.fit_transform(google_embedding)

plt.figure(figsize=(9, 6))
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])
# Label every point; the trailing analogy label has no row in this matrix.
for label, point in zip(words[:-1], embed_2d):
    plt.annotate(label, xy=(point[0], point[1]))
plt.title("GoogleNews Word2Vec Model")
plt.xlabel("dimension 1", fontsize=16)
plt.ylabel("dimension 2", fontsize=16)
plt.show()