Reference: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

# Imports

In [None]:
# IPython magic: install the gensim package for use by this notebook.
%pip install gensim

In [None]:
import numpy as np
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.corpus import gutenberg
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Fetch the NLTK 'gutenberg' corpus package that is read through
# `nltk.corpus.gutenberg` further down.
nltk.download(info_or_id='gutenberg')

In [None]:
# Sentence-tokenizer data used when the corpus is split into sentences.
# Recent NLTK releases (3.8.2 / 3.9 onwards) load the Punkt model from the
# 'punkt_tab' package, while older releases read the pickled 'punkt'
# package. NLTK is not pinned here, so fetch both; a package the installed
# version does not use is simply ignored.
for punkt_package in ('punkt', 'punkt_tab'):
    nltk.download(punkt_package)

# Load & process sentence data

In [None]:
# Load the tokenised sentences of one Gutenberg text.
# The text is selected by position: the last id returned by `fileids()`
# (presumably Whitman's "Leaves of Grass", going by the variable name --
# confirm against the installed corpus).
gutenberg_fileids = gutenberg.fileids()
whitman_fileid = gutenberg_fileids[-1]
sentences = gutenberg.sents(fileids=whitman_fileid)

In [None]:
# Summary of the raw corpus: sentence count, then the average number of
# tokens per sentence rounded to two decimals.
print(f"# of sentences - {len(sentences)}")
mean_words_per_sent = np.mean(list(map(len, sentences))).round(2)
print(f"Mean words/sentence - {mean_words_per_sent}")

In [None]:
# Clean every sentence: keep only purely alphabetic tokens (this drops
# punctuation, digits and symbols) and lower-case the survivors.
# A sentence with no alphabetic token is kept as an empty list, so the
# number of sentences is unchanged.
proc_sentences = [
    [token.lower() for token in raw_sentence if token.isalpha()]
    for raw_sentence in sentences
]
print(proc_sentences)

In [None]:
# Same summary as for the raw corpus, now over the cleaned sentences:
# sentence count and mean tokens per sentence (two decimals).
print(f"# of sentences - {len(proc_sentences)}")
mean_words_per_sent = np.mean(list(map(len, proc_sentences))).round(2)
print(f"Mean words/sentence - {mean_words_per_sent}")

# Custom Word2Vec embedding

In [None]:
# Train a Word2Vec model on the cleaned, tokenised sentences.
# No hyperparameters are passed, so gensim's defaults apply.
model = Word2Vec(sentences=proc_sentences)

In [None]:
# tokens in the trained vocabulary
# gensim 4 removed `wv.vocab`; `wv.index_to_key` is the list of vocabulary
# tokens (position i holds the token whose vector is row i).
words = list(model.wv.index_to_key)
print(words)

In [None]:
# vector for particular token
# Look the token up on the KeyedVectors (`model.wv`): indexing the Word2Vec
# model object itself was deprecated in gensim 3 and removed in gensim 4.
# Raises KeyError if 'leaves' is not in the trained vocabulary.
print(model.wv['leaves'])

### Visualize embedding

In [None]:
# get subset of vectors from model for visualization
n_words = 50
# Take the first `n_words` vocabulary tokens and fetch only their vectors.
# `wv.index_to_key` replaces the `wv.vocab` mapping removed in gensim 4, and
# vectors are read from `model.wv` because the model object itself is no
# longer subscriptable. Slicing the keys first avoids building the matrix
# for the whole vocabulary just to discard most of it.
X = model.wv[model.wv.index_to_key[:n_words]]

# create 2D PCA model of embeddings
pca = PCA(n_components=2)
# one row per selected token, two columns (the two principal components)
result = pca.fit_transform(X)

In [None]:
# visualize embedding in 2D
plt.figure(figsize=(15,10))
plt.scatter(result[:, 0], result[:, 1])
# annotate words
# The labels must be the same tokens, in the same order, as the rows of
# `result`. `wv.index_to_key` replaces the `wv.vocab` mapping removed in
# gensim 4.
words = model.wv.index_to_key[:n_words]
for word, (x_coord, y_coord) in zip(words, result):
    plt.annotate(word, xy=(x_coord, y_coord))
plt.xlabel("dimension 1")
plt.ylabel("dimension 2")
plt.show()