In [None]:
from pathlib import Path

import numpy as np
import matplotlib.pylab as plt
from IPython.display import display

from utils import seed_everything, pca, moving_average
from word_embedding_multi import SkipGram, DataLoader

# Seed the random number generators before anything stochastic runs so that
# sampling and training are repeatable across kernel restarts.
# NOTE(review): which RNGs are seeded (and with what value) is decided inside
# utils.seed_everything — confirm there.
seed_everything()

# Load IPython's autoreload extension and use mode 2 so edits to the imported
# project modules are picked up before each cell executes, without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Directory holding the Stanford Sentiment Treebank files, relative to this notebook.
data_path = Path("../code/utils/datasets/stanfordSentimentTreebank/")
sentences_file = data_path / "datasetSentences.txt"

# Build the dataset wrapper from the raw sentences file.
dataset = DataLoader(path=sentences_file)

# Peek at the first rows of the underlying frame, then show the ten most
# frequent tokens as the cell's output.
display(dataset.df.head())
dataset.token_freq.most_common(10)

In [None]:
# Draw a tiny batch (two samples) of center-word / outside-word indices to
# sanity-check the sampler.
center_word_indices, outside_word_indices = dataset.get_random_context(num_context=5, batch_size=2)

# Translate indices back to tokens so every sampled pair is human-readable.
for center_idx, context_idxs in zip(center_word_indices, outside_word_indices):
    context_tokens = [dataset.token_list[idx] for idx in context_idxs]
    print(dataset.token_list[center_idx], context_tokens)

In [None]:
# Instantiate the skip-gram model on the loaded dataset.
# NOTE(review): `k` is presumably the number of negative samples per positive
# pair — confirm against word_embedding_multi.SkipGram.
model = SkipGram(dataset=dataset, vec_dim=10, num_context=5, k=10)

# Shape check: index both embedding tables with the batch sampled earlier and
# display the two resulting shapes side by side.
center_batch_shape = model.center_word_vectors[center_word_indices].shape
outside_batch_shape = model.outside_word_vectors[outside_word_indices].shape
center_batch_shape, outside_batch_shape

In [None]:
# Training hyper-parameters, gathered in one place so they are easy to tweak.
# NOTE(review): `anneal_every` / `save_every` presumably control learning-rate
# decay and checkpoint frequency — confirm in SkipGram.fit.
fit_kwargs = dict(
    epochs=30000,
    batch_size=32,
    lr=1e-2,
    anneal_every=4000,
    save_every=10000,
)

# `fit` returns the recorded loss histories, which the plotting cell consumes.
losses = model.fit(**fit_kwargs)

In [None]:
# Plot every recorded loss series in its own subplot, sharing the x axis.
# `losses` is used as a mapping of series name -> sequence of values
# (it is sized with len() and iterated with .items()).
#
# squeeze=False keeps `axs` a 2-D array of shape (n_series, 1) even when there
# is only one series; with the default squeeze=True a single subplot comes back
# as a bare Axes object and iterating over it would fail.
# The figure is bound to `fig` rather than `_`, which IPython uses for the
# last cell output.
fig, axs = plt.subplots(nrows=len(losses), figsize=(10, 10), sharex=True, squeeze=False)

for ax, (name, values) in zip(axs[:, 0], losses.items()):
    ax.plot(values, label="raw")
    # Overlay a smoothed curve so the trend is visible through the noise.
    ax.plot(moving_average(values, win=200), label="moving average (win=200)")
    ax.set_title(name)
    ax.legend()
    ax.grid()
fig.tight_layout()
plt.show()

In [None]:
# Hand-picked probe words: sentiment terms, gendered pairs, weather and drinks.
words = [
    "great",
    "cool",
    "brilliant",
    "wonderful",
    "well",
    "amazing",
    "worth",
    "sweet",
    "enjoyable",
    "boring",
    "bad",
    "dumb",
    "annoying",
    "female",
    "male",
    "queen",
    "king",
    "man",
    "woman",
    "rain",
    "snow",
    "hail",
    "coffee",
    "tea",
]

# Only plot words that exist in the vocabulary: indexing `token_dict` with an
# unseen word would otherwise abort the whole cell.
# NOTE(review): assumes `dataset.token_dict` is a dict-like token -> index
# mapping that supports `in` — confirm against DataLoader.
plot_words = [w for w in words if w in dataset.token_dict]
missing_words = [w for w in words if w not in dataset.token_dict]
if missing_words:
    print(f"Skipping words not in the vocabulary: {missing_words}")
if not plot_words:
    raise ValueError("None of the requested words are in the vocabulary.")

word_indices = [dataset.token_dict[w] for w in plot_words]

# Project the selected center-word vectors down to two dimensions ...
projected = pca(model.center_word_vectors[word_indices], 2)
# ... and scale every row to unit length, so all points sit on the unit circle
# and only their direction is compared.
result = projected / np.linalg.norm(projected, axis=1, keepdims=True)

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(result[:, 0], result[:, 1])
for word, x, y in zip(plot_words, result[:, 0], result[:, 1]):
    ax.annotate(word, xy=(x, y))
ax.set_title("2-D PCA projection of center-word vectors (unit-normalised)")
plt.show()

# References

- [Efficient Estimation of Word Representations in Vector Space](http://arxiv.org/pdf/1301.3781.pdf)
- [Distributed Representations of Words and Phrases and their Compositionality](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)
- [Demystifying Neural Network in Skip-Gram Language Modeling](https://aegis4048.github.io/demystifying_neural_network_in_skip_gram_language_modeling#Derivation-of-Cost-Function)
- [Optimize Computational Efficiency of Skip-Gram with Negative Sampling](https://aegis4048.github.io/optimize_computational_efficiency_of_skip-gram_with_negative_sampling)
- [The Illustrated Word2vec](https://jalammar.github.io/illustrated-word2vec/)