In [None]:
from pathlib import Path

import numpy as np
import matplotlib.pylab as plt
from IPython.display import display

from utils import seed_everything, pca, moving_average
from word_embedding_single import SkipGram, DataLoader

# Fix random state up front, before any data sampling or training below.
# NOTE(review): called with its defaults; presumably seeds the RNGs used by
# the project code — confirm in utils.seed_everything.
seed_everything()

# IPython autoreload in mode 2: re-import edited modules before each cell is
# executed, so changes to the local project modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Locate the Stanford Sentiment Treebank directory (relative to this notebook)
# and build the project DataLoader from its sentences file.
data_path = Path("../code/utils/datasets/stanfordSentimentTreebank/")
sentences_file = data_path / "datasetSentences.txt"
dataset = DataLoader(path=sentences_file)

# Quick sanity check: first rows of the loaded frame, then the ten most
# frequent tokens as the cell's output value.
display(dataset.df.head())
dataset.token_freq.most_common(10)

In [None]:
# Show the first ten entries of the sentence collection.
first_sentences = dataset.all_sentences[:10]
display(first_sentences)

# Draw one sample from the dataset's context sampler; as the last expression
# it becomes the cell output.
dataset.get_random_context()

In [None]:
# Build a skip-gram model over the loaded dataset with 10-dimensional vectors.
model = SkipGram(dataset=dataset, vec_dim=10)

# Train and keep the returned loss history (plotted in the next cell).
# One hyperparameter per line so each can be tuned in isolation.
losses = model.fit(
    epochs=40000,
    batch_size=32,
    lr=0.3,
    anneal_every=5000,
    save_every=10000,
)

In [None]:
# Plot every entry of `losses` on its own subplot row: the raw series plus the
# output of moving_average(..., win=200) drawn on top of it.
#
# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it, a single-entry `losses` yields a bare Axes object (not an array) and the
# zip() below fails because an Axes is not iterable. ravel() flattens the
# (nrows, 1) grid back to the 1-D sequence of Axes the loop expects.
_, axs_grid = plt.subplots(nrows=len(losses), figsize=(10, 10), squeeze=False)
axs = axs_grid.ravel()

for ax, (k, v) in zip(axs, losses.items()):
    ax.plot(v, label="raw")
    ax.plot(moving_average(v, win=200), label="moving average (win=200)")
    ax.set_title(k)
    ax.legend()  # distinguish the raw curve from the smoothed one
    ax.grid()
plt.tight_layout()
plt.show()

In [None]:
# Probe words to visualise, loosely grouped by theme (order is significant:
# it fixes the row order of `result` and the annotation labels).
words = [
    # mostly positive sentiment
    "great", "cool", "brilliant", "wonderful", "well",
    "amazing", "worth", "sweet", "enjoyable",
    # negative sentiment
    "boring", "bad", "dumb", "annoying",
    # gender-related terms
    "female", "male", "queen", "king", "man", "woman",
    # weather
    "rain", "snow", "hail",
    # beverages
    "coffee", "tea",
]

# Map each word to its index via the dataset's token dictionary; a word that
# is missing from the dictionary raises here.
word_indices = [dataset.token_dict[w] for w in words]

# Reduce the selected center-word vectors with the project's pca helper
# (second argument 2; the first two columns are plotted below), then scale
# every row to unit L2 norm so only the direction of each point remains.
projected = pca(model.center_word_vectors[word_indices], 2)
row_norms = np.linalg.norm(projected, axis=1, keepdims=True)
result = projected / row_norms

# Scatter the normalised 2-D points and label each one with its word.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(result[:, 0], result[:, 1])
for word, point in zip(words, result):
    ax.annotate(word, xy=(point[0], point[1]))
plt.show()