In [None]:
import torch
from word2vec_models import CBOW
import pickle
from tqdm import tqdm

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Restore the cleaned corpus and the two vocabulary lookup tables from disk.
def _read_pickle(path):
    """Unpickle and return the object stored in the file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


corpus = _read_pickle('../data/cleaned_words.pickle')
word_to_ix = _read_pickle('../data/word_to_ix.pickle')
ix_to_word = _read_pickle('../data/ix_to_word.pickle')

# Vocabulary size is the number of entries in the word -> index table.
vocab_size = len(word_to_ix)

In [None]:
# ----------------------------------------------------------------------
# Restore the checkpoint of the best performing model.
# Stored tensors are mapped onto the CPU while the file is read.
model_path = '../models/cbow_d200_cs_2_5.pth'
checkpoint = torch.load(model_path, map_location='cpu')

In [None]:
# Instantiate the CBOW model that will receive the checkpoint weights.
# NOTE(review): 200 is presumably the embedding dimension (the checkpoint file
# name contains "d200") and 4 presumably a context-size argument -- confirm
# against the CBOW constructor in word2vec_models.
net = CBOW(vocab_size, 200, 4)

In [None]:
# Copy the saved parameters into the freshly built network ...
state_dict = checkpoint['model_state_dict']
net.load_state_dict(state_dict)
# ... and switch to evaluation mode; as the last expression of the cell,
# the module returned by eval() is also displayed.
net.eval()

In [None]:
# Spot check: look up the embedding of a single word ('abuse').
# NOTE(review): the method name is spelled with three d's exactly as the
# notebook calls it -- presumably matching word2vec_models; verify before renaming.
abuse_ix = torch.tensor([word_to_ix['abuse']], dtype=torch.long)
net.ix_to_embeddding(abuse_ix)

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Pull the learned word vectors out of the model's embedding layer as a
# NumPy array. `.detach()` is used instead of the legacy `.data` attribute:
# it yields the same values without autograd tracking, which is what
# `.numpy()` requires.
embeddings = net.embeddings.weight.detach().cpu().numpy()

In [None]:
# Number of embedding rows to project. Clamped to the number of rows that
# actually exist so that later code iterating over range(viz_words) cannot
# index past the end of the projection.
viz_words = min(500, len(embeddings))

# t-SNE is stochastic; fixing random_state makes the 2-D layout reproducible
# across kernel restarts.
tsne = TSNE(random_state=42)
embed_tsne = tsne.fit_transform(embeddings[:viz_words, :])

In [None]:
# Scatter plot of the 2-D t-SNE projection, one labelled point per word.
fig, ax = plt.subplots(figsize=(16, 16))

points = embed_tsne[:viz_words]
# Draw all points with a single scatter call instead of creating one
# artist per word inside the loop.
ax.scatter(points[:, 0], points[:, 1], color='steelblue')
ax.set_title('t-SNE projection of the word embeddings')

# Label each point with its word; iterating over the rows that are actually
# present avoids indexing past the end of the projection.
for idx, (x, y) in enumerate(points):
    ax.annotate(ix_to_word[idx], (x, y), alpha=0.7)

In [None]:
# Display the number of entries in the loaded corpus.
len(corpus)

In [None]:
import pandas as pd

In [None]:
# NOTE(review): `columns` is not used in this cell -- presumably column labels
# for a DataFrame built elsewhere; confirm before removing.
columns = ['Context','Target']

# Build (centre-word index, neighbour index) pairs from a sliding window.
# Note the variable named `context` holds the index of the centre word and
# `target` the index of one of its neighbours.
window = 2            # neighbours taken on each side of the centre word
max_position = 4000   # exclusive upper bound on the centre-word position

# Clamp the upper bound so corpus[i + window] always exists, even when the
# corpus is shorter than max_position + window entries.
stop = min(max_position, len(corpus) - window)

train_data = []
for i in tqdm(range(window, stop)):
    context = word_to_ix[corpus[i]]
    for j in range(1, window + 1):
        # Left neighbour first, then right neighbour, for each distance j.
        for neighbour in (corpus[i - j], corpus[i + j]):
            target = word_to_ix[neighbour]
            train_data.append((context, target))

# reshape(-1, 2) keeps the (N, 2) layout even when no pair was produced.
t = torch.tensor(train_data, dtype=torch.long).reshape(-1, 2)
print(t.shape)

In [None]:
# Serve the rows of `t` in their original order, 2000 rows per batch.
trl = torch.utils.data.DataLoader(
    dataset=t,
    batch_size=2000,
    shuffle=False,
)

In [None]:
# Peek at the first batch only: take column 0 of the batch, add a trailing
# axis of size 1, and print the resulting shape.
for inp in trl:
    o = inp[:, 0].unsqueeze(1)
    print(o.shape)
    break