### Валерия Бунтякова

In [1]:
import json
import random
import numpy as np
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from torch.utils.data import Dataset, DataLoader
import pickle

import torch

In [2]:
# Load the pre-tokenized corpus (iterated below as a list of token lists).
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can fail or garble the Cyrillic tokens.
with open('data/processed_corpus.json', encoding='utf-8') as f:
    corpus = json.load(f)

## Split functions

In [None]:
# Use the corpus entry at index 1 as a small example for trying out the split functions below.
sample_text = corpus[1]

In [None]:
def cbow_split(tokens, window, pad_token='PAD'):
    """Split a token list into CBOW examples.

    The list is extended with ``window`` copies of ``pad_token`` on both
    sides, so every original token gets a full-width left and right context.

    Args:
        tokens: list of tokens.
        window: number of context tokens taken on each side of the center.
        pad_token: filler used for positions outside the text.

    Returns:
        List of ``(left_context, center, right_context)`` tuples, one per
        original token, where both contexts are lists of length ``window``.
    """
    pad = [pad_token] * window
    padded = pad + tokens + pad

    return [
        (padded[pos - window:pos], padded[pos], padded[pos + 1:pos + window + 1])
        for pos in range(window, len(padded) - window)
    ]


# Try the CBOW splitter on the sample text with two context tokens per side.
splits = cbow_split(sample_text, window=2)

In [None]:
def skipgram_split(tokens, window):
    """Build (context, center) pairs for skip-gram training.

    For every position in ``tokens`` each neighbour at distance at most
    ``window`` is paired with the token at that position. Neighbours are
    emitted left side first, then right side, each in text order.

    The window is clipped at the sequence boundaries instead of padding the
    text with a sentinel string and filtering it out afterwards, so a real
    token that happens to equal 'PAD' is kept as a context word.

    Args:
        tokens: sliceable sequence of tokens (e.g. a list).
        window: maximum distance between a context token and the center.

    Returns:
        List of ``(context_token, center_token)`` tuples.
    """
    splits = []

    for position, center in enumerate(tokens):
        # Clip the left edge at 0; slicing already clips the right edge.
        left = tokens[max(0, position - window):position]
        right = tokens[position + 1:position + window + 1]

        for context in list(left) + list(right):
            splits.append((context, center))

    return splits


# Try the skip-gram splitter on the sample text with a window of two tokens per side.
splits = skipgram_split(sample_text, window=2)

## Dataset

In [3]:
# Vocabulary: every distinct token gets the next free integer id,
# in order of first appearance in the corpus.
word2index = {}

for text in corpus:
    for token in text:
        # setdefault leaves tokens that already have an id untouched.
        word2index.setdefault(token, len(word2index))

In [4]:
class SkipgramDataset(Dataset):
    """Dataset of (context index, center index) pairs for skip-gram training.

    At construction time every text of the corpus is converted to vocabulary
    indices and expanded into (context, center) pairs, which are stored in
    ``self.data``.
    """

    def __init__(self,
                 corpus,
                 word2index,
                 window=2,
                 unk_token='UNK',
                 collect_verbose=True):
        """Index the corpus and pre-compute all training pairs.

        Args:
            corpus: iterable of tokenized texts (each a list of tokens).
            word2index: mapping token -> integer index; must contain
                ``unk_token``.
            window: maximum distance between a context token and the center.
            unk_token: token whose index replaces out-of-vocabulary tokens.
            collect_verbose: show a progress bar while collecting the pairs.

        Raises:
            KeyError: if ``unk_token`` is not a key of ``word2index``.
        """
        self.corpus = corpus
        self.word2index = word2index
        self.index2word = {value: key for key, value in self.word2index.items()}
        self.window = window

        self.unk_token = unk_token
        self.unk_index = self.word2index[self.unk_token]

        self.collect_verbose = collect_verbose

        self.data = []

        self.collect_data()

    def __len__(self):
        """Return the number of (context, center) pairs."""
        return len(self.data)

    def _split_function(self, tokenized_text):
        """Expand one text into (context, center) pairs.

        Each element is paired with every neighbour at distance at most
        ``self.window`` (left neighbours first, then right ones). The window
        is clipped at the text boundaries, so no padding sentinel is needed
        and no real element can be mistaken for padding.

        Args:
            tokenized_text: sliceable sequence of tokens or token indices.

        Returns:
            List of ``(context, center)`` tuples.
        """
        splits = []

        for position, center in enumerate(tokenized_text):
            left = tokenized_text[max(0, position - self.window):position]
            right = tokenized_text[position + 1:position + self.window + 1]

            for context in list(left) + list(right):
                splits.append((context, center))

        return splits

    def indexing(self, tokenized_text):
        """Map tokens to vocabulary indices, using ``unk_index`` for unknown tokens."""
        return [self.word2index.get(token, self.unk_index) for token in tokenized_text]

    def collect_data(self):
        """Fill ``self.data`` with the pairs generated from every corpus text."""
        corpus = tqdm(self.corpus, disable=not self.collect_verbose)

        for tokenized_text in corpus:
            indexed_text = self.indexing(tokenized_text)
            skipgram_examples = self._split_function(indexed_text)

            self.data.extend(skipgram_examples)

    def __getitem__(self, idx):
        """Return ``(context, central_word)`` for the pair at position ``idx``.

        ``context`` is a one-element int64 tensor; ``central_word`` is
        returned as stored in ``self.data``.
        """
        context, central_word = self.data[idx]
        # Create the tensor as int64 directly. torch.Tensor(...) builds a
        # float32 tensor first, which cannot represent every integer above
        # 2**24 exactly and would silently change large indices.
        context = torch.tensor([context], dtype=torch.long)

        return context, central_word

In [None]:
# Build all training pairs up front with the default window of 2.
# NOTE: the constructor looks up 'UNK' in word2index and raises KeyError if it is absent.
dataset = SkipgramDataset(corpus, word2index)

In [None]:
# Number of (context, center) pairs per training batch.
BATCH_SIZE = 512

In [None]:
# Batched, shuffled iteration over the training pairs.
dataset_loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

In [None]:
# Pull a single batch for inspection; x and y stay bound to that first batch.
for x, y in dataset_loader:
    break

## Model

In [5]:
# CODE
class SkipGram(torch.nn.Module):
    """Embedding lookup followed by a bias-free linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the two weight tables.

        Args:
            vocab_size: number of rows of the embedding table and number of
                output scores.
            embedding_dim: size of each word vector.
        """
        super().__init__()

        # Input-side word vectors, one row per vocabulary entry.
        self.in_embedding = torch.nn.Embedding(vocab_size, embedding_dim)

        # Output projection: one score per vocabulary entry, no bias term.
        self.out_embedding = torch.nn.Linear(embedding_dim, vocab_size, bias=False)

    def forward(self, x):
        """Return vocabulary scores for a tensor of word indices.

        The looked-up vectors are summed over the second-to-last axis before
        being projected, so ``x`` needs at least two dimensions
        (presumably ``(batch, n_context)`` - confirm against the caller).
        """
        looked_up = self.in_embedding(x)
        pooled = looked_up.sum(dim=-2)
        return self.out_embedding(pooled)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Size of each learned word vector.
EMBEDDING_DIM = 20

In [None]:
# Model with one embedding row / output score per vocabulary entry.
model = SkipGram(vocab_size=len(word2index), embedding_dim=EMBEDDING_DIM).to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)

criterion = torch.nn.CrossEntropyLoss()

epochs = 5

# Loss of every processed batch, in training order (plotted afterwards).
losses = []

for n_epoch in range(epochs):

    progress_bar = tqdm(total=len(dataset_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))

    try:

        for x, y in dataset_loader:

            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            pred = model(x)

            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

            # Show a running mean over the last 100 batches to smooth the readout.
            progress_bar.set_postfix(loss=np.mean(losses[-100:]))
            progress_bar.update(x.shape[0])

    except KeyboardInterrupt:

        # Manual stop: keep whatever has been trained so far.
        break

    finally:

        # Runs on normal completion, on interrupt and on any other error,
        # so the bar is always closed exactly once.
        progress_bar.close()

In [None]:
# Plot the per-batch training loss collected in `losses`.
plt.title('SkipGram Training Process')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.grid()
plt.plot(losses)

In [None]:
# average loss 7.29
# Persist the trained model to disk.
# NOTE(review): this pickles the entire module object, so loading it later
# requires the SkipGram class to be defined; saving model.state_dict() is
# the commonly recommended alternative.
with open('emb_model.pickle', 'wb') as f:
    pickle.dump(model, f)

In [6]:
# Restore the previously saved model so the cells below work without retraining.
# NOTE: pickle.load can execute arbitrary code from the file - only load
# files you created yourself.
with open('emb_model.pickle', 'rb') as f:
    model = pickle.load(f)

## Testing

In [7]:
# Input-side embedding table, detached from autograd so it can be used as plain data.
embedding_matrix = model.in_embedding.weight.detach()

In [8]:
def cos_sim(embedding_matrix, token2id, word1, word2):
    """Cosine similarity between the embeddings of two words.

    Args:
        embedding_matrix: tensor indexed by word id, one vector per word.
        token2id: mapping word -> row index of ``embedding_matrix``.
        word1: first word.
        word2: second word.

    Returns:
        The similarity as a Python float, or the sentinel ``-5`` when either
        word is missing from ``token2id``.
    """
    try:
        first_id, second_id = token2id[word1], token2id[word2]
    except KeyError:
        # At least one word is unknown to the model.
        return -5

    unit_vectors = []
    for row in (first_id, second_id):
        vector = embedding_matrix[row]
        # Scale to unit length so the dot product below is the cosine.
        unit_vectors.append(vector / vector.norm(keepdim=True))

    first_unit, second_unit = unit_vectors
    return torch.dot(first_unit, second_unit).item()

In [9]:
# Similarity probe: 'день' (day) vs 'месяц' (month).
cos_sim(embedding_matrix, word2index, 'день', 'месяц')

0.49183663725852966

In [10]:
# Similarity probe: 'минута' (minute) vs 'месяц' (month).
cos_sim(embedding_matrix, word2index, 'минута', 'месяц')

0.6426668763160706

In [11]:
# Similarity probe: 'сотрудник' (male employee) vs 'сотрудница' (female employee).
cos_sim(embedding_matrix, word2index, 'сотрудник', 'сотрудница')

0.04380059242248535

In [12]:
# Similarity probe for an unrelated pair: 'смерть' (death) vs 'хлеб' (bread).
cos_sim(embedding_matrix, word2index, 'смерть', 'хлеб')

0.18202295899391174

In [None]:
# Count how often each token occurs across the whole corpus.
freq = {}

for text in corpus:
    for token in text:
        # Tokens not seen yet start from zero.
        freq[token] = freq.get(token, 0) + 1

In [None]:
# (token, count) pairs ordered from the most to the least frequent token.
sorted_freq = sorted(freq.items(), key=lambda item: item[1], reverse=True)

# Keep the 200 most frequent tokens and look up their embedding rows.
top_sorted_freq = sorted_freq[:200]
top_words = [word for word, _count in top_sorted_freq]
inds = [word2index[word] for word in top_words]

# Project the whole embedding table to 2-D; the seed is fixed for repeatability.
tsne = TSNE(n_components=2, init='pca', random_state=42, verbose=2)
reduced = tsne.fit_transform(embedding_matrix.cpu())

# Coordinates of the selected words only.
top_points = reduced[inds]
x_coords = [point[0] for point in top_points]
y_coords = [point[1] for point in top_points]

In [None]:
# Draw every selected word as a labelled point of the 2-D projection.
# NOTE: the loop rebinds the notebook-level names x and y.
for (x, y, word) in zip(x_coords, y_coords, top_words):
    plt.scatter(x, y, marker='.', color='blue')
    # Offset the label slightly so it does not sit on top of the marker.
    plt.text(x+0.01, y+0.01, word, fontsize=9)
plt.show()

## Evaluation

In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler as MMS
from sklearn.metrics import mean_squared_error as MSE

In [14]:
# Load the tab-separated table of word pairs with human similarity scores.
df = pd.read_csv('data/ru_simlex965_tagged.tsv', delimiter='\t')

In [15]:
# Split each 'word_POS' string at the underscore into a word column and a
# POS column, then keep only the bare words.
# NOTE: not idempotent - '# Word1' is dropped here, so re-running this cell
# on the same frame fails.
df[['Word1', 'POS1']] = df['# Word1'].str.split('_', expand=True)
df[['Word2', 'POS2']] = df['Word2'].str.split('_', expand=True)
df = df.drop(columns=['# Word1', 'POS1', 'POS2'])

In [16]:
# Rescale the human scores with a default MinMaxScaler (target range [0, 1]).
# The scaler expects a 2-D array, hence the reshape to a single column.
avscore = np.array(df['Average Score']).reshape(-1, 1)
df['Average Score'] = MMS().fit_transform(avscore)

In [17]:
# Model similarity for every word pair; cos_sim yields -5 when a word is unknown.
w1 = df['Word1'].to_list()
w2 = df['Word2'].to_list()
df['My_score'] = [
    cos_sim(embedding_matrix, word2index, first, second)
    for first, second in zip(w1, w2)
]

In [18]:
# Keep only the pairs where both words are in the vocabulary (-5 is cos_sim's "missing word" sentinel).
smol = df[df['My_score']!=-5]

In [19]:
# Mean squared error between the scaled human scores and the model similarities.
# NOTE(review): cosine similarity can be negative while the reference was
# scaled with MinMaxScaler, so the two columns are not on the same scale -
# consider a rank correlation instead.
MSE(smol['Average Score'], smol['My_score'])

0.20290080832022972

Вполне неплохо!!