<a href="https://colab.research.google.com/github/zhestyatsky/abbyy-nlp-course/blob/main/sem3/hw3_pytorch_embeddings_training_task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Семинар 3: Представления слов: продолжение

In [None]:
%%writefile requirements.txt
# "sklearn" on PyPI is a deprecated placeholder package; the real distribution is "scikit-learn".
gensim
pandas
razdel
scikit-learn
allennlp
pytorch_lightning

Writing requirements.txt


In [None]:
!pip install --upgrade -r requirements.txt

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 136kB/s 
[?25hRequirement already up-to-date: pandas in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (1.1.5)
Collecting razdel
  Downloading https://files.pythonhosted.org/packages/15/2c/664223a3924aa6e70479f7d37220b3a658765b9cfe760b4af7ffdc50d38f/razdel-0.5.0-py3-none-any.whl
Requirement already up-to-date: sklearn in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 4)) (0.0)
Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/72/f5/f4dd3424b3ae9dec0a55ae7f7f34ada3ee60e4b10a187d2ba7384c698e09/allennlp-1.3.0-py3-none-any.whl (506kB)
[K     |████████████████████████████████| 512kB 55.5MB/s 
[?25hCollecting pytorch_lightning
[?25l  Downloading https://files.py

In [None]:
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!gzip -d lenta-ru-news.csv.gz
!head -n 2 lenta-ru-news.csv

--2020-12-19 17:52:57--  https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201219T175257Z&X-Amz-Expires=300&X-Amz-Signature=abe5239e017c33015b7f136621f4d3ac38af38afefe179f7c780a05c2231cbb1&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=87156914&response-content-disposition=attachment%3B%20filename%3Dlenta-ru-news.csv.gz&response-content-type=application%2Foctet-stream [following]
--2020-12-19 17:52:57--  https://github-production-release-asset-2e65be.s3.amazonaws.com/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorit

In [None]:
import pandas as pd
import re
import datetime as dt
from razdel import tokenize, sentenize
from string import punctuation

def get_date(url):
    """Return the first ``YYYY/MM/DD`` fragment found in ``url``, or ``None`` when there is none."""
    found = re.search(r"\d\d\d\d\/\d\d\/\d\d", url)
    if found is None:
        return None
    return found.group(0)

# Load the Lenta.ru dump; the first CSV row is the header.
dataset = pd.read_csv("lenta-ru-news.csv", sep=',', quotechar='\"', escapechar='\\', encoding='utf-8', header=0)
# The publication date is parsed from the YYYY/MM/DD fragment of each article URL.
# NOTE(review): get_date returns None for a URL without such a fragment, which would make
# strptime raise — this assumes every URL in the dump carries a date.
dataset["date"] = dataset["url"].apply(lambda x: dt.datetime.strptime(get_date(x), "%Y/%m/%d"))
# .copy() gives an independent frame, so the column assignments below do not write
# into a slice of the original (avoids pandas' SettingWithCopyWarning).
# The strict ">" is kept on purpose: the training-size assertion later in the notebook
# depends on this exact filter.
dataset = dataset[dataset["date"] > "2017-01-01"].copy()
# Replace non-breaking spaces with regular spaces before tokenization.
dataset["text"] = dataset["text"].apply(lambda x: x.replace("\xa0", " "))
dataset["title"] = dataset["title"].apply(lambda x: x.replace("\xa0", " "))
# Chronological split. The test side uses ">=" so that articles dated exactly
# 2018-04-01 are not silently dropped from both splits.
train_dataset = dataset[dataset["date"] < "2018-04-01"]
test_dataset = dataset[dataset["date"] >= "2018-04-01"]

def get_texts(dataset):
    """Turn a news frame into a list of lower-cased token lists.

    Every sentence of every entry in ``dataset["text"]`` contributes one token list,
    followed by one token list per entry in ``dataset["title"]``. Tokens whose text
    is found in ``string.punctuation`` are dropped.
    """
    def lowercase_tokens(raw_text):
        # Tokenize, drop punctuation tokens, lower-case what remains.
        return [tok.text.lower() for tok in tokenize(raw_text) if tok.text not in punctuation]

    sentence_tokens = [
        lowercase_tokens(sent.text)
        for article in dataset["text"]
        for sent in sentenize(article)
    ]
    title_tokens = [lowercase_tokens(headline) for headline in dataset["title"]]
    return sentence_tokens + title_tokens


# Tokenize both splits: one token list per body sentence plus one per title.
texts = get_texts(train_dataset)
test_texts = get_texts(test_dataset)

# Sanity checks on the training corpus.
# NOTE(review): the exact count is presumably tied to the v1.0 dataset dump and the
# date filters applied above — it will fail if either changes.
assert len(texts) == 827217
assert len(texts[0]) > 0
assert texts[0][0].islower()
print(texts[0])

['возобновление', 'нормального', 'сотрудничества', 'между', 'россией', 'и', 'нато', 'невозможно', 'пока', 'москва', 'не', 'будет', 'соблюдать', 'нормы', 'международного', 'права']


## Построение словаря

In [None]:
from collections import Counter


class Vocabulary:
    """Bidirectional word <-> index mapping with ``<unk>`` reserved at index 0.

    Invariant: ``word2index[index2word[i]] == i`` for every valid ``i``.
    """

    def __init__(self):
        self.word2index = {
            "<unk>": 0
        }
        self.index2word = ["<unk>"]

    def build(self, texts, min_count=10):
        """Add every word occurring at least ``min_count`` times in ``texts``.

        Words receive consecutive indices in order of decreasing frequency.
        Words that are already present (including the reserved ``<unk>``) keep
        their index, so a corpus token spelled ``<unk>`` or a repeated call to
        ``build`` cannot break the word/index invariant.

        Args:
            texts: iterable of token lists.
            min_count: minimal number of occurrences for a word to be kept.
        """
        words_counter = Counter(token for tokens in texts for token in tokens)
        for word, count in words_counter.most_common():
            if count < min_count:
                # most_common() is sorted by decreasing count: nothing further qualifies.
                break
            if word in self.word2index:
                continue
            self.word2index[word] = len(self.index2word)
            self.index2word.append(word)

    @property
    def size(self):
        """Number of entries, including ``<unk>``."""
        return len(self.index2word)

    def top(self, n=100):
        """Return up to ``n`` words with the lowest indices, skipping ``<unk>``."""
        return self.index2word[1:n+1]

    def get_index(self, word):
        """Return the index of ``word``, or 0 (``<unk>``) for an unknown word."""
        return self.word2index.get(word, 0)

    def get_word(self, index):
        """Return the word stored at ``index``."""
        return self.index2word[index]

# Build the vocabulary from the training token lists (default min_count=10).
vocabulary = Vocabulary()
vocabulary.build(texts)
# Round-trip check: index -> word -> index must give the same index back.
assert vocabulary.word2index[vocabulary.index2word[10]] == 10
print(vocabulary.size)
print(vocabulary.top(100))

71186
['в', 'и', 'на', '«', '»', 'что', 'с', 'по', '—', 'не', 'из', 'этом', 'об', 'о', 'он', 'за', 'года', 'россии', 'к', 'его', 'для', 'как', 'также', 'от', 'а', 'это', 'сообщает', 'до', 'году', 'после', 'сша', 'у', 'во', 'время', 'был', 'при', 'заявил', 'со', 'словам', 'рублей', 'будет', 'ее', 'она', 'но', 'ранее', 'их', 'они', 'было', 'тысяч', 'более', 'того', 'том', 'мы', 'были', 'я', 'которые', 'все', 'который', 'человек', 'под', '2016', 'из-за', 'лет', '2017', 'украины', 'марта', 'процентов', 'чтобы', 'долларов', 'глава', 'президент', 'этого', 'отметил', 'же', 'сказал', 'так', 'января', 'или', 'страны', 'ру', 'то', 'еще', 'области', 'данным', 'была', 'президента', 'около', 'сообщил', 'февраля', 'однако', 'компании', 'может', 'уже', 'один', 'рассказал', 'только', 'процента', '1', '10', 'июня']


## Задание 1: Самописный CBoW

Сделайте аналогичную модель, но в архитектуре CBoW

In [None]:
import torch
import json
import random
from itertools import cycle
from torch.utils.data import Dataset, IterableDataset


def get_samples(tokenized_texts, window_size, texts_count, vocab=None):
    """Yield CBoW training pairs ``(context_indices, central_index)``.

    For every token that has ``window_size`` neighbours on both sides, yields a
    pair of CPU ``torch.long`` tensors: the ``2 * window_size`` context word
    indices and a one-element tensor with the central word index. Tokens too
    close to either end of a text are skipped so that all contexts have the
    same length and can be batched.

    Tensors are created on the CPU on purpose: the samples then work without a
    GPU, and moving batches to the training device is left to the training loop.

    Args:
        tokenized_texts: iterable of token lists.
        window_size: number of context words taken on each side (non-negative).
        texts_count: if truthy, only the first ``texts_count`` texts are used.
        vocab: object with ``get_index(word)``; when ``None`` the notebook-level
            ``vocabulary`` is used (kept for backward compatibility).

    Raises:
        ValueError: if ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")
    if vocab is None:
        vocab = vocabulary
    for text_num, tokens in enumerate(tokenized_texts):
        if texts_count and text_num >= texts_count:
            break
        # Index each token once instead of once per window it takes part in.
        indices = [vocab.get_index(token) for token in tokens]
        for i in range(window_size, len(indices) - window_size):
            context_words = indices[i - window_size:i] + indices[i + 1:i + window_size + 1]
            yield (torch.tensor(context_words, dtype=torch.long),
                   torch.tensor([indices[i]], dtype=torch.long))


def get_samples_cycle(tokenized_texts, window_size, texts_count, vocab=None):
    """Yield the samples of ``get_samples`` over and over again.

    Stops instead of spinning forever when a full pass produces no sample
    (for example when every text is shorter than the window).
    """
    while True:
        produced = False
        for sample in get_samples(tokenized_texts, window_size, texts_count, vocab):
            produced = True
            yield sample
        if not produced:
            return


class Word2VecDataset(Dataset):
    """Map-style dataset holding all CBoW samples of the first ``texts_count`` texts in memory."""

    def __init__(self, tokenized_texts, vocabulary, window_size=2, texts_count=100000):
        # The vocabulary passed by the caller is the one actually used for indexing.
        self.samples = list(get_samples(tokenized_texts, window_size, texts_count, vocabulary))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        return self.samples[index]


class Word2VecIterableDataset(IterableDataset):
    """Iterable dataset that streams CBoW samples lazily and cycles over the texts."""

    def __init__(self, tokenized_texts, vocabulary, window_size=2, texts_count=None):
        self.tokenized_texts = tokenized_texts
        self.vocabulary = vocabulary
        self.window_size = window_size
        self.texts_count = texts_count

    def __iter__(self):
        return get_samples_cycle(self.tokenized_texts, self.window_size, self.texts_count, self.vocabulary)

In [None]:
from torch.utils.data import DataLoader, RandomSampler

BATCH_SIZE = 256
SEED = 42

# Seed the stdlib RNG so the in-place shuffles below produce the same order
# on every Restart & Run All.
random.seed(SEED)

# Note: shuffle() reorders the notebook-level `texts` / `test_texts` lists in place.
random.shuffle(texts)
train_data = Word2VecIterableDataset(texts, vocabulary)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE)

random.shuffle(test_texts)
val_data = Word2VecIterableDataset(test_texts, vocabulary)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)

In [None]:
import torch
import torch.nn as nn
from pytorch_lightning import LightningModule


class CBoWModel(LightningModule):
    """Continuous bag-of-words model: predicts the central word from summed context embeddings."""

    def __init__(self, vocab_size, embedding_dim=128):
        """
        Args:
            vocab_size: number of rows in the embedding table and number of output classes.
            embedding_dim: size of each word vector.
        """
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, contexts, centrals):
        """Return the cross-entropy loss of predicting ``centrals`` from ``contexts``.

        Assumes ``contexts`` holds word indices shaped (batch, context_len) and
        ``centrals`` holds one word index per batch row — TODO confirm against the loader.
        """
        # Modules are called directly (not via .forward) so that registered hooks run.
        projections = self.embeddings(contexts)
        # Summing over dim=1 merges the context positions into one bag-of-words vector.
        bag_of_words = torch.sum(projections, dim=1)
        logits = self.out_layer(bag_of_words)
        # centrals is flattened to the 1-D target layout CrossEntropyLoss expects.
        loss = self.loss(logits, centrals.view(-1))
        return loss

    def training_step(self, batch, batch_nb):
        return {'loss': self(*batch)}

    def validation_step(self, batch, batch_nb):
        return {'val_loss': self(*batch)}

    def test_step(self, batch, batch_nb):
        return {'test_loss': self(*batch)}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses and log the result as ``val_loss``."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        # self.log replaces the deprecated dict return value; the logged metric is
        # what callbacks monitoring "val_loss" read.
        self.log('val_loss', avg_loss, prog_bar=True)

    def test_epoch_end(self, outputs):
        """Average the per-batch test losses and log the result as ``test_loss``."""
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        self.log('test_loss', avg_loss, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-2)
        return [optimizer]

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

EPOCHS = 1

model = CBoWModel(vocabulary.size)
# Early stopping watches the "val_loss" metric reported by the model's validation hooks.
# patience=5 with mode="min": presumably training stops after five validation checks
# without a decrease — confirm against the installed pytorch_lightning version.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0.0,
    patience=5,
    verbose=True,
    mode="min" 
)
# Both loaders wrap datasets that cycle over their texts without end, so the run length
# is bounded by the limit_* arguments rather than by dataset size (presumably batch
# counts per epoch / per validation run — TODO confirm).
trainer = Trainer(
    gpus=1,
    checkpoint_callback=False,
    max_epochs=EPOCHS,
    callbacks=[early_stop_callback],
    progress_bar_refresh_rate=100,
    limit_train_batches=40000,
    limit_val_batches=500,
    val_check_interval=2000)
trainer.fit(model, train_loader, val_loader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | embeddings | Embedding        | 9.1 M 
1 | out_layer  | Linear           | 9.2 M 
2 | loss       | CrossEntropyLoss | 0     
------------------------------------------------
18.3 M    Trainable params
0         Non-trainable params
18.3 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [None]:
# Freeze the trained model before its embedding weights are read out below.
model.freeze()

In [None]:
# Copy the learned word vectors out of the model as a NumPy array.
# .detach() is used instead of the legacy .data attribute to leave the autograd graph.
embeddings = model.embeddings.weight.detach().cpu().numpy()

In [None]:
import numpy as np
# Save the embedding matrix to embeddings.npy.
np.save("embeddings.npy", embeddings)

In [None]:
# Expected: (vocabulary size, embedding dimension).
embeddings.shape

(71186, 128)

## Базовые проверки

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def most_similar(embeddings, vocabulary, word, top_n=10):
    """Return the ``top_n`` vocabulary words closest to ``word`` by cosine similarity.

    The result is ordered from most to least similar. A word missing from the
    vocabulary is looked up through ``vocabulary.get_index`` like any other, so
    the neighbours are computed for whatever index that returns.

    Args:
        embeddings: 2-D array with one row per vocabulary index.
        vocabulary: object providing ``get_index(word)`` and ``get_word(index)``.
        word: query word.
        top_n: number of neighbours to return (non-negative, default 10).

    Raises:
        ValueError: if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    word_emb = embeddings[vocabulary.get_index(word)]

    similarities = cosine_similarity([word_emb], embeddings)[0]
    # Ascending argsort reversed, then truncated: same order as taking the last
    # top_n entries and reversing them, but also correct for top_n == 0.
    ranked = np.argsort(similarities)[::-1][:top_n]

    return [vocabulary.get_word(index) for index in ranked]

# Nearest neighbours of 'россия' in the trained embedding space.
most_similar(embeddings, vocabulary, 'россия')

['россия',
 'москва',
 'деревня',
 'страна',
 'отис',
 'зеландия',
 'букер',
 'аль-джазира',
 'восьмерка',
 'компания']

## Визуализация

In [None]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive scatter plot whose points show auxiliary info on hover.

    Every extra keyword argument becomes a column of the data source and a
    hover tooltip entry. Returns the created figure.
    """
    output_notebook()

    # A single colour name is repeated once per point.
    point_colors = [color] * len(x) if isinstance(color, str) else color
    columns = {'x': x, 'y': y, 'color': point_colors}
    columns.update(kwargs)
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    hover_fields = [(name, "@" + name) for name in kwargs]
    fig.add_tools(bm.HoverTool(tooltips=hover_fields))
    if show:
        pl.show(fig)
    return fig


def get_tsne_projection(word_vectors):
    """Project ``word_vectors`` to two dimensions with t-SNE and pass the result through ``scale``."""
    projected = TSNE(n_components=2).fit_transform(word_vectors)
    return scale(projected)

def get_pca_projection(word_vectors):
    """Project ``word_vectors`` to two dimensions with PCA and pass the result through ``scale``."""
    projected = PCA(n_components=2).fit_transform(word_vectors)
    return scale(projected)
    
    
def visualize_embeddings(embeddings, vocabulary, word_count, method="pca"):
    """Plot a 2-D projection of the first ``word_count`` vocabulary words.

    Row 0 of ``embeddings`` is skipped, matching ``vocabulary.top`` which also
    skips index 0.

    Args:
        embeddings: 2-D array with one row per vocabulary index.
        vocabulary: object providing ``top(n)``.
        word_count: number of words to plot.
        method: ``"pca"`` or ``"tsne"``.

    Raises:
        ValueError: if ``method`` is neither ``"pca"`` nor ``"tsne"``. Without this
            check a misspelled method would silently run t-SNE.
    """
    if method not in ("pca", "tsne"):
        raise ValueError('method must be "pca" or "tsne", got {!r}'.format(method))
    get_projections = get_pca_projection if method == "pca" else get_tsne_projection
    word_vectors = embeddings[1: word_count + 1]
    words = vocabulary.top(word_count)
    projections = get_projections(word_vectors)
    draw_vectors(projections[:, 0], projections[:, 1], color='green', token=words)
    
    
# Plot the first 500 vocabulary words (index 0 is skipped) using the t-SNE projection.
visualize_embeddings(embeddings, vocabulary, 500, method="tsne")



## Задача рубрикации

In [None]:
def get_text_embedding(embeddings, vocabulary, phrase):
    """Return the mean word vector of ``phrase``.

    The phrase is tokenized and lower-cased; each token is mapped to a row of
    ``embeddings`` through ``vocabulary.get_index``. A phrase that yields no
    tokens gives a zero vector instead of NaN.

    Args:
        embeddings: 2-D array with one row per vocabulary index.
        vocabulary: object providing ``get_index(word)``.
        phrase: text to embed.

    Returns:
        1-D array of length ``embeddings.shape[1]``.
    """
    matrix = np.asarray(embeddings)
    indices = [vocabulary.get_index(word.text.lower()) for word in tokenize(phrase)]
    if not indices:
        # The mean of an empty selection would be NaN with a RuntimeWarning.
        return np.zeros(matrix.shape[1], dtype=matrix.dtype)
    return np.mean(matrix[indices], axis=0)

# Topics left out of the classification task.
excluded_topics = {"69-я параллель", "Крым", "Культпросвет ", "Оружие", "Бизнес", "Путешествия"}
# sorted() makes the label -> class id mapping identical on every run
# (iteration order of a set of strings is not stable across interpreter runs).
target_labels = sorted(set(train_dataset["topic"].dropna().tolist()) - excluded_topics)
print(target_labels)
label_to_index = {label: index for index, label in enumerate(target_labels)}

# Exact membership test. The previous regex r'(\bA|B|C\b)' anchored only the first and
# last alternatives, left the labels unescaped and triggered the pandas "match groups"
# warning; any row it matched that was not an exact label would have failed in the
# label lookup below anyway.
train_with_topics = train_dataset[train_dataset["topic"].isin(target_labels)]
train_with_topics = train_with_topics.head(20000)

test_with_topics = test_dataset[test_dataset["topic"].isin(target_labels)]

# Features are mean word vectors of the article text; targets are topic ids.
y_train = train_with_topics["topic"].map(label_to_index).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], embeddings.shape[1]))
for i, text in enumerate(train_with_topics["text"]):
    X_train[i, :] = get_text_embedding(embeddings, vocabulary, text)

y_test = test_with_topics["topic"].map(label_to_index).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], embeddings.shape[1]))
for i, text in enumerate(test_with_topics["text"]):
    X_test[i, :] = get_text_embedding(embeddings, vocabulary, text)

print(X_train.shape)
print(y_train)

  return func(self, *args, **kwargs)


['Бывший СССР', 'Интернет и СМИ', 'Дом', 'Наука и техника', 'Из жизни', 'Россия', 'Спорт', 'Мир', 'Ценности', 'Силовые структуры', 'Экономика', 'Культура']
(20000, 128)
[ 7  7 11 ...  1 10  8]


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# A fixed random_state makes the weight initialisation and shuffling of the MLP repeatable.
clf = MLPClassifier(random_state=42)
clf.fit(X_train, y_train)

y_predicted = clf.predict(X_test)
# labels/target_names make the report show topic names instead of bare class ids;
# class id i corresponds to target_labels[i].
print(metrics.classification_report(
    y_test,
    y_predicted,
    labels=list(range(len(target_labels))),
    target_names=target_labels,
))



              precision    recall  f1-score   support

           0       0.64      0.58      0.61      2156
           1       0.67      0.55      0.60      2447
           2       0.68      0.69      0.69      1182
           3       0.79      0.81      0.80      2119
           4       0.71      0.68      0.69      2191
           5       0.59      0.62      0.61      4324
           6       0.90      0.89      0.89      3429
           7       0.66      0.75      0.70      4291
           8       0.78      0.71      0.74      1177
           9       0.65      0.59      0.62      1663
          10       0.73      0.76      0.74      3185
          11       0.69      0.68      0.69      1995

    accuracy                           0.70     30159
   macro avg       0.71      0.69      0.70     30159
weighted avg       0.70      0.70      0.70     30159

