In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Код из библиотеки dlnlputils репозитория https://github.com/Samsung-IT-Academy/stepik-dl-nlp

In [48]:
#########stepik-dl-nlp/dlnlputils/data/base.py#########

import collections
import re

import numpy as np

# Matches maximal runs of word characters (letters, digits, underscore).
TOKEN_RE = re.compile(r'[\w\d]+')


def tokenize_text_simple_regex(txt, min_token_size=4):
    """Lower-case ``txt`` and split it into regex tokens.

    :param txt: input string
    :param min_token_size: tokens with fewer characters than this are dropped
    :return: list of token strings in order of appearance
    """
    kept = []
    for candidate in TOKEN_RE.findall(txt.lower()):
        if len(candidate) < min_token_size:
            continue
        kept.append(candidate)
    return kept

def character_tokenize(txt):
    """Split ``txt`` into a list of its individual elements (characters for a string)."""
    return [symbol for symbol in txt]

def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex, **tokenizer_kwargs):
    """Apply ``tokenizer`` to every text of a corpus.

    :param texts: iterable of texts
    :param tokenizer: callable ``tokenizer(text, **tokenizer_kwargs)``
    :param tokenizer_kwargs: extra keyword arguments forwarded to ``tokenizer``
    :return: list with one tokenizer result per input text, in input order
    """
    tokenized = []
    for text in texts:
        tokenized.append(tokenizer(text, **tokenizer_kwargs))
    return tokenized

def tokenize_corpus_verbose(texts, tokenizer=tokenize_text_simple_regex, verbose_chunk=1000, **tokenizer_kwargs):
    """Tokenize a corpus like ``tokenize_corpus`` while printing progress.

    A progress line is printed for every text whose zero-based index is a
    multiple of ``verbose_chunk`` (so the first line reports index 0).

    :param texts: sized collection of texts (``len(texts)`` is used in the progress line)
    :param tokenizer: callable ``tokenizer(text, **tokenizer_kwargs)``
    :param verbose_chunk: distance, in texts, between progress lines
    :param tokenizer_kwargs: extra keyword arguments forwarded to ``tokenizer``
    :return: list with one tokenizer result per input text
    """
    progress_template = 'Complete: {}/{}'
    collected = []
    for index, text in enumerate(texts):
        collected.append(tokenizer(text, **tokenizer_kwargs))
        if index % verbose_chunk != 0:
            continue
        print(progress_template.format(index, len(texts)))
    return collected

def texts_to_token_ids(tokenized_texts, word2id):
    """Map tokens to their ids, silently dropping tokens missing from ``word2id``.

    :param tokenized_texts: iterable of token sequences
    :param word2id: mapping token -> id
    :return: list of id lists, one per input text
    """
    encoded = []
    for tokens in tokenized_texts:
        ids = []
        for token in tokens:
            if token in word2id:
                ids.append(word2id[token])
        encoded.append(ids)
    return encoded


def build_vocabulary(tokenized_texts, max_size=1000000, max_doc_freq=0.8, min_count=5, pad_word=None):
    """Build a vocabulary from a tokenized corpus based on document frequencies.

    :param tokenized_texts: iterable of token sequences (one per document)
    :param max_size: maximum number of vocabulary entries, INCLUDING the
        padding entry when ``pad_word`` is given
    :param max_doc_freq: words occurring in a larger share of documents are dropped
    :param min_count: words occurring in fewer documents are dropped
    :param pad_word: optional padding token; when given it receives id 0 and frequency 0
    :return: tuple ``(word2id, word2freq)`` where ``word2id`` maps token -> id
        (ids follow descending document frequency) and ``word2freq`` is a float32
        array with the document frequency of each id
    """
    word_counts = collections.defaultdict(int)
    doc_n = 0

    # Count in how many documents each word occurs, and the number of documents.
    for txt in tokenized_texts:
        doc_n += 1
        for token in set(txt):
            word_counts[token] += 1

    # Drop words that are too rare or too frequent.
    word_counts = {word: cnt for word, cnt in word_counts.items()
                   if cnt >= min_count and cnt / doc_n <= max_doc_freq}

    # Sort words by descending document count.
    sorted_word_counts = sorted(word_counts.items(),
                                reverse=True,
                                key=lambda pair: pair[1])

    # Put a fake word at index 0 to make batch padding convenient.
    if pad_word is not None:
        sorted_word_counts = [(pad_word, 0)] + sorted_word_counts

    # Truncate AFTER the padding entry was inserted: checking the size of the
    # unpadded dictionary let the result grow to max_size + 1 entries.
    sorted_word_counts = sorted_word_counts[:max_size]

    # Enumerate the words.
    word2id = {word: i for i, (word, _) in enumerate(sorted_word_counts)}

    # Normalise document counts into document frequencies.
    word2freq = np.array([cnt / doc_n for _, cnt in sorted_word_counts], dtype='float32')

    return word2id, word2freq



#########stepik-dl-nlp/dlnlputils/data/bag_of_words.py#########

import numpy as np
import scipy.sparse
import torch
from torch.utils.data import Dataset


def vectorize_texts(tokenized_texts, word2id, word2freq, mode='tfidf', scale=True):
    """Turn tokenized texts into a sparse bag-of-words feature matrix.

    Supported modes (the log-variants were added on top of the original
    dlnlputils implementation):
      'bin'       - 1 if the word occurs in the document, else 0
      'tf'        - word count divided by the row sum
      'idf'       - occurrence indicator divided by ``word2freq``
      'tfidf'     - 'tf' divided by ``word2freq``
      'ltf'       - ln(tf + 1)
      'lidf'      - ln(indicator * n_docs / word2freq + 1)
      'ltfidf'    - ln(tf + 1) / word2freq
      'tflidf'    - tf * ln(1 / word2freq + 1)
      'tflidf_v2' - tf * ln(n_docs / word2freq + 1)
      'tfpmi'     - tf * word2freq (here ``word2freq`` is expected to hold PMI scores)

    :param tokenized_texts: list of token sequences
    :param word2id: mapping token -> column index
    :param word2freq: per-word weight array aligned with ``word2id`` ids
    :param mode: one of the modes listed above
    :param scale: if True, shift by the global minimum and divide by the global maximum
    :return: scipy CSR matrix of shape (len(tokenized_texts), len(word2id))
    """
    # 'lidf' has a dedicated branch below but was missing from this check,
    # which made that branch unreachable.
    assert mode in {'tfidf', 'idf', 'tf', 'bin', 'ltf', 'lidf', 'ltfidf', 'tflidf', 'tflidf_v2', 'tfpmi'}

    # Count the occurrences of every vocabulary word in every document.
    result = scipy.sparse.dok_matrix((len(tokenized_texts), len(word2id)), dtype='float32')
    for text_i, text in enumerate(tokenized_texts):
        for token in text:
            if token in word2id:
                result[text_i, word2id[token]] += 1

    # Binary "occurs / does not occur" vectors.
    if mode == 'bin':
        result = (result > 0).astype('float32')

    # Relative frequency of the word within the document.
    elif mode == 'tf':
        result = result.tocsr()
        result = result.multiply(1 / result.sum(1))

    # Drop the in-document count completely, keep only the corpus-level weight.
    elif mode == 'idf':
        result = (result > 0).astype('float32').multiply(1 / word2freq)

    # Use everything: in-document frequency and corpus-level frequency.
    elif mode == 'tfidf':
        result = result.tocsr()
        result = result.multiply(1 / result.sum(1))  # divide every row by its length
        result = result.multiply(1 / word2freq)  # divide every column by the word weight

    elif mode == 'ltf': # lTF=ln(TF+1)
        result = result.tocsr()
        result = result.multiply(1 / result.sum(1))
        result = scipy.sparse.dok_matrix(np.log(result.toarray()+1))

    elif mode == 'lidf': # lIDF=ln(n/IDF+1)
        result = (result > 0).astype('float32').multiply(len(tokenized_texts) / word2freq)
        result = scipy.sparse.dok_matrix(np.log(result.toarray()+1))

    elif mode == 'ltfidf': # lTFIDF=ln(TF+1)*IDF
        result = result.tocsr()  # CSR gives fast row operations
        result = result.multiply(1/result.sum(1))  # divide every row by its length
        result = scipy.sparse.dok_matrix(np.log(result.toarray()+1))
        result = result.multiply(1 / word2freq)  # divide every column by the word weight

    elif mode == 'tflidf': # TFlIDF=TF*ln(1/IDF+1)
        result = result.tocsr()  # CSR gives fast row operations
        result = result.multiply(1/result.sum(1))  # divide every row by its length
        result = result.multiply(np.log(1 / word2freq + 1))  # scale every column by the log weight

    elif mode == 'tflidf_v2': # TFlIDF=TF*ln(n/IDF+1)
        result = result.tocsr()  # CSR gives fast row operations
        result = result.multiply(1/result.sum(1))  # divide every row by its length
        result = result.multiply(np.log(len(tokenized_texts) / word2freq + 1))  # scale every column by the log weight

    elif mode == 'tfpmi': # TFPMI=TF*PMI
        result = result.tocsr()
        result = result.multiply(1 / result.sum(1))  # divide every row by its length
        result = result.multiply(word2freq)  # scale every column by word2freq (PMI scores in this mode)

    if scale:
        result = result.tocsc()
        result -= result.min()
        result /= (result.max() + 1e-6)

    return result.tocsr()


class SparseFeaturesDataset(Dataset):
    """Dataset pairing the rows of a sparse feature matrix with targets.

    Each item is ``(features, label)``: the requested row converted to a dense
    float tensor and the matching target converted to a long tensor.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Densify only the requested row; the matrix itself stays sparse.
        dense_row = self.features[idx].toarray()[0]
        row_tensor = torch.from_numpy(dense_row).float()
        label_tensor = torch.from_numpy(np.asarray(self.targets[idx])).long()
        return row_tensor, label_tensor
    
    
#########stepik-dl-nlp/dlnlputils/pipeline.py#########

import copy
import datetime
import random
import traceback

import numpy as np
import torch
from torch.utils.data import DataLoader


def init_random_seed(value=0):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators with ``value``
    and switch cuDNN to deterministic mode."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(value)
    torch.backends.cudnn.deterministic = True


def copy_data_to_device(data, device):
    """Move a tensor, or a (nested) list/tuple of tensors, to ``device``.

    :param data: tensor, or list/tuple whose elements are themselves valid ``data``
    :param device: target device accepted by ``Tensor.to``
    :return: the moved tensor; containers are returned as lists
    :raises ValueError: for any other input type
    """
    if not torch.is_tensor(data):
        if not isinstance(data, (list, tuple)):
            raise ValueError('Недопустимый тип данных {}'.format(type(data)))
        moved = []
        for elem in data:
            moved.append(copy_data_to_device(elem, device))
        return moved
    return data.to(device)


def print_grad_stats(model):
    """Print gradient statistics averaged over the parameters that have a gradient.

    For every parameter with a gradient, the mean absolute gradient and the
    gradient standard deviation are accumulated; both sums are then divided by
    the number of such parameters (offset by 1e-5 so the division never hits zero).
    """
    mean, std, norm = 0, 0, 1e-5
    for param in model.parameters():
        grad = getattr(param, 'grad', None)
        if grad is None:
            continue
        grad_values = grad.data
        mean += grad_values.abs().mean()
        std += grad_values.std()
        norm += 1
    mean /= norm
    std /= norm
    print(f'Mean grad {mean}, std {std}, n {norm}')


def train_eval_loop(model, train_dataset, val_dataset, criterion,
                    lr=1e-4, epoch_n=10, batch_size=32,
                    device=None, early_stopping_patience=10, l2_reg_alpha=0,
                    max_batches_per_epoch_train=10000,
                    max_batches_per_epoch_val=1000,
                    data_loader_ctor=DataLoader,
                    optimizer_ctor=None,
                    lr_scheduler_ctor=None,
                    shuffle_train=True,
                    dataloader_workers_n=0,
                    best_acc_type = 'loss',
                    test_dataset = None,
                    experiment_name = 'NoName',
                    no_calculate_accuracy = False):
    """
    v2.1 (dlnlputils training loop, modified by wisoffe)
    Train a model; after every epoch its quality is evaluated on the validation set.

    :param model: torch.nn.Module - the model to train
    :param train_dataset: torch.utils.data.Dataset - training data
    :param val_dataset: torch.utils.data.Dataset - validation data
    :param criterion: loss function used for optimisation
    :param lr: learning rate
    :param epoch_n: maximum number of epochs
    :param batch_size: number of examples processed per iteration
    :param device: cuda/cpu - device to compute on (auto-detected when None)
    :param early_stopping_patience: largest number of epochs without improvement
        that is tolerated before training stops
    :param l2_reg_alpha: L2 regularisation coefficient; only applied to the default
        Adam optimizer (it is not passed to ``optimizer_ctor``)
    :param max_batches_per_epoch_train: maximum number of training iterations per epoch
    :param max_batches_per_epoch_val: maximum number of validation iterations per epoch
    :param data_loader_ctor: factory turning a dataset into batches
        (torch.utils.data.DataLoader by default)
    :param optimizer_ctor: optional factory ``optimizer_ctor(params, lr=lr)``
    :param lr_scheduler_ctor: optional factory ``lr_scheduler_ctor(optimizer)``; the
        scheduler's ``step`` is called with the epoch validation loss
    :param shuffle_train: whether the training loader shuffles the data
    :param dataloader_workers_n: number of worker processes for both loaders
    :param best_acc_type: 'loss' (keep the model with minimal validation loss) or
        'acc' (keep the model with maximal validation accuracy)
    :param test_dataset: optional dataset; when given, the best model is re-evaluated
        on the train and test datasets and the metrics are stored in ``history['BEST']``
        (the loss there is always cross-entropy, independent of ``criterion``)
    :param experiment_name: label printed in the log banners
    :param no_calculate_accuracy: if True accuracy is not computed (0.0 is recorded)
    :return: tuple ``(history, best_model)``. ``history['acc'|'loss']['train'|'val']``
        are per-epoch lists whose index 0 is a placeholder, so the list index equals
        the epoch number; ``history['BEST']`` describes the selected model.
    """
    assert best_acc_type in {'loss', 'acc'}

    train_start_time = datetime.datetime.now()
    print("############## Start experiment with name: {} ##############".format(experiment_name))

    # Statistics history; index 0 is a placeholder so that index == epoch number.
    history = {'acc': {'train': [0.0],
                       'val': [0.0]},
               'loss': {'train': [float('inf')],
                       'val': [float('inf')]}}

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    model.to(device)

    if optimizer_ctor is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_reg_alpha)
    else:
        optimizer = optimizer_ctor(model.parameters(), lr=lr)

    if lr_scheduler_ctor is not None:
        lr_scheduler = lr_scheduler_ctor(optimizer)
    else:
        lr_scheduler = None

    train_dataloader = data_loader_ctor(train_dataset, batch_size=batch_size, shuffle=shuffle_train,
                                        num_workers=dataloader_workers_n)
    val_dataloader = data_loader_ctor(val_dataset, batch_size=batch_size, shuffle=False,
                                      num_workers=dataloader_workers_n)

    if best_acc_type == 'loss':  # select the model by minimal loss
        best_val_metric = float('inf')
    elif best_acc_type == 'acc':  # select the model by maximal accuracy
        best_val_metric = float('-inf')

    best_epoch_i = 0
    best_model = copy.deepcopy(model)

    # Defined up-front so the summary below still works when training aborts
    # before the first batch was fetched.
    batch_x = None

    for epoch_i in range(1, epoch_n + 1):
        try:
            ##### train phase #####
            epoch_start = datetime.datetime.now()
            train_accuracy_epoch = []  # per-batch statistics
            train_loss_epoch = []  # per-batch statistics

            model.train()

            for batch_i, (batch_x, batch_y) in enumerate(train_dataloader):
                # '>=' so that at most max_batches_per_epoch_train batches are processed
                # ('>' processed one batch more than the documented maximum).
                if batch_i >= max_batches_per_epoch_train:
                    print('Threshold max_batches_per_epoch_train exceeded!')
                    break

                batch_x = copy_data_to_device(batch_x, device)
                batch_y = copy_data_to_device(batch_y, device)

                pred = model(batch_x)
                loss = criterion(pred, batch_y)

                model.zero_grad()
                loss.backward()

                optimizer.step()

                train_loss_epoch.append(float(loss))

                if not no_calculate_accuracy:
                    train_accuracy_epoch.append(float((pred.argmax(dim=1) == batch_y.data).float().mean().data))
                else:
                    train_accuracy_epoch.append(0.)

            ##### validation phase #####
            model.eval()

            val_accuracy_epoch = []  # per-batch statistics
            val_loss_epoch = []  # per-batch statistics

            with torch.no_grad():
                for batch_i, (batch_x, batch_y) in enumerate(val_dataloader):
                    if batch_i >= max_batches_per_epoch_val:
                        print('Threshold max_batches_per_epoch_val exceeded!')
                        break

                    batch_x = copy_data_to_device(batch_x, device)
                    batch_y = copy_data_to_device(batch_y, device)

                    pred = model(batch_x)
                    loss = criterion(pred, batch_y)

                    if not no_calculate_accuracy:
                        val_accuracy_epoch.append(float((pred.argmax(dim=1) == batch_y.data).float().mean().data))
                    else:
                        val_accuracy_epoch.append(0.)
                    val_loss_epoch.append(float(loss))

            ######## end of epoch ########

            history['acc']['train'].append(sum(train_accuracy_epoch) / len(train_accuracy_epoch))
            history['loss']['train'].append(sum(train_loss_epoch) / len(train_loss_epoch))

            history['acc']['val'].append(sum(val_accuracy_epoch) / len(val_accuracy_epoch))
            history['loss']['val'].append(sum(val_loss_epoch) / len(val_loss_epoch))

            # Keep a copy of the best model so far.
            best_model_saved = False
            if (best_acc_type == 'loss' and history['loss']['val'][-1] < best_val_metric) or \
                    (best_acc_type == 'acc' and history['acc']['val'][-1] > best_val_metric):
                # selected by minimal loss or maximal accuracy
                best_epoch_i = epoch_i
                best_val_metric = history[best_acc_type]['val'][-1]
                best_model = copy.deepcopy(model)
                best_model_saved = True
            # Early stopping check.
            elif epoch_i - best_epoch_i > early_stopping_patience:
                print('Модель не улучшилась за последние {} эпох, прекращаем обучение'.format(
                    early_stopping_patience))
                break

            if lr_scheduler is not None:
                lr_scheduler.step(history['loss']['val'][-1])

            # Per-epoch statistics line.
            print('Epoch = {:>3},   ACC: val = {:.3f}, train = {:.3f}    LOSS: val = {:.3f}, train = {:.3f}   SAVE: {}, Time: {:0.2f}s'\
                  .format(epoch_i,
                          history['acc']['val'][-1],
                          history['acc']['train'][-1],
                          history['loss']['val'][-1],
                          history['loss']['train'][-1],
                          best_model_saved,
                          (datetime.datetime.now() - epoch_start).total_seconds()),
                  flush=True)

        except KeyboardInterrupt:
            print('Досрочно остановлено пользователем')
            break
        except Exception as ex:
            # Deliberate best-effort: report the failure and return what was trained so far.
            print('Ошибка при обучении: {}\n{}'.format(ex, traceback.format_exc()))
            break

    print(' ')
    print("BEST MODEL: ACC: val = {:.3f}, train = {:.3f}, LOSS: val = {:.3f}, train = {:.3f}, on epoch = {}, metric type = {}, Full train time = {:0.2f}s"\
                  .format(history['acc']['val'][best_epoch_i],
                          history['acc']['train'][best_epoch_i],
                          history['loss']['val'][best_epoch_i],
                          history['loss']['train'][best_epoch_i],
                          best_epoch_i,
                          best_acc_type,
                          (datetime.datetime.now() - train_start_time).total_seconds()))
    print("************** End experiment with name: {} **************".format(experiment_name))
    print(' ')
    history['BEST'] = {}
    history['BEST']['epoch'] = best_epoch_i
    # Size of the last dimension of the last processed batch; None when no tensor
    # batch is available (training failed early or the batch is a list of inputs).
    history['BEST']['dict_size'] = batch_x.shape[-1] if torch.is_tensor(batch_x) else None

    # Compute and store the final metrics of best_model on train/val/test datasets.
    if test_dataset is not None:

        def _final_metrics(dataset):
            # Cross-entropy loss and accuracy of best_model over a whole dataset.
            # Uses torch / numpy only, so no extra imports have to be present in the
            # notebook, and takes argmax over axis 1 (the class axis) exactly like
            # the per-batch accuracy computed inside the training loop.
            logits, labels = predict_with_model(best_model, dataset, return_labels=True)
            loss_value = float(torch.nn.functional.cross_entropy(torch.from_numpy(logits),
                                                                 torch.from_numpy(labels).long()))
            acc_value = float((logits.argmax(1) == labels).mean())
            return loss_value, acc_value

        history['BEST']['acc'] = {}
        history['BEST']['loss'] = {}

        # Validation metrics are taken from the history (not recomputed).
        history['BEST']['acc']['val'] = history['acc']['val'][best_epoch_i]
        history['BEST']['loss']['val'] = history['loss']['val'][best_epoch_i]

        # Train metrics.
        history['BEST']['loss']['train'], history['BEST']['acc']['train'] = _final_metrics(train_dataset)

        # Test metrics.
        history['BEST']['loss']['test'], history['BEST']['acc']['test'] = _final_metrics(test_dataset)

    return history, best_model


def predict_with_model(model, dataset, device=None, batch_size=32, num_workers=0, return_labels=False):
    """
    Apply a model to every example of a dataset (in order, without shuffling).

    :param model: torch.nn.Module - trained model; it is moved to ``device`` and put in eval mode
    :param dataset: torch.utils.data.Dataset yielding ``(input, label)`` pairs
    :param device: cuda/cpu - device to compute on (auto-detected when None)
    :param batch_size: number of examples processed per iteration
    :param num_workers: number of DataLoader worker processes
    :param return_labels: if True the dataset labels are returned as well
    :return: numpy.array of shape len(dataset) x *; when ``return_labels`` is True,
        a tuple ``(predictions, labels)`` of numpy arrays
    """
    import tqdm

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    results_by_batch = []

    device = torch.device(device)
    model.to(device)
    model.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    labels = []
    with torch.no_grad():
        # The progress total is the integer number of batches; len(dataset)/batch_size
        # is a float and rendered totals such as "4/3.125".
        for batch_x, batch_y in tqdm.tqdm(dataloader, total=len(dataloader)):
            batch_x = copy_data_to_device(batch_x, device)

            if return_labels:
                # .cpu() keeps this working when the dataset already holds device tensors.
                labels.append(batch_y.detach().cpu().numpy())

            batch_pred = model(batch_x)
            results_by_batch.append(batch_pred.detach().cpu().numpy())

    if return_labels:
        return np.concatenate(results_by_batch, 0), np.concatenate(labels, 0)
    else:
        return np.concatenate(results_by_batch, 0)


#########stepik-dl-nlp/dlnlputils/nnets.py#########

from torch.utils.data import Dataset


def ensure_length(txt, out_len, pad_value):
    """Pad or truncate a sequence to exactly ``out_len`` elements.

    Shorter sequences are returned as a new list right-padded with ``pad_value``;
    otherwise the first ``out_len`` elements are returned via slicing (keeping
    the input's own sequence type).
    """
    missing = out_len - len(txt)
    if missing <= 0:
        return txt[:out_len]
    return list(txt) + [pad_value] * missing


class PaddedSequenceDataset(Dataset):
    """Dataset of token-id sequences brought to a fixed length.

    Each item is ``(ids, target)``: the text padded/truncated to ``out_len``
    with ``pad_value`` as a long tensor, and the matching target as a long tensor.
    """

    def __init__(self, texts, targets, out_len=100, pad_value=0):
        self.texts, self.targets = texts, targets
        self.out_len, self.pad_value = out_len, pad_value

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        fixed_len_ids = ensure_length(self.texts[item], self.out_len, self.pad_value)
        ids_tensor = torch.tensor(fixed_len_ids, dtype=torch.long)
        target_tensor = torch.tensor(self.targets[item], dtype=torch.long)
        return ids_tensor, target_tensor

#########stepik-dl-nlp/dlnlputils/embeddings.py#########

class Embeddings:
    """Word-embedding lookup with cosine-style nearest-neighbour queries.

    The vectors are L2-normalised (with a small epsilon) at construction time,
    so a dot product with a stored vector behaves like a cosine similarity.
    """

    def __init__(self, embeddings, word2id):
        """
        :param embeddings: array-like of shape (vocabulary size, embedding size)
        :param word2id: mapping word -> row index in ``embeddings``
        """
        embeddings = np.asarray(embeddings)
        norms = np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)
        # Normalise into a NEW array: dividing in place would modify the caller's
        # data (e.g. model weights that share memory with the passed numpy array).
        self.embeddings = embeddings / (norms + 1e-4)
        self.word2id = word2id
        self.id2word = {i: w for w, i in word2id.items()}

    def _sum_vectors(self, words):
        # Return (sum of the word vectors, number of words); (0, 1) for None/empty
        # input so that the caller can subtract and divide unconditionally.
        if words is None:
            return 0, 1
        if not isinstance(words, (list, tuple)):
            words = [words]
        if len(words) == 0:
            return 0, 1
        vectors = np.array([self.get_vector(word) for word in words])
        return vectors.sum(0), len(words)

    def most_similar(self, positive=None, negative=None, topk=10, with_mean = False):
        """Find the words closest to ``sum(positive) - sum(negative)`` (gensim-like syntax).

        :param positive: word or list/tuple of words that contribute positively
        :param negative: word or list/tuple of words that contribute negatively
        :param topk: number of neighbours to return
        :param with_mean: if True each group is averaged instead of summed
        :return: list of ``(word, similarity)`` sorted by descending similarity
        :raises ValueError: if any given word is not in the vocabulary
        """
        pos_sum, pos_len = self._sum_vectors(positive)
        neg_sum, neg_len = self._sum_vectors(negative)

        if with_mean:
            result_vec = pos_sum / pos_len - neg_sum / neg_len
        else:
            result_vec = pos_sum - neg_sum

        return self.most_similar_by_vector(result_vec, topk=topk)

    def most_similar_legacy(self, word, topk=10):
        """Original single-word variant of ``most_similar``."""
        return self.most_similar_by_vector(self.get_vector(word), topk=topk)

    def analogy(self, a1, b1, a2, topk=10):
        """Solve "a1 is to b1 as a2 is to ?" via the vector ``b1 - a1 + a2``."""
        a1_v = self.get_vector(a1)
        b1_v = self.get_vector(b1)
        a2_v = self.get_vector(a2)
        query = b1_v - a1_v + a2_v
        return self.most_similar_by_vector(query, topk=topk)

    def most_similar_by_vector(self, query_vector, topk=10):
        """Return up to ``topk`` ``(word, similarity)`` pairs with the largest dot
        product between the stored vectors and ``query_vector``, best first."""
        similarities = (self.embeddings * query_vector).sum(-1)
        vocab_size = similarities.shape[0]
        topk = min(topk, vocab_size)
        if topk <= 0:
            return []
        if topk == vocab_size:
            # argpartition requires kth < size, so rank the whole vocabulary instead.
            best_indices = np.argsort(-similarities)
        else:
            best_indices = np.argpartition(-similarities, topk, axis=0)[:topk]
        result = [(self.id2word[i], similarities[i]) for i in best_indices]
        result.sort(key=lambda pair: -pair[1])
        return result

    def get_vector(self, word):
        """Return the normalised vector of ``word``.

        :raises ValueError: if the word is not in the vocabulary
        """
        if word not in self.word2id:
            raise ValueError('Неизвестное слово "{}"'.format(word))
        return self.embeddings[self.word2id[word]]

    def get_vectors(self, *words):
        """Return the stacked vectors of ``words`` (raises KeyError for unknown words)."""
        word_ids = [self.word2id[i] for i in words]
        vectors = np.stack([self.embeddings[i] for i in word_ids], axis=0)
        return vectors

#########stepik-dl-nlp/dlnlputils/visualization.py#########

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE


def plot_vectors(vectors, labels, how='tsne', ax=None, xy_lim=None):
    """Project vectors to 2D and draw them as an annotated scatter plot.

    :param vectors: array-like of shape (n_items, n_features)
    :param labels: one text label per vector
    :param how: 'tsne' (sklearn TSNE) or 'svd' (sklearn TruncatedSVD)
    :param ax: matplotlib Axes to draw on; the current Axes is used when None
    :param xy_lim: optional limits applied to both the x and the y axis
    :raises ValueError: if ``how`` is not a supported projection
    """
    # Validate first: an unknown value used to fail later with an
    # UnboundLocalError on ``projections``.
    if how not in ('tsne', 'svd'):
        raise ValueError('Unknown projection "{}", expected "tsne" or "svd"'.format(how))

    if ax is None:
        # ``ax`` defaults to None, so fall back to the current Axes instead of
        # crashing on ``None.scatter``.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    if how == 'tsne':
        projections = TSNE().fit_transform(vectors)
    else:
        projections = TruncatedSVD().fit_transform(vectors)

    x = projections[:, 0]
    y = projections[:, 1]
    if xy_lim is not None:
        ax.set_xlim(xy_lim)
        ax.set_ylim(xy_lim)
    ax.scatter(x, y)
    for cur_x, cur_y, cur_label in zip(x, y, labels):
        ax.annotate(cur_label, (cur_x, cur_y))
        

#########stepik-dl-nlp/dlnlputils/data/pos.py#########
import torch
from torch.utils.data import TensorDataset

def pos_corpus_to_tensor(sentences, char2id, label2id, max_sent_len, max_token_len):
    """Encode a POS-tagged corpus as character-id and label-id tensors.

    :param sentences: sequence of sentences; each token exposes ``.form`` (string or
        None) and ``.upos`` (tag)
    :param char2id: mapping character -> id (unknown characters map to 0)
    :param label2id: mapping tag -> id (unknown tags map to 0)
    :param max_sent_len: number of token slots per sentence
    :param max_token_len: nominal token length; each token gets ``max_token_len + 2``
        slots and characters are written starting from slot 1
    :return: tuple ``(inputs, targets)`` of long tensors with shapes
        (n_sentences, max_sent_len, max_token_len + 2) and (n_sentences, max_sent_len)

    Tokens beyond ``max_sent_len`` and characters that do not fit into the token
    slots are dropped instead of raising an IndexError.
    """
    inputs = torch.zeros((len(sentences), max_sent_len, max_token_len + 2), dtype=torch.long)
    targets = torch.zeros((len(sentences), max_sent_len), dtype=torch.long)

    for sent_i, sent in enumerate(sentences):
        for token_i, token in enumerate(sent):
            if token_i >= max_sent_len:
                break  # no slot left for further tokens of this sentence
            targets[sent_i, token_i] = label2id.get(token.upos, 0)
            # A token without a surface form is encoded as a single '-'.
            form = token.form if token.form is not None else '-'
            # Slot 0 is never written; at most max_token_len + 1 characters fit.
            for char_i, char in enumerate(form[:max_token_len + 1]):
                inputs[sent_i, token_i, char_i + 1] = char2id.get(char, 0)

    return inputs, targets


class POSTagger:
    """Callable wrapper that tags raw sentences with a trained character-level model."""

    def __init__(self, model, char2id, id2label, max_sent_len, max_token_len):
        """
        :param model: trained torch model applied through ``predict_with_model``
        :param char2id: mapping character -> id (unknown characters map to 0)
        :param id2label: mapping/sequence from predicted class id to tag
        :param max_sent_len: number of token slots per sentence
        :param max_token_len: nominal token length (each token gets ``max_token_len + 2`` slots)
        """
        self.model = model
        self.char2id = char2id
        self.id2label = id2label
        self.max_sent_len = max_sent_len
        self.max_token_len = max_token_len

    def _encode(self, tokenized_corpus):
        # Build the (n_sentences, max_sent_len, max_token_len + 2) character-id tensor.
        # Tokens and characters that do not fit are dropped, so arbitrary user
        # input can no longer raise an IndexError.
        inputs = torch.zeros((len(tokenized_corpus), self.max_sent_len, self.max_token_len + 2),
                             dtype=torch.long)
        for sent_i, sentence in enumerate(tokenized_corpus):
            for token_i, token in enumerate(sentence[:self.max_sent_len]):
                for char_i, char in enumerate(token[:self.max_token_len + 1]):
                    inputs[sent_i, token_i, char_i + 1] = self.char2id.get(char, 0)
        return inputs

    def __call__(self, sentences):
        """Tag every sentence.

        :param sentences: list of raw sentence strings
        :return: list with one list of tags per sentence; sentences longer than
            ``max_sent_len`` tokens only get tags for their first ``max_sent_len`` tokens
        """
        tokenized_corpus = tokenize_corpus(sentences, min_token_size=1)

        inputs = self._encode(tokenized_corpus)

        # The targets are dummies: predict_with_model expects (input, label) pairs.
        dataset = TensorDataset(inputs, torch.zeros(len(sentences)))
        predicted_probs = predict_with_model(self.model, dataset)  # SentenceN x TagsN x MaxSentLen
        predicted_classes = predicted_probs.argmax(1)

        result = []
        for sent_i, sent in enumerate(tokenized_corpus):
            result.append([self.id2label[cls] for cls in predicted_classes[sent_i, :len(sent)]])
        return result

### Мои наработки

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# import spacy
# !python -m spacy download ru_core_news_md
# spacy_nlp = spacy.load('ru_core_news_md', disable=['parser', 'ner'])

def tokenize_text_spacy_lemmatize(txt, spacy_nlp, min_token_size=4, with_pos = True, remove_stopwords = False):
    """Tokenize and lemmatize a text with a spaCy-style pipeline.

    :param txt: input text
    :param spacy_nlp: callable returning an iterable of tokens that expose
        ``len()``, ``lemma_``, ``pos_`` and ``is_stop``
    :param min_token_size: tokens shorter than this are dropped
    :param with_pos: if True every result is ``"<lemma>_<POS>"``, otherwise just the lemma
    :param remove_stopwords: if True tokens flagged ``is_stop`` are dropped
    :return: list of strings
    """
    def is_kept(token):
        if len(token) < min_token_size:
            return False
        return not (remove_stopwords and token.is_stop)

    selected = [token for token in spacy_nlp(txt) if is_kept(token)]

    if not with_pos:
        return [token.lemma_ for token in selected]
    return [token.lemma_ + '_' + token.pos_ for token in selected]

def tokenize_corpus_convert(tokenized_corpus, converter, addition = False):
    '''
    Apply ``converter`` to every token of a tokenized corpus.

    With ``addition=False`` each document is replaced by its converted tokens;
    with ``addition=True`` each document keeps its original tokens followed by
    the converted ones.

    Example (Porter stemmer):
    import nltk
    ps = nltk.stem.PorterStemmer()
    tokenized_stemmed_corpus = tokenize_corpus_convert(tokenized_corpus, converter=ps.stem)

    Example (Snowball stemmer):
    import nltk
    sno = nltk.stem.SnowballStemmer('english')
    tokenized_stemmed_corpus = tokenize_corpus_convert(tokenized_corpus, converter=sno.stem)

    Example (WordNet lemmatizer):
    import nltk
    lemma = nltk.stem.WordNetLemmatizer()
    tokenized_lemmas_corpus = tokenize_corpus_convert(tokenized_corpus, converter=lemma.lemmatize)
    '''
    output = []
    for doc in tokenized_corpus:
        converted = list(map(converter, doc))
        # Either only the converted tokens, or the originals extended by them.
        output.append(doc + converted if addition else converted)
    return output

def show_experiments_stats(histories, figsize = (16.0, 6.0), show_plots = True, only_BEST_MODEL_CALC = False):
    """Print summary lines and draw learning curves for a set of experiments.

    :param histories: dict experiment name -> history with per-epoch lists under
        ``['acc'|'loss']['train'|'val']`` and a ``'BEST'`` entry
    :param figsize: value written to matplotlib's global ``figure.figsize`` setting
    :param show_plots: if True four figures are drawn (accuracy and loss, each
        validation-only and validation + train)
    :param only_BEST_MODEL_CALC: if True the "max val acc" / "min val loss" lines
        are skipped and only the recomputed best-model line is printed
    """
    matplotlib.rcParams['figure.figsize'] = figsize

    def report_epoch(template, history, epoch):
        # One summary line with the metrics recorded for ``epoch``.
        print(template.format(epoch,
                              history['acc']['val'][epoch],
                              history['acc']['train'][epoch],
                              history['loss']['val'][epoch],
                              history['loss']['train'][epoch]))

    def plot_curves(metric, splits, title):
        # One figure with the requested curves of every experiment.
        for experiment_id, history in histories.items():
            for split in splits:
                plt.plot(history[metric][split], label=experiment_id + ' ' + split)
        plt.legend()
        plt.title(title)
        plt.show()

    for experiment_id, history in histories.items():
        print('{:-<100}'.format(experiment_id))

        if not only_BEST_MODEL_CALC:
            epoch_max_acc = np.array(history['acc']['val']).argmax()
            report_epoch('Max val acc on:    Epoch = {:>3},   ACCURACY: val  = {:.3f}, train = {:.3f},   LOSS: val  = {:.3f}, train = {:.3f}',
                         history, epoch_max_acc)
            epoch_min_loss = np.array(history['loss']['val']).argmin()
            report_epoch('Min val loss on:   Epoch = {:>3},   ACCURACY: val  = {:.3f}, train = {:.3f},   LOSS: val  = {:.3f}, train = {:.3f}',
                         history, epoch_min_loss)

        best = history['BEST']
        if 'acc' in best:
            print("BEST MODEL CALC:   Epoch = {:>3},   ACCURACY: test = {:.3f}, train = {:.3f},   LOSS: test = {:.3f}, train = {:.3f}  DICT SIZE = {}"\
                  .format(best['epoch'],
                          best['acc']['test'],
                          best['acc']['train'],
                          best['loss']['test'],
                          best['loss']['train'],
                          best['dict_size']))

    if not show_plots:
        return

    plot_curves('acc', ('val',), 'Validation Accuracy (Val only)')
    plot_curves('acc', ('val', 'train'), 'Validation Accuracy (Val/Train)')
    plot_curves('loss', ('val',), 'Validation Loss (Val only)')
    plot_curves('loss', ('val', 'train'), 'Validation Loss (Val/Train)')

def run_most_sumilars(func_most_similars, words_list, verbose = True, **kwargs):
    """Run a "most similar" query for every word and collect the results.

    :param func_most_similars: callable ``f(word, **kwargs)`` returning an iterable of results
    :param words_list: iterable of query words
    :param verbose: if True print every word followed by its results, one per line
    :param kwargs: extra keyword arguments forwarded to ``func_most_similars``
    :return: dict word -> result of ``func_most_similars``
    """
    most_similars = {}
    for word in words_list:
        most_similars[word] = func_most_similars(word, **kwargs)

    if not verbose:
        return most_similars

    for word, similars in most_similars.items():
        print('{}:'.format(word))
        print('\n'.join([str(entry) for entry in similars]))
        print(' ')
    return most_similars
        

#https://stackoverflow.com/questions/4529815/saving-an-object-data-persistence/4529901
import pickle
def save_object(obj, filename):
    """Pickle ``obj`` into the file ``filename`` using the highest pickle protocol.

    Any existing file at ``filename`` is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_object(filename):
    """Read one pickled object from the file ``filename`` and return it.

    NOTE: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

# sample usage
#company1 = [1,2,3,4,5]
#save_object(company1, '/kaggle/working/company1.pkl')
#del company1
#company1 = load_object('/kaggle/working/company1.pkl')

In [3]:
# Starts empty; the reporting/plotting code earlier in the notebook reads entries from it
# by experiment id (e.g. histories[experiment_id]['acc']['val']).
histories = {}

# Свёрточные нейросети и POS-теггинг

POS-теггинг — определение частей речи (снятие частеречной неоднозначности)

In [7]:
!pip install pyconll
# !pip install spacy_udpipe

In [8]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import sys; sys.path.append('./stepik-dl-nlp')

from sklearn.metrics import classification_report

import numpy as np

import pyconll

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset

# import dlnlputils
# from dlnlputils.data import tokenize_corpus, build_vocabulary, \
#     character_tokenize, pos_corpus_to_tensor, POSTagger
# from dlnlputils.pipeline import train_eval_loop, predict_with_model, init_random_seed

init_random_seed()

## Загрузка текстов и разбиение на обучающую и тестовую подвыборки

In [16]:
# Если Вы запускаете ноутбук на colab или kaggle, добавьте в начало пути ./stepik-dl-nlp
!mkdir ./stepik-dl-nlp/
!mkdir ./stepik-dl-nlp/datasets/
!wget -O ./stepik-dl-nlp/datasets/ru_syntagrus-ud-train.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-a.conllu
!wget -O ./stepik-dl-nlp/datasets/ru_syntagrus-ud-dev.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu

In [17]:
# If you run the notebook on Colab or Kaggle, keep the ./stepik-dl-nlp prefix on these paths.
# Parse the downloaded CoNLL-U files; the "dev" file is used as the test set in this notebook.
full_train = pyconll.load_from_file('./stepik-dl-nlp/datasets/ru_syntagrus-ud-train.conllu')
full_test = pyconll.load_from_file('./stepik-dl-nlp/datasets/ru_syntagrus-ud-dev.conllu')
# Number of sentences in each corpus.
print(len(full_train), len(full_test))

In [18]:
# Print every token of the first two training sentences as "<form> <upos tag>",
# with an empty line after each sentence.
for sent in full_train[:2]:
    for token in sent:
        print(token.form, token.upos)
    print()

In [19]:
# Size limits taken from the training corpus only:
# the longest sentence (in tokens) and the longest token form (in characters).
MAX_SENT_LEN = max(len(sent) for sent in full_train)
MAX_ORIG_TOKEN_LEN = max(len(token.form) for sent in full_train for token in sent)
print('Наибольшая длина предложения', MAX_SENT_LEN)
print('Наибольшая длина токена', MAX_ORIG_TOKEN_LEN)

In [20]:
# Rebuild each training sentence as plain text by joining its token forms with single spaces.
all_train_texts = [' '.join(token.form for token in sent) for sent in full_train]
# Show the first ten reconstructed sentences.
print('\n'.join(all_train_texts[:10]))

In [23]:
# Split every training sentence into individual characters (character_tokenize is list(txt)).
train_char_tokenized = tokenize_corpus(all_train_texts, tokenizer=character_tokenize)
# Build the character vocabulary.
# NOTE(review): the exact meaning of max_doc_freq / min_count / pad_word is defined by
# build_vocabulary earlier in the file — confirm there.
char_vocab, word_doc_freq = build_vocabulary(train_char_tokenized, max_doc_freq=1.0, min_count=5, pad_word='<PAD>')
print("Количество уникальных символов", len(char_vocab))
print(list(char_vocab.items())[:10])

In [24]:
# Tag inventory: the artificial '<NOTAG>' label comes first (id 0), followed by the sorted
# set of upos values seen in the training corpus; tokens with an empty upos are skipped.
UNIQUE_TAGS = ['<NOTAG>'] + sorted({token.upos for sent in full_train for token in sent if token.upos})
# Map each tag to its position in UNIQUE_TAGS.
label2id = {label: i for i, label in enumerate(UNIQUE_TAGS)}
label2id  # displayed as the cell output

In [28]:
# Encode both corpora with the same character vocabulary, label map and size limits
# (the limits come from the training corpus), then wrap them as TensorDatasets.
# NOTE(review): the tensor layout is decided by pos_corpus_to_tensor (defined earlier in the
# file); the models below document their input as BatchSize x MaxSentenceLen x MaxTokenLen.
train_inputs, train_labels = pos_corpus_to_tensor(full_train, char_vocab, label2id, MAX_SENT_LEN, MAX_ORIG_TOKEN_LEN)
train_dataset = TensorDataset(train_inputs, train_labels)

test_inputs, test_labels = pos_corpus_to_tensor(full_test, char_vocab, label2id, MAX_SENT_LEN, MAX_ORIG_TOKEN_LEN)
test_dataset = TensorDataset(test_inputs, test_labels)

In [29]:
# Inspect the encoded input at index 1: its first five rows.
# NOTE(review): rows presumably correspond to tokens of that sentence — confirm in pos_corpus_to_tensor.
train_inputs[1][:5]

In [30]:
# Inspect the encoded labels at the same index (1).
train_labels[1]

## Вспомогательная свёрточная архитектура

In [31]:
class StackedConv1d(nn.Module):
    """Stack of residual 1D convolution blocks that keep the number of features.

    Every block is ``conv_layer -> Dropout -> LeakyReLU``; ``forward`` adds each block's
    output to its input.

    Args:
        features_num: number of input and output channels of every convolution.
        layers_n: number of residual blocks.
        kernel_size: kernel size handed to ``conv_layer``; the padding is ``kernel_size // 2``.
        conv_layer: constructor called as
            ``conv_layer(features_num, features_num, kernel_size, padding=...)``.
        dropout: dropout probability used inside every block.
    """

    def __init__(self, features_num, layers_n=1, kernel_size=3, conv_layer=nn.Conv1d, dropout=0.0):
        super().__init__()
        blocks = [
            nn.Sequential(
                conv_layer(features_num, features_num, kernel_size, padding=kernel_size // 2),
                nn.Dropout(dropout),
                nn.LeakyReLU(),
            )
            for _ in range(layers_n)
        ]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """x - BatchSize x FeaturesNum x SequenceLen"""
        out = x
        for block in self.layers:
            # Residual connection: the block output is added to the block input.
            out = out + block(out)
        return out

## Предсказание частей речи на уровне отдельных токенов

In [32]:
class SingleTokenPOSTagger(nn.Module):
    """Character-level tagger that scores every token without looking at its neighbours.

    Characters are embedded, passed through a StackedConv1d backbone, max-pooled over the
    token length and mapped to label logits by a linear layer.

    Args:
        vocab_size: number of rows of the character embedding table (index 0 is padding).
        labels_num: number of output labels.
        embedding_size: size of the character embeddings and of all hidden features.
        **kwargs: forwarded to StackedConv1d.
    """

    def __init__(self, vocab_size, labels_num, embedding_size=32, **kwargs):
        super().__init__()
        self.labels_num = labels_num
        self.char_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.backbone = StackedConv1d(embedding_size, **kwargs)
        self.global_pooling = nn.AdaptiveMaxPool1d(1)
        self.out = nn.Linear(embedding_size, labels_num)

    def forward(self, tokens):
        """tokens - BatchSize x MaxSentenceLen x MaxTokenLen"""
        n_sents, sent_len, token_len = tokens.shape
        # Every token of every sentence becomes an independent sample.
        flat_tokens = tokens.view(n_sents * sent_len, token_len)

        # Embedding output is N x MaxTokenLen x EmbSize; the convolutions need N x EmbSize x MaxTokenLen.
        embedded = self.char_embeddings(flat_tokens).permute(0, 2, 1)

        # Max over the character axis leaves one feature vector per token: N x EmbSize.
        pooled = self.global_pooling(self.backbone(embedded)).squeeze(-1)

        flat_logits = self.out(pooled)  # N x LabelsNum
        per_sentence = flat_logits.view(n_sents, sent_len, self.labels_num)
        return per_sentence.permute(0, 2, 1)  # BatchSize x LabelsNum x MaxSentenceLen

In [33]:
# Single-token tagger: 64-dimensional character embeddings and three residual conv blocks.
single_token_model = SingleTokenPOSTagger(len(char_vocab), len(label2id), embedding_size=64, layers_n=3, kernel_size=3, dropout=0.3)
# Tensor.numel() counts elements directly; np.product was removed in NumPy 2.0.
print('Количество параметров', sum(t.numel() for t in single_token_model.parameters()))

In [34]:
# Train the single-token tagger; the two values returned by train_eval_loop are bound to
# best_val_loss and best_single_token_model.
(best_val_loss,
 best_single_token_model) = train_eval_loop(single_token_model,
                                            train_dataset,
                                            test_dataset,
                                            F.cross_entropy,
                                            lr=5e-3,
                                            epoch_n=10,
                                            batch_size=64,
                                            # Fall back to the CPU instead of failing when no GPU is available.
                                            device='cuda' if torch.cuda.is_available() else 'cpu',
                                            early_stopping_patience=5,
                                            max_batches_per_epoch_train=500,
                                            max_batches_per_epoch_val=100,
                                            # Halve the learning rate after 2 epochs without improvement.
                                            lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=2,
                                                                                                                       factor=0.5,
                                                                                                                       verbose=True))

In [36]:
# Если Вы запускаете ноутбук на colab или kaggle, добавьте в начало пути ./stepik-dl-nlp
!mkdir ./stepik-dl-nlp/models/
torch.save(best_single_token_model.state_dict(), './stepik-dl-nlp/models/single_token_pos.pth')

In [None]:
# If you run the notebook on Colab or Kaggle, keep the ./stepik-dl-nlp prefix on this path.
# Load the weights stored in single_token_pos.pth into single_token_model.
single_token_model.load_state_dict(torch.load('./stepik-dl-nlp/models/single_token_pos.pth'))

In [37]:
# Training set: mean cross-entropy plus a per-tag classification report.
train_pred = predict_with_model(single_token_model, train_dataset)
# train_labels is already a tensor, so it is passed as is: torch.tensor(<tensor>) would
# only make a needless copy and emit a copy-construct warning.
train_loss = F.cross_entropy(torch.tensor(train_pred),
                             train_labels)
print('Среднее значение функции потерь на обучении', float(train_loss))
# argmax(1) picks the best label per position (axis 1 holds the label scores).
print(classification_report(train_labels.view(-1), train_pred.argmax(1).reshape(-1), target_names=UNIQUE_TAGS))
print()

# Same metrics on the held-out set.
test_pred = predict_with_model(single_token_model, test_dataset)
test_loss = F.cross_entropy(torch.tensor(test_pred),
                            test_labels)
print('Среднее значение функции потерь на валидации', float(test_loss))
print(classification_report(test_labels.view(-1), test_pred.argmax(1).reshape(-1), target_names=UNIQUE_TAGS))

## Предсказание частей речи на уровне предложений (с учётом контекста)

In [38]:
class SentenceLevelPOSTagger(nn.Module):
    """Tagger that combines per-token character features with sentence-level context.

    A first StackedConv1d runs over the characters of every token; the max-pooled token
    vectors are then processed by a second StackedConv1d along the sentence, and a
    kernel-size-1 convolution maps every position to label logits.

    Args:
        vocab_size: number of rows of the character embedding table (index 0 is padding).
        labels_num: number of output labels.
        embedding_size: size of the character embeddings and of all hidden features.
        single_backbone_kwargs: optional dict of keyword arguments for the character-level
            StackedConv1d; ``None`` means no extra arguments.
        context_backbone_kwargs: optional dict of keyword arguments for the sentence-level
            StackedConv1d; ``None`` means no extra arguments.
    """

    def __init__(self, vocab_size, labels_num, embedding_size=32, single_backbone_kwargs=None, context_backbone_kwargs=None):
        super().__init__()
        # None replaces the former mutable `{}` defaults, which are shared between calls.
        if single_backbone_kwargs is None:
            single_backbone_kwargs = {}
        if context_backbone_kwargs is None:
            context_backbone_kwargs = {}

        self.embedding_size = embedding_size
        self.char_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.single_token_backbone = StackedConv1d(embedding_size, **single_backbone_kwargs)
        self.context_backbone = StackedConv1d(embedding_size, **context_backbone_kwargs)
        self.global_pooling = nn.AdaptiveMaxPool1d(1)
        # Kernel size 1: an independent linear classifier applied at every sentence position.
        self.out = nn.Conv1d(embedding_size, labels_num, 1)
        self.labels_num = labels_num

    def forward(self, tokens):
        """tokens - BatchSize x MaxSentenceLen x MaxTokenLen"""
        batch_size, max_sent_len, max_token_len = tokens.shape
        # Every token of every sentence is first processed as an independent sample.
        tokens_flat = tokens.view(batch_size * max_sent_len, max_token_len)

        char_embeddings = self.char_embeddings(tokens_flat)  # BatchSize*MaxSentenceLen x MaxTokenLen x EmbSize
        char_embeddings = char_embeddings.permute(0, 2, 1)  # BatchSize*MaxSentenceLen x EmbSize x MaxTokenLen
        char_features = self.single_token_backbone(char_embeddings)

        # Max over the character axis leaves one feature vector per token.
        token_features_flat = self.global_pooling(char_features).squeeze(-1)  # BatchSize*MaxSentenceLen x EmbSize

        # Restore the sentence structure so the second backbone can mix neighbouring tokens.
        token_features = token_features_flat.view(batch_size, max_sent_len, self.embedding_size)  # BatchSize x MaxSentenceLen x EmbSize
        token_features = token_features.permute(0, 2, 1)  # BatchSize x EmbSize x MaxSentenceLen
        context_features = self.context_backbone(token_features)  # BatchSize x EmbSize x MaxSentenceLen

        logits = self.out(context_features)  # BatchSize x LabelsNum x MaxSentenceLen
        return logits

In [39]:
# Sentence-level tagger: the same backbone settings for the character and the context stacks.
sentence_level_model = SentenceLevelPOSTagger(len(char_vocab), len(label2id), embedding_size=64,
                                              single_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.3),
                                              context_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.3))
# Tensor.numel() counts elements directly; np.product was removed in NumPy 2.0.
print('Количество параметров', sum(t.numel() for t in sentence_level_model.parameters()))

In [40]:
# Train the sentence-level tagger; the two values returned by train_eval_loop are bound to
# best_val_loss and best_sentence_level_model.
(best_val_loss,
 best_sentence_level_model) = train_eval_loop(sentence_level_model,
                                              train_dataset,
                                              test_dataset,
                                              F.cross_entropy,
                                              lr=5e-3,
                                              epoch_n=10,
                                              batch_size=64,
                                              # Fall back to the CPU instead of failing when no GPU is available.
                                              device='cuda' if torch.cuda.is_available() else 'cpu',
                                              early_stopping_patience=5,
                                              max_batches_per_epoch_train=500,
                                              max_batches_per_epoch_val=100,
                                              # Halve the learning rate after 2 epochs without improvement.
                                              lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=2,
                                                                                                                         factor=0.5,
                                                                                                                         verbose=True))

In [43]:
# If you run the notebook on Colab or Kaggle, keep the ./stepik-dl-nlp prefix on this path.
# Persist the weights of the best sentence-level model; the target directory must already exist.
torch.save(best_sentence_level_model.state_dict(), './stepik-dl-nlp/models/sentence_level_pos.pth')

In [None]:
# If you run the notebook on Colab or Kaggle, keep the ./stepik-dl-nlp prefix on this path.
# Load the weights stored in sentence_level_pos.pth into sentence_level_model.
sentence_level_model.load_state_dict(torch.load('./stepik-dl-nlp/models/sentence_level_pos.pth'))

In [44]:
# Training set: mean cross-entropy plus a per-tag classification report.
train_pred = predict_with_model(sentence_level_model, train_dataset)
# train_labels is already a tensor, so it is passed as is: torch.tensor(<tensor>) would
# only make a needless copy and emit a copy-construct warning.
train_loss = F.cross_entropy(torch.tensor(train_pred),
                             train_labels)
print('Среднее значение функции потерь на обучении', float(train_loss))
# argmax(1) picks the best label per position (axis 1 holds the label scores).
print(classification_report(train_labels.view(-1), train_pred.argmax(1).reshape(-1), target_names=UNIQUE_TAGS))
print()

# Same metrics on the held-out set.
test_pred = predict_with_model(sentence_level_model, test_dataset)
test_loss = F.cross_entropy(torch.tensor(test_pred),
                            test_labels)
print('Среднее значение функции потерь на валидации', float(test_loss))
print(classification_report(test_labels.view(-1), test_pred.argmax(1).reshape(-1), target_names=UNIQUE_TAGS))

## Применение полученных теггеров и сравнение

In [49]:
# Wrap both trained models in POSTagger (defined earlier in the file) with the same
# vocabulary, tag list and size limits, so they can be called on raw sentences.
single_token_pos_tagger = POSTagger(single_token_model, char_vocab, UNIQUE_TAGS, MAX_SENT_LEN, MAX_ORIG_TOKEN_LEN)
sentence_level_pos_tagger = POSTagger(sentence_level_model, char_vocab, UNIQUE_TAGS, MAX_SENT_LEN, MAX_ORIG_TOKEN_LEN)

In [50]:
# Hand-written probe sentences; several repeat the same word form in different roles
# (e.g. "косой", "печь", "течь", "три"), and two are built from made-up words.
test_sentences = [
    'Мама мыла раму.',
    'Косил косой косой косой.',
    'Глокая куздра штеко будланула бокра и куздрячит бокрёнка.',
    'Сяпала Калуша с Калушатами по напушке.',
    'Пирожки поставлены в печь, мама любит печь.',
    'Ведро дало течь, вода стала течь.',
    'Три да три, будет дырка.',
    'Три да три, будет шесть.',
    'Сорок сорок'
]
# Default regex tokenizer: lower-cases the text; min_token_size=1 keeps one-character tokens.
test_sentences_tokenized = tokenize_corpus(test_sentences, min_token_size=1)

In [51]:
# Print "token-tag" pairs predicted by the single-token tagger, one sentence per line.
# NOTE(review): the pairing assumes POSTagger tokenizes the raw sentences the same way as
# test_sentences_tokenized — confirm in POSTagger.
for sent_tokens, sent_tags in zip(test_sentences_tokenized, single_token_pos_tagger(test_sentences)):
    print(' '.join('{}-{}'.format(tok, tag) for tok, tag in zip(sent_tokens, sent_tags)))
    print()

In [52]:
# Print "token-tag" pairs predicted by the sentence-level tagger, one sentence per line.
# NOTE(review): the pairing assumes POSTagger tokenizes the raw sentences the same way as
# test_sentences_tokenized — confirm in POSTagger.
for sent_tokens, sent_tags in zip(test_sentences_tokenized, sentence_level_pos_tagger(test_sentences)):
    print(' '.join('{}-{}'.format(tok, tag) for tok, tag in zip(sent_tokens, sent_tags)))
    print()

## Свёрточный модуль своими руками

In [53]:
class MyConv1d(nn.Module):
    """Hand-written 1D convolution computed as a single batched matrix product.

    Shifted views of the input are stacked along the channel axis, so every output position
    sees a vector of ``in_channels * kernel_size`` values that is multiplied by ``weight``.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size: width of the sliding window.
        padding: number of zeros added on both sides of the sequence.
    """

    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding

        window_features = in_channels * kernel_size
        # Rows are ordered by kernel offset first, input channel second; columns are output channels.
        self.weight = nn.Parameter(torch.randn(window_features, out_channels) / window_features,
                                   requires_grad=True)
        self.bias = nn.Parameter(torch.zeros(out_channels), requires_grad=True)

    def forward(self, x):
        """x - BatchSize x InChannels x SequenceLen"""
        batch_size, channels, _ = x.shape
        if self.padding > 0:
            zeros = x.new_zeros(batch_size, channels, self.padding)
            x = torch.cat([zeros, x, zeros], dim=-1)

        # Number of window positions along the (possibly padded) sequence.
        n_positions = x.shape[-1] - self.kernel_size + 1
        windows = [x[:, :, shift:shift + n_positions] for shift in range(self.kernel_size)]

        # BatchSize x InChannels * KernelSize x Positions -> BatchSize x Positions x InChannels * KernelSize
        stacked = torch.cat(windows, dim=1).permute(0, 2, 1)
        batched_weight = self.weight.unsqueeze(0).expand(batch_size, -1, -1)
        projected = torch.bmm(stacked, batched_weight) + self.bias.unsqueeze(0).unsqueeze(0)
        return projected.permute(0, 2, 1)  # BatchSize x OutChannels x Positions

In [54]:
# Same sentence-level architecture, with the hand-written MyConv1d replacing nn.Conv1d in both stacks.
sentence_level_model_my_conv = SentenceLevelPOSTagger(len(char_vocab), len(label2id), embedding_size=64,
                                                      single_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.3, conv_layer=MyConv1d),
                                                      context_backbone_kwargs=dict(layers_n=3, kernel_size=3, dropout=0.3, conv_layer=MyConv1d))
# Tensor.numel() counts elements directly; np.product was removed in NumPy 2.0.
print('Количество параметров', sum(t.numel() for t in sentence_level_model_my_conv.parameters()))

In [57]:
# Train the MyConv1d variant; the two values returned by train_eval_loop are bound to
# best_val_loss and best_sentence_level_model_my_conv.
(best_val_loss,
 best_sentence_level_model_my_conv) = train_eval_loop(sentence_level_model_my_conv,
                                                      train_dataset,
                                                      test_dataset,
                                                      F.cross_entropy,
                                                      lr=5e-3,
                                                      epoch_n=10,
                                                      batch_size=64,
                                                      # Fall back to the CPU instead of failing when no GPU is available.
                                                      device='cuda' if torch.cuda.is_available() else 'cpu',
                                                      early_stopping_patience=5,
                                                      max_batches_per_epoch_train=500,
                                                      max_batches_per_epoch_val=100,
                                                      # Halve the learning rate after 2 epochs without improvement.
                                                      lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=2,
                                                                                                                                 factor=0.5,
                                                                                                                                 verbose=True))

In [58]:
# Training set: mean cross-entropy plus a per-tag classification report.
train_pred = predict_with_model(best_sentence_level_model_my_conv, train_dataset)
# train_labels is already a tensor, so it is passed as is: torch.tensor(<tensor>) would
# only make a needless copy and emit a copy-construct warning.
train_loss = F.cross_entropy(torch.tensor(train_pred),
                             train_labels)
print('Среднее значение функции потерь на обучении', float(train_loss))
# argmax(1) picks the best label per position (axis 1 holds the label scores).
print(classification_report(train_labels.view(-1), train_pred.argmax(1).reshape(-1), target_names=UNIQUE_TAGS))
print()

# Same metrics on the held-out set.
test_pred = predict_with_model(best_sentence_level_model_my_conv, test_dataset)
test_loss = F.cross_entropy(torch.tensor(test_pred),
                            test_labels)
print('Среднее значение функции потерь на валидации', float(test_loss))
print(classification_report(test_labels.view(-1), test_pred.argmax(1).reshape(-1), target_names=UNIQUE_TAGS))