# Imports and data reading

In [1]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 4.4 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6
  Downloading docopt-0.6.2.tar.gz (25 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 52.2 MB/s 
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13723 sha256=4fd2c97b5b2a8a4482e5304ae46731ecd772b7dabcd2c8873c563103d361db79
  Stored in directory: /root/.cache/pip/wheels/72/b0/3f/1d95f96ff986c7dfffe46ce2be4062f38ebd04b506c77c81b9
Successfully built docopt
Installing collected 

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm
from math import sqrt

import seaborn as sns
from matplotlib import pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import pymorphy2
from nltk.tokenize import word_tokenize, wordpunct_tokenize
import nltk
nltk.download('punkt')

morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
!wget https://raw.githubusercontent.com/semensorokin/DLforNLP_course_material/master/Homework2/answers_subsample.csv
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
!gzip -d cc.ru.300.vec.gz

--2022-11-30 15:03:21--  https://raw.githubusercontent.com/semensorokin/DLforNLP_course_material/master/Homework2/answers_subsample.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28717126 (27M) [text/plain]
Saving to: ‘answers_subsample.csv’


2022-11-30 15:03:21 (384 MB/s) - ‘answers_subsample.csv’ saved [28717126/28717126]

--2022-11-30 15:03:21--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1306357571 (1.2G) [binary/octet-stream]
Saving to: ‘cc.ru.300.vec.gz’


2022-11-30 15

In [5]:
data = pd.read_csv('answers_subsample.csv')
data.head()

Unnamed: 0,category,text
0,business,Могут ли в россельхозбанке дать в залог норков...
1,law,Может ли срочник перевестись на контракт после...
2,business,Продажа недвижимости по ипотеки ? ( арестованы...
3,business,"В чем смысл криптовалюты, какая от неё выгода ..."
4,law,часть 1 статья 158 похитил телефон


# Data processing, embeddings getting

In [6]:
def process_text(text, cache):
    words = wordpunct_tokenize(text.lower())
    new_words = []
    for i in range(len(words)):
        el = words[i]
        if el.isalpha():
            el = el.lower().strip()
            if el not in cache:
                cache[el] = morph.parse(el)[0].normal_form
            el = cache[el]
            new_words.append(el)
    return new_words, cache

In [7]:
def process_all_texts(data):
    word2form = {}
    words = set()
    for text in tqdm(data.text):
        word, word2form = process_text(text, word2form)
        words.update(word)
    return words, word2form

In [8]:
def get_all_embeddings(word2form, word2vec):
    vectors = []
    word2index = {'PAD': 0}

    n_words, embedding_dim = word2vec.readline().split()
    n_words, embedding_dim = int(n_words), int(embedding_dim)

    # Zero vector for PAD
    vectors.append(np.zeros((1, embedding_dim)))

    progress_bar = tqdm(desc='Read word2vec', total=n_words)
    
    line = word2vec.readline().strip()
    c = 0
    while line:            
        current_parts = line.split()
        current_word = ' '.join(current_parts[:-embedding_dim])
        if current_word in word2form:
            word2index[current_word] = len(word2index)

            current_vectors = current_parts[-embedding_dim:]
            current_vectors = np.array(list(map(float, current_vectors)))
            current_vectors = np.expand_dims(current_vectors, 0)

            vectors.append(current_vectors)
        else:
            c += 1
        progress_bar.update(1)

        line = word2vec.readline().strip()
    print(c)
    
    progress_bar.close()
    vectors = np.concatenate(vectors)

    return vectors, word2index

In [9]:
words, word2form = process_all_texts(data)

  0%|          | 0/237779 [00:00<?, ?it/s]

In [10]:
word2vec_file = open('cc.ru.300.vec')
vectors, word2index = get_all_embeddings(words, word2vec_file)
word2vec_file.close()

Read word2vec:   0%|          | 0/2000000 [00:00<?, ?it/s]

1955984


In [11]:
cat_mapper = {cat: n for n, cat in enumerate(data.category.unique())}
data.category = data.category.map(cat_mapper)

# Dataset

In [12]:
class WordData(Dataset):
    
    def __init__(self, x_data, y_data, word2index, sequence_length=32, pad_token='PAD', verbose=True):
        
        super().__init__()
        
        self.x_data = []
        self.y_data = y_data
        
        self.word2index = word2index
        self.sequence_length = sequence_length
        
        self.pad_token = pad_token
        self.pad_index = self.word2index[self.pad_token]
        
        self.load(x_data, verbose=verbose)
        
    @staticmethod
    def process_text(text):
        global word2form
        words = wordpunct_tokenize(text.lower())
        new_words = []
        for i in range(len(words)):
            el = words[i]
            if el.isalpha():
                el = el.lower().strip()
                if el not in word2form:
                    word2form[el] = morph.parse(el)[0].normal_form
                el = word2form[el]
                new_words.append(el)
        return new_words
        
    def load(self, data, verbose=True):
        
        data_iterator = tqdm(data, desc='Loading data', disable=not verbose)
        
        for text in data_iterator:
            
            words = self.process_text(text)
            
            indexed_words = self.indexing(words)
            
            self.x_data.append(indexed_words)
    
    def indexing(self, tokenized_text):

        # здесь мы не используем токен UNK, потому что мы его специально не учили
        # становится непонятно какой же эмбеддинг присвоить неизвестному слову,
        # поэтому просто выбрасываем наши неизветсные слова
        
        return [self.word2index[word] for word in tokenized_text if word in self.word2index]
    
    def padding(self, sequence):
        
        # Ограничить длину self.sequence_length
        # если длина меньше максимально - западить
        if len(sequence)< self.sequence_length:
          add_pad = self.sequence_length - len(sequence)
          return sequence+[self.pad_index]*add_pad
        else:
          return sequence[:self.sequence_length]
    
    def __len__(self):
        
        return len(self.x_data)
    
    def __getitem__(self, idx):
        
        x = self.x_data[idx]
        x = self.padding(x)
        x = torch.Tensor(x).long()
        
        y = self.y_data[idx]
        
        return x, y

# Model class

In [13]:
class ModelWithAtt(torch.nn.Module):
  def __init__(self, matrix_w, n, hidden_size=256): #n - количетсво категорий
        
        super().__init__()

        self.n = n
        input_size = matrix_w.shape[1]

        self.emb_layer = torch.nn.Embedding.from_pretrained(torch.Tensor(matrix_w))

        self.LSTM = torch.nn.LSTM(input_size=input_size, 
                                  hidden_size=hidden_size, 
                                  num_layers=2,
                                  bidirectional=True,
                                  dropout=0.1, 
                                  batch_first=True) # задайте лстм, можно 2 уровня, лучше бидирекциональный, в доке торча есть инофрмация как это сделать в одну строчку
        
        self.q_proj = torch.nn.Linear(in_features=hidden_size * 2, out_features=256, bias=True)# три линейных преобразования, размерность совпадает с выходом из лстм (если БИлстм то надо умножить ее на 2)
        self.k_proj = torch.nn.Linear(in_features=hidden_size * 2, out_features=256, bias=True)
        self.v_proj = torch.nn.Linear(in_features=hidden_size * 2, out_features=256, bias=True)

        self.att_soft = torch.nn.Softmax(dim = 2)
        
        self.cnn_3gr = torch.nn.Conv1d(256, 128, kernel_size=(3,), stride=(1,))# три конволюционных фильтра с разными ядрами (3,4,5) чтобы были всякие нграммы ловить
        self.cnn_4gr = torch.nn.Conv1d(256, 128, kernel_size=(4,), stride=(1,))
        self.cnn_5gr = torch.nn.Conv1d(256, 128, kernel_size=(5,), stride=(1,))

        self.linear_1 = torch.nn.Linear(in_features=384, out_features=256, bias=True)# сверху накидываем два полносвязных слоя для классификации
        self.relu = torch.nn.ReLU()
        self.linear_2 = torch.nn.Linear(in_features=256, out_features=n, bias=True) 

        
  def forward(self, x):
      # torch.Size([64, 32])
      x_emb = self.emb_layer(x) # примените эмбеддинги 
      # torch.Size([64, 32, 300])
      # транспонируйте тензор для лстм как было описано выше
      x, _ = self.LSTM(x_emb) # применим лстм, не забываем что на выходе у него много всяких последовательностей, нам нужна только эта
      # транспонируйте обратно 
      # torch.Size([64, 32, 512])

      x_q = self.q_proj(x) #применим линейные преобразования для селф-эттеншена torch.Size([64, 32, 256])
      x_k = self.k_proj(x) # torch.Size([64, 32, 256])
      x_v = self.v_proj(x) # torch.Size([64, 32, 256])

      att_scores = torch.div(torch.bmm(x_q, x_k.transpose(1, 2)), sqrt(x_k.size(-1)))
      # torch.Size([64, 32, 32])
      # посмотрите в презентацию и перемножьте нужные тензора изспольуя функцию bmm из торча, перед этим одну из матриц обзательно транспонируйте
      # результат обязательно поделите на корень из последней размерности (то есть на рземер эмбеддинга из предыдущего слоя)
      att_dist = self.att_soft(att_scores) # накидываем софтмакс
      # torch.Size([64, 32, 32])
      attention_vectors = torch.bmm(att_scores, x_v)
      # torch.Size([64, 32, 256])

      x_att = attention_vectors.transpose(2,1) #транспонируем для конфолючионнах фильтров
      # torch.Size([64, 256, 32])

      x_cnn3 = self.cnn_3gr(x_att) # torch.Size([64, 128, 30])
      x_cnn4 = self.cnn_4gr(x_att) # torch.Size([64, 128, 29])
      x_cnn5 = self.cnn_5gr(x_att) # torch.Size([64, 128, 28])

      frst, _ =  x_cnn3.max(dim=-1,) # cделаем макс пуллинг torch.Size([64, 128])
      sc, _ = x_cnn4.max(dim=-1,) # torch.Size([64, 128])
      thr, _ = x_cnn5.max(dim=-1,) # torch.Size([64, 128])
      
      x_cat = torch.cat((frst, sc, thr), dim=-1) # а теперь объединим результаты
      # torch.Size([64, 384])
      
      x = self.linear_1(x_cat) # пару полносвязных слоев с релу для классификации
      # torch.Size([64, 256])
      x = self.relu(x) 
      # torch.Size([64, 256]) 
      x = self.linear_2(x)
      # torch.Size([64, 5])
    
      return x

# Training class

In [14]:
class ModelTrainer:
    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def learning_cycle(self, train, valid, epochs=10):
        self.losses = []
        self.best_test_loss = 10.
        self.test_f1 = []
        self.train_losses = []
        for n_epoch in trange(epochs, desc='Epochs'):
            progress_bar = tqdm(total=len(train_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))
            self.train(train, progress_bar)
            progress_bar.close()

            self.valid(valid)

            print()
            print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(self.train_losses), self.mean_test_loss))
            print('F1 test - {:.3f}'.format(self.test_f1[-1]))
                
            # Early stopping:
            if self.mean_test_loss < self.best_test_loss:
                self.best_test_loss = self.mean_test_loss
            else:
                print('Early stopping')
                break

    def train(self, data, pbar):
        
        self.model.train()
        
        for x, y in data:

            x = x.to(device)
            y = y.to(device)
            
            self.optimizer.zero_grad()
            pred = self.model(x)
            loss = self.criterion(pred, y)
            loss.backward()
            self.optimizer.step()
            
            self.train_losses.append(loss.item())
            self.losses.append(loss.item())
            
            pbar.set_postfix(train_loss = np.mean(self.losses[-500:]))
            pbar.update(x.shape[0])

    def valid(self, data):
        test_losses = []
        test_targets = []
        test_pred_class = []
        self.model.eval()
    
        for x, y in data:
            x = x.to(device)
            with torch.no_grad():

                pred = self.model(x)
                pred = pred.cpu()
                test_targets.append(y.numpy())
                test_pred_class.append(np.argmax(pred, axis=1))
                loss = self.criterion(pred, y)
                test_losses.append(loss.item())
        self.mean_test_loss = np.mean(test_losses)

        test_targets = np.concatenate(test_targets).squeeze()
        test_pred_class = np.concatenate(test_pred_class).squeeze()
        f1 = f1_score(test_targets, test_pred_class, average='micro')
        self.test_f1.append(f1)            

# Training

In [15]:
x_train, x_validation, y_train, y_validation = train_test_split(data.text, data.category, test_size=0.1)

train_dataset = WordData(list(x_train), list(y_train), word2index)
train_loader = DataLoader(train_dataset, batch_size=64)

validation_dataset = WordData(list(x_validation), list(y_validation), word2index)
validation_loader = DataLoader(validation_dataset, batch_size=64)

Loading data:   0%|          | 0/214001 [00:00<?, ?it/s]

Loading data:   0%|          | 0/23778 [00:00<?, ?it/s]

In [16]:
n_classes = data.category.unique().shape[0]

In [17]:
model = ModelWithAtt(vectors, n_classes)

In [18]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

model = model.to(device)
criterion = criterion.to(device)

In [19]:
trainer = ModelTrainer(model, criterion, optimizer)

In [20]:
trainer.learning_cycle(train_loader, validation_loader, epochs=10)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/214001 [00:00<?, ?it/s]


Losses: train - 0.625, test - 0.480
F1 test - 0.827


Epoch 2:   0%|          | 0/214001 [00:00<?, ?it/s]


Losses: train - 0.549, test - 0.453
F1 test - 0.839


Epoch 3:   0%|          | 0/214001 [00:00<?, ?it/s]


Losses: train - 0.515, test - 0.441
F1 test - 0.840


Epoch 4:   0%|          | 0/214001 [00:00<?, ?it/s]


Losses: train - 0.492, test - 0.440
F1 test - 0.842


Epoch 5:   0%|          | 0/214001 [00:00<?, ?it/s]


Losses: train - 0.474, test - 0.446
F1 test - 0.839
Early stopping
