# Imports and data reading

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 14.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 52.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm
from math import sqrt
import pickle

import seaborn as sns
from matplotlib import pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [5]:
path = '/gdrive/MyDrive/nnlp/berttens'

In [6]:
!wget https://raw.githubusercontent.com/semensorokin/DLforNLP_course_material/master/Homework2/answers_subsample.csv

--2022-12-01 14:12:42--  https://raw.githubusercontent.com/semensorokin/DLforNLP_course_material/master/Homework2/answers_subsample.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28717126 (27M) [text/plain]
Saving to: ‘answers_subsample.csv’


2022-12-01 14:12:47 (317 MB/s) - ‘answers_subsample.csv’ saved [28717126/28717126]



In [7]:
model_name = 'DeepPavlov/rubert-base-cased-sentence'
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = AutoModel.from_pretrained(model_name).to(device)

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [8]:
data = pd.read_csv('answers_subsample.csv')
data.head()

Unnamed: 0,category,text
0,business,Могут ли в россельхозбанке дать в залог норков...
1,law,Может ли срочник перевестись на контракт после...
2,business,Продажа недвижимости по ипотеки ? ( арестованы...
3,business,"В чем смысл криптовалюты, какая от неё выгода ..."
4,law,часть 1 статья 158 похитил телефон


In [9]:
cat_mapper = {cat: n for n, cat in enumerate(data.category.unique())}
data.category = data.category.map(cat_mapper)

# Dataset

In [10]:
class WordData(Dataset):
    
    def __init__(self, x_data, y_data, sequence_length=32, verbose=True):
        
        super().__init__()
    
        self.x_data = []
        self.y_data = y_data
        self.sequence_length = sequence_length
        self.load(x_data, verbose=verbose)
        
    def process_text(self, text):
        toks = tokenizer(
                    text,
                    add_special_tokens=True,
                    truncation=True,
                    max_length=self.sequence_length,
                    padding='max_length', 
                    return_tensors='pt')
        with torch.no_grad():
            x = bert(**{k: v.to(device) for k, v in toks.items()})
        return x[0].cpu()
        
    def load(self, data, verbose=True):
        data_iterator = tqdm(data, desc='Loading data', disable=not verbose)
        for text in data_iterator:
            indexed_words = self.process_text(text)
            self.x_data.append(indexed_words)
        self.x_data = torch.cat(self.x_data)
    
    def __len__(self):
        return self.x_data.size(0)
    
    def __getitem__(self, idx):
        x = self.x_data[idx]
        y = self.y_data[idx]
        return x, y

# Model class

In [11]:
class ModelWithAtt(torch.nn.Module):
  def __init__(self, n, input_size=768, hidden_size=256): #n - количетсво категорий
        
        super().__init__()

        self.n = n
        
        # self.emb_layer = torch.nn.Embedding.from_pretrained(matrix_w)

        self.LSTM = torch.nn.LSTM(input_size=input_size, 
                                  hidden_size=hidden_size, 
                                  num_layers=2,
                                  bidirectional=True,
                                  dropout=0.1, 
                                  batch_first=True) # задайте лстм, можно 2 уровня, лучше бидирекциональный, в доке торча есть инофрмация как это сделать в одну строчку
        
        self.q_proj = torch.nn.Linear(in_features=hidden_size * 2, out_features=256, bias=True)# три линейных преобразования, размерность совпадает с выходом из лстм (если БИлстм то надо умножить ее на 2)
        self.k_proj = torch.nn.Linear(in_features=hidden_size * 2, out_features=256, bias=True)
        self.v_proj = torch.nn.Linear(in_features=hidden_size * 2, out_features=256, bias=True)

        self.att_soft = torch.nn.Softmax(dim = 2)
        
        self.cnn_3gr = torch.nn.Conv1d(256, 128, kernel_size=(3,), stride=(1,))# три конволюционных фильтра с разными ядрами (3,4,5) чтобы были всякие нграммы ловить
        self.cnn_4gr = torch.nn.Conv1d(256, 128, kernel_size=(4,), stride=(1,))
        self.cnn_5gr = torch.nn.Conv1d(256, 128, kernel_size=(5,), stride=(1,))

        self.linear_1 = torch.nn.Linear(in_features=384, out_features=256, bias=True)# сверху накидываем два полносвязных слоя для классификации
        self.relu = torch.nn.ReLU()
        self.linear_2 = torch.nn.Linear(in_features=256, out_features=n, bias=True) 

        
  def forward(self, x):
      # torch.Size([64, 32])
    #   x_emb = self.emb_layer(x) # примените эмбеддинги 
      # torch.Size([64, 32, 300])
      # транспонируйте тензор для лстм как было описано выше
      x, _ = self.LSTM(x) # применим лстм, не забываем что на выходе у него много всяких последовательностей, нам нужна только эта
      # транспонируйте обратно 
      # torch.Size([64, 32, 512])

      x_q = self.q_proj(x) #применим линейные преобразования для селф-эттеншена torch.Size([64, 32, 256])
      x_k = self.k_proj(x) # torch.Size([64, 32, 256])
      x_v = self.v_proj(x) # torch.Size([64, 32, 256])

      att_scores = torch.div(torch.bmm(x_q, x_k.transpose(1, 2)), sqrt(x_k.size(-1)))
      # torch.Size([64, 32, 32])
      # посмотрите в презентацию и перемножьте нужные тензора изспольуя функцию bmm из торча, перед этим одну из матриц обзательно транспонируйте
      # результат обязательно поделите на корень из последней размерности (то есть на рземер эмбеддинга из предыдущего слоя)
      att_dist = self.att_soft(att_scores) # накидываем софтмакс
      # torch.Size([64, 32, 32])
      attention_vectors = torch.bmm(att_scores, x_v)
      # torch.Size([64, 32, 256])

      x_att = attention_vectors.transpose(2,1) #транспонируем для конфолючионнах фильтров
      # torch.Size([64, 256, 32])

      x_cnn3 = self.cnn_3gr(x_att) # torch.Size([64, 128, 30])
      x_cnn4 = self.cnn_4gr(x_att) # torch.Size([64, 128, 29])
      x_cnn5 = self.cnn_5gr(x_att) # torch.Size([64, 128, 28])

      frst, _ =  x_cnn3.max(dim=-1,) # cделаем макс пуллинг torch.Size([64, 128])
      sc, _ = x_cnn4.max(dim=-1,) # torch.Size([64, 128])
      thr, _ = x_cnn5.max(dim=-1,) # torch.Size([64, 128])
      
      x_cat = torch.cat((frst, sc, thr), dim=-1) # а теперь объединим результаты
      # torch.Size([64, 384])
      
      x = self.linear_1(x_cat) # пару полносвязных слоев с релу для классификации
      # torch.Size([64, 256])
      x = self.relu(x) 
      # torch.Size([64, 256]) 
      x = self.linear_2(x)
      # torch.Size([64, 5])
    
      return x

# Training class

In [12]:
class ModelTrainer:
    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def learning_cycle(self, train, valid, epochs=10):
        self.losses = []
        self.best_test_loss = 10.
        self.test_f1 = []
        self.train_losses = []
        for n_epoch in trange(epochs, desc='Epochs'):
            progress_bar = tqdm(total=len(train_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))
            self.train(train, progress_bar)
            progress_bar.close()

            self.valid(valid)

            print()
            print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(self.train_losses), self.mean_test_loss))
            print('F1 test - {:.3f}'.format(self.test_f1[-1]))
                
            # # Early stopping:
            # if self.mean_test_loss < self.best_test_loss:
            #     self.best_test_loss = self.mean_test_loss
            # else:
            #     print('Early stopping')
            #     break

    def train(self, data, pbar):
        
        self.model.train()
        
        for x, y in data:

            x = x.to(device)
            y = y.to(device)
            
            self.optimizer.zero_grad()
            pred = self.model(x)
            loss = self.criterion(pred, y)
            loss.backward()
            self.optimizer.step()
            
            self.train_losses.append(loss.item())
            self.losses.append(loss.item())
            
            pbar.set_postfix(train_loss = np.mean(self.losses[-500:]))
            pbar.update(x.shape[0])

    def valid(self, data):
        test_losses = []
        test_targets = []
        test_pred_class = []
        self.model.eval()
    
        for x, y in data:
            x = x.to(device)
            with torch.no_grad():

                pred = self.model(x)
                pred = pred.cpu()
                test_targets.append(y.numpy())
                test_pred_class.append(np.argmax(pred, axis=1))
                loss = self.criterion(pred, y)
                test_losses.append(loss.item())
        self.mean_test_loss = np.mean(test_losses)

        test_targets = np.concatenate(test_targets).squeeze()
        test_pred_class = np.concatenate(test_pred_class).squeeze()
        f1 = f1_score(test_targets, test_pred_class, average='micro')
        self.test_f1.append(f1)            

# Training

In [16]:
# 
x_part, _, y_part, _ = train_test_split(data.text, data.category, test_size=0.7)

In [15]:
x_train, x_validation, y_train, y_validation = train_test_split(x_part, y_part, test_size=0.1)

train_dataset = WordData(list(x_train), list(y_train))
train_loader = DataLoader(train_dataset, batch_size=64)

validation_dataset = WordData(list(x_validation), list(y_validation))
validation_loader = DataLoader(validation_dataset, batch_size=64)

Loading data:   0%|          | 0/107000 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
with open(f'{path}/train_dataset.pkl', 'wb') as f:
    pickle.dump(train_dataset, f)
with open(f'{path}/test_dataset.pkl', 'wb') as f:
    pickle.dump(train_dataset, f)

In [50]:
n_classes = data.category.unique().shape[0]

In [56]:
model = ModelWithAtt(n_classes)

In [57]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters())

model = model.to(device)
criterion = criterion.to(device)

In [58]:
trainer = ModelTrainer(model, criterion, optimizer)

In [59]:
trainer.learning_cycle(train_loader, validation_loader, epochs=10)

Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.972, test - 0.842
F1 test - 0.701


Epoch 2:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.864, test - 0.774
F1 test - 0.710


Epoch 3:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.801, test - 0.741
F1 test - 0.710


Epoch 4:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.760, test - 0.755
F1 test - 0.705


Epoch 5:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.731, test - 0.766
F1 test - 0.712


Epoch 6:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.705, test - 0.789
F1 test - 0.697


Epoch 7:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.683, test - 0.869
F1 test - 0.707


Epoch 8:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.663, test - 0.909
F1 test - 0.708


Epoch 9:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.644, test - 1.047
F1 test - 0.684


Epoch 10:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.627, test - 0.885
F1 test - 0.703


Epoch 11:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.611, test - 1.165
F1 test - 0.655


Epoch 12:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.597, test - 1.182
F1 test - 0.674


Epoch 13:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.583, test - 1.367
F1 test - 0.696


Epoch 14:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.570, test - 1.566
F1 test - 0.688


Epoch 15:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.558, test - 1.387
F1 test - 0.707


Epoch 16:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.547, test - 1.147
F1 test - 0.706


Epoch 17:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.535, test - 1.277
F1 test - 0.705


Epoch 18:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.523, test - 1.718
F1 test - 0.682


Epoch 19:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.512, test - 1.688
F1 test - 0.694


Epoch 20:   0%|          | 0/9000 [00:00<?, ?it/s]


Losses: train - 0.501, test - 1.762
F1 test - 0.701
