### Валерия Бунтякова

In [7]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from tqdm.notebook import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from math import sqrt
import torch
from random import seed
from random import choices

## Данные

In [10]:
data = pd.read_csv('answers_subsample.csv')

In [11]:
cat_mapper = {cat: n for n, cat in enumerate(data.category.unique())}
data.category = data.category.map(cat_mapper)

In [12]:
punctuation = list('.,?!"\'\\/') + ['??', '???', '????', '!!', '!!!', '!!!!']

def process_text(text):
    
    words = wordpunct_tokenize(text.lower())
    # words = [word for word in words if word not in punctuation]
    # сначала я убрала всю пунктуацию, а потом поняла что для нее всё равно нет векторов
    # и в итоге она всё равно не используется
    
    return words

In [13]:
word2freq = {}
lengths = []

for text in tqdm(data.text):
    
    words = process_text(text)
    
    lengths.append(len(words))
    
    for word in words:
        
        if word in word2freq:
            word2freq[word] += 1
        else:
            word2freq[word] = 1

HBox(children=(FloatProgress(value=0.0, max=237779.0), HTML(value='')))




In [14]:
word2index = {'PAD': 0}
vectors = []
    
word2vec_file = open('cc.ru.300.vec')
    
n_words, embedding_dim = word2vec_file.readline().split()
n_words, embedding_dim = int(n_words), int(embedding_dim)

# Zero vector for PAD
vectors.append(np.zeros((1, embedding_dim)))

progress_bar = tqdm(desc='Read word2vec', total=n_words)

while True:

    line = word2vec_file.readline().strip()

    if not line:
        break
        
    current_parts = line.split()

    current_word = ' '.join(current_parts[:-embedding_dim])

    if current_word in word2freq:

        word2index[current_word] = len(word2index)

        current_vectors = current_parts[-embedding_dim:]
        current_vectors = np.array(list(map(float, current_vectors)))
        current_vectors = np.expand_dims(current_vectors, 0)

        vectors.append(current_vectors)

    progress_bar.update(1)

progress_bar.close()

word2vec_file.close()

vectors = np.concatenate(vectors)

HBox(children=(FloatProgress(value=0.0, description='Read word2vec', max=2000000.0, style=ProgressStyle(descri…




In [15]:
class WordData(Dataset):
    
    def __init__(self, x_data, y_data, word2index, sequence_length=32, pad_token='PAD', verbose=True):
        
        super().__init__()
        
        self.x_data = []
        self.y_data = y_data
        
        self.word2index = word2index
        self.sequence_length = sequence_length
        
        self.pad_token = pad_token
        self.pad_index = self.word2index[self.pad_token]
        
        self.load(x_data, verbose=verbose)
        
    @staticmethod
    def process_text(text):
        
        words = wordpunct_tokenize(text.lower())
        words = [word for word in words if word not in punctuation]
        return words
        
    def load(self, data, verbose=True):
        
        data_iterator = tqdm(data, desc='Loading data', disable=not verbose)
        
        for text in data_iterator:
            
            words = self.process_text(text)
            
            indexed_words = self.indexing(words)
            
            self.x_data.append(indexed_words)
    
    def indexing(self, tokenized_text):
        return [self.word2index[word] for word in tokenized_text if word in self.word2index]
    
    def padding(self, sequence):
        
        if len(sequence)< self.sequence_length:
            add_pad = self.sequence_length - len(sequence)
            return sequence+[self.pad_index]*add_pad
        else:
            return sequence[:self.sequence_length]
    
    def __len__(self):
        
        return len(self.x_data)
    
    def __getitem__(self, idx):
        
        x = self.x_data[idx]
        x = self.padding(x)
        x = torch.Tensor(x).long()
        
        y = self.y_data[idx]
        
        return x, y

In [16]:
x_train, x_validation, y_train, y_validation = train_test_split(data.text, data.category, test_size=0.1)

train_dataset = WordData(list(x_train), list(y_train), word2index)
train_loader = DataLoader(train_dataset, batch_size=64)

validation_dataset = WordData(list(x_validation), list(y_validation), word2index)
validation_loader = DataLoader(validation_dataset, batch_size=64)

HBox(children=(FloatProgress(value=0.0, description='Loading data', max=214001.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Loading data', max=23778.0, style=ProgressStyle(descripti…




## Модель

In [17]:
class model_with_att(torch.nn.Module):
    def __init__(self, matrix_w, n, lstm_size=256, linear_size=256, cnn_out_size=128, inner_linear_size=256): 
        #n - количество категорий
        
        super().__init__()

        self.n = n

        self.emb_layer = torch.nn.Embedding.from_pretrained(torch.Tensor(matrix_w))

        self.LSTM = torch.nn.LSTM(300, lstm_size, num_layers=2, batch_first=True, bidirectional=True) # 64 32 124
        # задайте лстм, можно 2 уровня, лучше бидирекциональный
        
        
        # три линейных преобразования, размерность совпадает с выходом из лстм (если БИлстм то надо умножить ее на 2)
        self.q_proj = torch.nn.Linear(in_features=2*lstm_size, out_features=linear_size)
        self.k_proj = torch.nn.Linear(in_features=2*lstm_size, out_features=linear_size)
        self.v_proj = torch.nn.Linear(in_features=2*lstm_size, out_features=linear_size)

        self.att_soft = torch.nn.Softmax(dim = 2)
        
        # три конволюционных фильтра с разными ядрами (3,4,5) чтобы были всякие нграммы ловить
        self.cnn_3gr = torch.nn.Conv1d(in_channels=linear_size, out_channels=cnn_out_size, kernel_size=3)
        self.cnn_4gr = torch.nn.Conv1d(in_channels=linear_size, out_channels=cnn_out_size, kernel_size=4)
        self.cnn_5gr = torch.nn.Conv1d(in_channels=linear_size, out_channels=cnn_out_size, kernel_size=5)
        
        # сверху накидываем два полносвязных слоя для классификации
        self.linear_1 = torch.nn.Linear(in_features=3*cnn_out_size, out_features=inner_linear_size)
        self.relu = torch.nn.ReLU()
        self.linear_2 = torch.nn.Linear(in_features=inner_linear_size, out_features=n) 
        
        
    def forward(self, x):
        x_emb = self.emb_layer(x)  # примените эмбеддинги
        
        x, _ = self.LSTM(x_emb)

        # применим линейные преобразования для селф-эттеншена
        x_q = self.q_proj(x) 
        x_k = self.k_proj(x)
        x_v = self.v_proj(x) 
        
        att_scores = torch.bmm(x_k, x_q.transpose(2, 1)) / sqrt(300)
        att_dist = self.att_soft(att_scores) # накидываем софтмакс
        attention_vectors = torch.bmm(att_dist, x_v)

        #транспонируем для конволюционных фильтров
        x_att = attention_vectors.transpose(2,1) 

        x_cnn3 = self.cnn_3gr(x_att)
        x_cnn4 = self.cnn_4gr(x_att)
        x_cnn5 = self.cnn_5gr(x_att)

        frst, _ = x_cnn3.max(dim= -1,) # cделаем макс пуллинг
        sc, _ = x_cnn4.max(dim= -1,)
        thr, _ = x_cnn5.max(dim= -1,)

        x_cat = torch.cat((frst, sc, thr), dim=-1) # а теперь объединим результаты
        
        # пару полносвязных слоев с релу для классификации
        x = self.linear_1(x_cat)
        x = self.relu(x)    
        x = self.linear_2(x)

        return x

In [25]:
def train_model(model):
    
    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters())

    criterion = criterion.to(device)
    epochs = 10
    losses = []
    best_test_loss = 10.

    test_f1 = []

    for n_epoch in range(epochs):

        train_losses = []
        test_losses = []
        test_targets = []
        test_pred_class = []

        progress_bar = tqdm(total=len(train_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))

        model.train()

        for x, y in train_loader:

            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            pred = model(x)
            loss = criterion(pred, y)

            loss.backward()

            optimizer.step()

            train_losses.append(loss.item())
            losses.append(loss.item())

            progress_bar.set_postfix(train_loss = np.mean(losses[-500:]))

            progress_bar.update(x.shape[0])

        progress_bar.close()

        model.eval()

        for x, y in validation_loader:

            x = x.to(device)

            with torch.no_grad():

                pred = model(x)

                pred = pred.cpu()

                test_targets.append(y.numpy())
                test_pred_class.append(np.argmax(pred, axis=1))

                loss = criterion(pred, y)

                test_losses.append(loss.item())

        mean_test_loss = np.mean(test_losses)

        test_targets = np.concatenate(test_targets).squeeze()
        test_pred_class = np.concatenate(test_pred_class).squeeze()

        f1 = f1_score(test_targets, test_pred_class, average='micro')

        test_f1.append(f1)

#         print()
#         print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

#         print('F1 test - {:.3f}'.format(f1))

        # Early stopping:
        if mean_test_loss < best_test_loss:
            best_test_loss = mean_test_loss
        else:
            print('Early stopping')
            print('F1 test - {:.3f}'.format(f1))
            return f1

In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
n_classes = data.category.unique().shape[0]

In [26]:
seed(13)
model = model_with_att(vectors, n_classes)
f1 = train_model(model)

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=214001.0, style=ProgressStyle(description_w…


Early stopping
F1 test - 0.838


Дефолтная модель показывает f1-score 0.838. Попробуем подобрать гиперпараметры.

## Гиперпараметры 

In [3]:
grid = {'lstm_size':[128, 256, 512],
        'linear_size':[128, 256, 512],
        'cnn_out_size':[64, 128, 256],
        'inner_linear_size':[64, 128, 256]}

In [4]:
# я не знала как сделать это лучше... кринж
pars = []
for i in range(3):
    for j in range(3):
        for k in range(3):
            for h in range(3):
                pars.append([grid['lstm_size'][i], 
                             grid['linear_size'][j], 
                             grid['cnn_out_size'][k], 
                             grid['inner_linear_size'][h]])

In [8]:
chosen_pars = choices(pars, k=40)

In [9]:
chosen_pars

[[128, 512, 256, 64],
 [128, 256, 256, 128],
 [256, 128, 128, 64],
 [256, 128, 128, 256],
 [128, 128, 128, 64],
 [128, 128, 256, 256],
 [512, 256, 64, 128],
 [128, 512, 64, 128],
 [128, 128, 128, 128],
 [256, 256, 64, 128],
 [512, 512, 64, 64],
 [128, 256, 64, 128],
 [512, 256, 64, 256],
 [256, 256, 256, 128],
 [512, 256, 128, 64],
 [128, 256, 256, 128],
 [128, 512, 64, 64],
 [256, 128, 256, 64],
 [512, 512, 64, 64],
 [128, 256, 256, 128],
 [512, 128, 128, 64],
 [256, 512, 256, 128],
 [128, 512, 64, 128],
 [256, 128, 256, 64],
 [512, 128, 64, 128],
 [512, 128, 64, 64],
 [256, 128, 128, 128],
 [128, 512, 64, 64],
 [128, 512, 256, 128],
 [256, 256, 256, 256],
 [128, 512, 128, 256],
 [256, 256, 256, 256],
 [128, 512, 256, 64],
 [512, 128, 256, 64],
 [256, 128, 64, 128],
 [512, 512, 256, 128],
 [128, 512, 128, 256],
 [128, 512, 128, 64],
 [256, 256, 256, 64],
 [512, 512, 64, 64]]

In [None]:
seed(13)
worked = []
for p in tqdm(chosen_pars):
    lstm_size = p[0]
    linear_size = p[1]
    cnn_out_size = p[2]
    inner_linear_size = p[3]
    model = model_with_att(vectors, 
                           n_classes, 
                           lstm_size=lstm_size, 
                           linear_size=linear_size, 
                           cnn_out_size=cnn_out_size,
                           inner_linear_size=inner_linear_size)
    
    print('========== MODEL ===========')
    print(lstm_size, linear_size, cnn_out_size, inner_linear_size)
    f1 = train_model(model)
    worked.append(f1)
#     print(lstm_size, linear_size, cnn_out_size)
    print('========== DONE ===========')
    print()

In [39]:
# я скрыла аутпут в предыдущей ячейке, потому что он слишкм длинный

In [35]:
worked.index(max(worked))

31

In [37]:
worked[31]

0.8434266969467575

In [36]:
chosen_pars[31]

[256, 256, 256, 256]

Я выбрала по три возможных значения для каждого из гиперпараметров (дефолтное, в два раза больше и в два раза меньше). Всего получился 81 набор возможных значений. Из них я выбрала случайные 40 и посчитала, при каком наборе получается наибольший f1-score. Получилось 0.843 при [256, 256, 256, 256]

In [38]:
seed(13)
model = model_with_att(vectors, 
                       n_classes, 
                       lstm_size=256, 
                       linear_size=256, 
                       cnn_out_size=256,
                       inner_linear_size=256)
train_model(model)

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=214001.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=214001.0, style=ProgressStyle(description_w…


Early stopping
F1 test - 0.842


0.8417444696778535

Почему значение здесь и значение выше разные? Я не знаю, seed одинаковый(( Главное, что стало чуть-чуть лучше в любом случае. 