In [2]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from copy import deepcopy

import transformers
from transformers import BertTokenizer

import seaborn
seaborn.set_style('whitegrid')





In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [6]:
data = pd.read_excel('data_train.xlsx')
data

Unnamed: 0,posts,label
0,–ö–∞–∫ —Ç—Ä–∞–Ωc–ª–∏—Ç–µ—Ä–∏—Ä–æ–≤–∞—Ç—å —É–∫—Ä–∞–∏–Ω—Å–∫–∏–µ –±—É–∫–≤—ã –Ω–∞ –ª–∞—Ç–∏...,
1,–õ—É—á—à–∏–µ –æ–∑–µ—Ä–∞ –ë–µ—Ä–ª–∏–Ω–∞: –æ—Ç –ø–æ–ø—É–ª—è—Ä–Ω—ã—Ö –¥–æ —Å–∫—Ä—ã—Ç—ã—Ö...,
2,–û–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—è –¥–≤–∏–∂–µ–Ω–∏—è –ø–æ –ë–µ—Ä–ª–∏–Ω—É –Ω–∞ –≤—ã—Ö–æ–¥–Ω—ã—Ö\n\...,
3,–û–¥–Ω–∞ –∏–∑ –≤–∞–∂–Ω–µ–π—à–∏—Ö —Å—Ç—Ä–∞—Ö–æ–≤–æ–∫ –≤ –ì–µ—Ä–º–∞–Ω–∏–∏ ‚Äî —ç—Ç–æ R...,
4,–õ—é—Å—Ç–≥–∞—Ä—Ç–µ–Ω –ø–∞—Ä–∫ –≤ –ë–µ—Ä–ª–∏–Ω–µ\n\n–ü–∞—Ä–∫ ¬´–õ—é—Å—Ç–≥–∞—Ä—Ç–µ–Ω¬ª...,
...,...,...
184,–¢—Ä–∏ —á–µ–ª–æ–≤–µ–∫–∞ –ø–æ–≥–∏–±–ª–∏ –≤ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–µ –∞–≤–∞—Ä–∏–∏ –Ω–∞ A2...,
185,–ü—Ä–µ–¥—É–ø—Ä–µ–∂–¥–µ–Ω–∏–µ –æ –ø–ª–æ—Ö–æ–π –ø–æ–≥–æ–¥–µ –≤ –ë–µ—Ä–ª–∏–Ω–µ –∏ –ë—Ä–∞...,
186,–¢–µ–≥–µ–ª—å—Å–∫–∏–π –ø—Ä–∞–∑–¥–Ω–∏–∫ —É –≥–∞–≤–∞–Ω–∏ \n\n\n18 - 21 –∏—é–ª...,
187,–ü–æ–≥—Ä—É–∑–∏—Ç–µ—Å—å –≤ –∑–∞—Ö–≤–∞—Ç—ã–≤–∞—é—â–∏–π –º–∏—Ä –∞–¥—Ä–µ–Ω–∞–ª–∏–Ω–∞ –Ω–∞...,


In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(np.array(data.posts), np.array(data.label), test_size=0.2)

In [8]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=False,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

In [6]:
MAX_LEN = 256
BATCH_SIZE = 32

train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, fc_out_size, num_layers, output_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn1 = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, fc_out_size)
        self.rnn2 = nn.RNN(fc_out_size, hidden_size, num_layers, batch_first=True)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input_ids):
        x = self.embedding(input_ids)
        rnn_out1, _ = self.rnn1(x)
        out1 = self.fc1(rnn_out1[:, -1, :]).unsqueeze(1)
        rnn_out2, _ = self.rnn2(out1)
        out2 = self.fc2(rnn_out2[:, -1, :]).flatten()
        return self.sigmoid(out2)

embed_size = 256
hidden_size = 256
fc_out_size = 512
num_layers = 7
output_size = 1

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = RNNModel(len(tokenizer), embed_size, hidden_size, fc_out_size, num_layers, output_size).to(device)

In [24]:
def train_model(model, train_loader, val_loader, device, num_epochs=30):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001)
    best_f1 = -1000
    best_model_weights = deepcopy(model.state_dict())

    for epoch in tqdm(range(num_epochs), desc='Epoch Progress'):
        model.train()
        running_loss = 0.0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1} Training', leave=False):
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / len(train_loader)
        print(f'Epoch {epoch+1} - Training loss: {epoch_loss}')

        model.eval()
        all_preds = []
        all_labels = []
        val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation', leave=False):
                input_ids = batch['input_ids'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids)
                preds = outputs.round().cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
        epoch_val_loss = val_loss / len(val_loader)
        correct = 0
        for i in range(len(all_preds)):
            if all_preds[i] == all_labels[i]:
                correct += 1
        total = len(all_labels)

        f1_val = f1_score(all_labels, all_preds)
        print(f'Validation Loss: {epoch_val_loss}, Accuracy: {100 * correct / total}, F1: {f1_val}%')

        
        if f1_val >= best_f1:
            best_f1 = f1_val
            best_model_weights = deepcopy(model.state_dict())

    print('Finished Training')
    model.load_state_dict(best_model_weights)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [25]:
num_epochs = 25

train_model(model, train_loader, val_loader, device, num_epochs)

Epoch Progress:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch 1 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1 - Training loss: 30.92833948135376


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 37.06378936767578, Accuracy: 65.625, F1: 0.6451612903225806%


Epoch 2 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 2 - Training loss: 29.292370319366455


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 36.04204559326172, Accuracy: 65.625, F1: 0.6451612903225806%


Epoch 3 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 3 - Training loss: 29.11060619354248


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85891342163086, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 4 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 4 - Training loss: 28.95097827911377


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85868453979492, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 5 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 5 - Training loss: 29.060324668884277


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858604431152344, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 6 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 6 - Training loss: 29.035834789276123


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85857009887695, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 7 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 7 - Training loss: 29.196378707885742


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858551025390625, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 8 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 8 - Training loss: 29.15787172317505


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85853576660156, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 9 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 9 - Training loss: 29.012422561645508


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858524322509766, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 10 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 10 - Training loss: 29.04350233078003


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858524322509766, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 11 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 11 - Training loss: 28.96778154373169


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858516693115234, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 12 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 12 - Training loss: 28.99263286590576


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85851287841797, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 13 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 13 - Training loss: 28.828795433044434


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85851287841797, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 14 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 14 - Training loss: 28.791573524475098


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.8585090637207, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 15 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 15 - Training loss: 28.829050302505493


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85850524902344, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 16 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 16 - Training loss: 28.611721515655518


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85850524902344, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 17 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 17 - Training loss: 28.847722053527832


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85850524902344, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 18 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 18 - Training loss: 28.977430820465088


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85850524902344, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 19 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 19 - Training loss: 29.063013792037964


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85850143432617, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 20 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 20 - Training loss: 28.787882328033447


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858497619628906, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 21 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 21 - Training loss: 28.80597448348999


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858497619628906, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 22 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 22 - Training loss: 28.916428565979004


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.858497619628906, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 23 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 23 - Training loss: 28.948423862457275


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85849380493164, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 24 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 24 - Training loss: 29.044511079788208


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85849380493164, Accuracy: 68.75, F1: 0.6666666666666666%


Epoch 25 Training:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 25 - Training loss: 28.940399646759033


Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Validation Loss: 35.85849380493164, Accuracy: 68.75, F1: 0.6666666666666666%
Finished Training


In [26]:
def preprocess_text(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=False,
        return_tensors='pt',
        truncation=True
    )
    return encoding['input_ids']


text = '''‚ö°Ô∏è–í—ã –∂–∏–≤—ë—Ç–µ –≤ –ë–µ—Ä–ª–∏–Ω–µ , —Å–æ–±–∏—Ä–∞–µ—Ç–µ—Å—å —Ç—É–¥–∞ –ø–µ—Ä–µ–µ–∑–∂–∞—Ç—å –∏–ª–∏ –≤–æ–∑–º–æ–∂–Ω–æ –ø—Ä–æ—Å—Ç–æ –ø—É—Ç–µ—à–µ—Å—Ç–≤—É–µ—Ç–µ‚ùì 

–ö–∞–Ω–∞–ª BERLIN LIVE  —Å–æ–∑–¥–∞–Ω –∏–º–µ–Ω–Ω–æ –¥–ª—è –≤–∞—Å!

‚úÖ –ï–∂–µ–¥–Ω–µ–≤–Ω—ã–µ –æ–±–Ω–æ–≤–ª–µ–Ω–∏—è –æ —Å–æ–±—ã—Ç–∏—è—Ö –≤ –ë–µ—Ä–ª–∏–Ω–µ 

‚úÖ –†–∞—Å—Å–∫–∞–∑—ã –æ –∑–Ω–∞–º–µ–Ω–∏—Ç—ã—Ö –¥–æ—Å—Ç–æ–ø—Ä–∏–º–µ—á–∞—Ç–µ–ª—å–Ω–æ—Å—Ç—è—Ö –∏ —Å–∫—Ä—ã—Ç—ã—Ö –∂–µ–º—á—É–∂–∏–Ω–∞—Ö –≥–æ—Ä–æ–¥–∞

‚úÖ –°–æ–≤–µ—Ç—ã –æ —Ç–æ–º, –≥–¥–µ –ø–æ–µ—Å—Ç—å, —á—Ç–æ –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å –∏ –∫–∞–∫ –ø—Ä–æ–≤–µ—Å—Ç–∏ –≤—Ä–µ–º—è –≤ –ë–µ—Ä–ª–∏–Ω–µ 

‚úÖ –ò–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ –∏—Å—Ç–æ—Ä–∏–∏ –æ –∂–∏–∑–Ω–∏ –≤ —ç—Ç–æ–º —É–¥–∏–≤–∏—Ç–µ–ª—å–Ω–æ–º –≥–æ—Ä–æ–¥–µ

üëâ –°—Å—ã–ª–∫–∞ –Ω–∞ –∫–∞–Ω–∞–ª 
üëâ –ü–æ–¥–ø–∏—Å–∞—Ç—å—Å—è '''

input_id = preprocess_text(text).to(device)

with torch.no_grad():
    model.eval() 
    output = model(input_id)
    prediction_prob = output.item()

prediction = ''
if round(prediction_prob) == 1:
    prediction = '–†–µ–∫–ª–∞–º–∞'
else:
    prediction = '–ù–µ —Ä–µ–∫–ª–∞–º–∞'


print(f"Prediction: {prediction}, probability: {float(str(prediction_prob)[:4])*100}%")

Prediction: –†–µ–∫–ª–∞–º–∞, probability: 99.0%


In [11]:
# —Å–µ–π–≤–∏—Ç—å –±—É–¥–µ–º –≤—Å—é –º–æ–¥–µ–ª—å

torch.save(model, 'model.pth')

# –∑–∞–≥—Ä—É–∑–∫–∞ –≤—Å–µ–π –º–æ–¥–µ–ª–∏
# model = torch.load('model.pth')
# model.to(device)