In [1]:
import pandas as pd
import numpy as np

from transformers import BertTokenizer, AutoTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

import torch
from torch.optim import Adam
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

from datetime import datetime

from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter
from scipy.stats import entropy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#df = pd.read_excel('6_label_100_sample.xlsx')
#df = pd.read_excel('2108(1000 sample).xlsx')
df = pd.read_excel('1_for_train_1000_sample.xlsx')
df_real_val = pd.read_excel('1_for_real_val.xlsx')
#df_real_val = pd.read_excel('for_inferense_august.xlsx')

In [3]:
#base_model = 'ai-forever/ruBert-large'
#base_model = 'ai-forever/ruRoberta-large'
#base_model = 'bert-base-uncased'
#base_model = 'ai-forever/ru-en-RoSBERTa'
#base_model = 'microsoft/Multilingual-MiniLM-L12-H384'
base_model = 't-bank-ai/response-toxicity-classifier-base'
#base_model = 'DeepPavlov/distilrubert-tiny-cased-conversational'

In [4]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
data = Dataset.from_dict({'text': df['Text'], 'label': df['Label']}).train_test_split(test_size=0.2, seed=42)
data_real_val = Dataset.from_dict({'text': df_real_val['Text']})
data_tokenized = data.map(lambda x: tokenizer(x['text'], truncation=True, max_length=512), batched=True, remove_columns=['text'])
data_real_val_tokenized = data_real_val.map(lambda x: tokenizer(x['text'], truncation=True, max_length=512), batched=True, remove_columns=['text'])

Map: 100%|██████████| 762/762 [00:00<00:00, 17539.85 examples/s]
Map: 100%|██████████| 191/191 [00:00<00:00, 17248.25 examples/s]
Map: 100%|██████████| 3683/3683 [00:00<00:00, 14883.23 examples/s]


In [5]:
collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(data_tokenized['train'], shuffle=True, batch_size=8, collate_fn=collator)
val_dataloader = DataLoader(data_tokenized['test'], shuffle=False, batch_size=8, collate_fn=collator)
real_val_dataloader = DataLoader(data_real_val_tokenized, shuffle=False, batch_size=8, collate_fn=collator)

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=6, ignore_mismatched_sizes=True) # for t-bank-ai/response-toxicity-classifier-base
#model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=6)
optimizer = Adam(model.parameters(), lr=1e-6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at t-bank-ai/response-toxicity-classifier-base and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
if torch.cuda.is_available():
    model.cuda()

In [8]:
import gc
#del model
torch.cuda.empty_cache()
gc.collect()

4

In [9]:
writer = SummaryWriter(f'runs/{base_model}, batch_size = {train_dataloader.batch_size}, date = {datetime.now().strftime("%Y-%m-%d, %H:%M")}')
losses = []
n_epoch = 20
train_batch_count = len(train_dataloader)
val_batch_count = len(val_dataloader)
for epoch in range(n_epoch):
    model.train()
    for i, batch in enumerate(train_dataloader):
        out = model(**batch.to(model.device))
        out.loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        losses.append(out.loss.item())
    writer.add_scalar("Loss/train", np.mean(losses[-train_batch_count:]), epoch+1)    
    print(f'Epoch {epoch + 1}/{n_epoch} - recent train loss: {np.mean(losses[-train_batch_count:]):2.2f}')

    model.eval()
    eval_losses = []
    eval_preds = []
    eval_targets = []
    for batch in val_dataloader:
        with torch.no_grad():
            out = model(**batch.to(model.device))
        eval_losses.append(out.loss.item())
        eval_preds.extend(out.logits.argmax(1).tolist())
        eval_targets.extend(batch['labels'].tolist())
    writer.add_scalar("Loss/val", np.mean(eval_losses), epoch+1) 
    writer.add_scalar("Accuracy/val", np.mean(np.array(eval_targets) == eval_preds), epoch+1) 
    

    print(f'train loss: {np.mean(losses[-100:]):2.2f}, eval loss: {np.mean(eval_losses):2.2f},  accuracy: {np.mean(np.array(eval_targets) == eval_preds):2.2f}')

writer.flush()
writer.close()

Epoch 1/20 - recent train loss: 1.83
train loss: 1.83, eval loss: 1.78,  accuracy: 0.12
Epoch 2/20 - recent train loss: 1.74
train loss: 1.74, eval loss: 1.71,  accuracy: 0.34
Epoch 3/20 - recent train loss: 1.66
train loss: 1.66, eval loss: 1.61,  accuracy: 0.43
Epoch 4/20 - recent train loss: 1.53
train loss: 1.54, eval loss: 1.46,  accuracy: 0.45
Epoch 5/20 - recent train loss: 1.41
train loss: 1.41, eval loss: 1.35,  accuracy: 0.48
Epoch 6/20 - recent train loss: 1.29
train loss: 1.29, eval loss: 1.27,  accuracy: 0.53


In [None]:
model.eval()

eval_losses = []
eval_preds = []
eval_targets = []

mistake = pd.DataFrame(columns=['text', 'preds', 'true', 'probs'])

for batch in val_dataloader:
        with torch.no_grad():
            out = model(**batch.to(model.device))
        eval_losses.append(out.loss.item())
        eval_preds.extend(out.logits.argmax(1).tolist())
        eval_targets.extend(batch['labels'].tolist())

        batch_error_df = pd.DataFrame({'text': list(map(lambda x: tokenizer.decode(x, skip_special_tokens=True), batch['input_ids'])), 'preds': out.logits.argmax(1).tolist(), 'true': batch['labels'].tolist(), 
                                       'probs': map(lambda x: [round(val, 2) for val in torch.nn.Softmax(dim=0)(x.cpu()).tolist()], out.logits),
                                       'max_prob': map(lambda x: max([round(val, 2) for val in torch.nn.Softmax(dim=0)(x.cpu()).tolist()]), out.logits),
                                       'entropy': map(lambda y: round(entropy(y, base=2), 2), list(map(lambda x: [round(val, 2) for val in torch.nn.Softmax(dim=0)(x.cpu()).tolist()], out.logits)))})
        mistake = pd.concat([mistake, batch_error_df[batch_error_df['preds'] != batch_error_df['true']]])


report = classification_report(eval_targets, eval_preds, target_names=["Инцидент", "Создание УЗ", "Восстановление УЗ", "Добавление ПФ", "Изменение реквизитов", "Блокировка пользователя"])
print(report)
pd.options.display.max_colwidth = 200
mistake.reset_index(inplace=True, drop=True)
mistake


In [None]:
#model.cpu()
model.eval()
predictions = pd.DataFrame(columns=['text', 'preds', 'probs'])
for batch in real_val_dataloader:
        with torch.no_grad():
            out = model(**batch.to(model.device))

        pred_batch = pd.DataFrame({'text': list(map(lambda x: tokenizer.decode(x, skip_special_tokens=True), batch['input_ids'])), 'preds': out.logits.argmax(1).tolist(), 
                                   'probs': list(map(lambda x: [round(val, 2) for val in torch.nn.Softmax(dim=0)(x.cpu()).tolist()], out.logits)),
                                   'max_prob': list(map(lambda x: max([round(val, 2) for val in torch.nn.Softmax(dim=0)(x.cpu()).tolist()]), out.logits)),
                                   'entropy': map(lambda y: round(entropy(y, base=2), 2), list(map(lambda x: [round(val, 2) for val in torch.nn.Softmax(dim=0)(x.cpu()).tolist()], out.logits)))})
        predictions = pd.concat([predictions, pred_batch])


In [None]:
predictions

In [None]:
predictions[(predictions['preds'] != 0) & (predictions['max_prob'] > 0.9)]

In [None]:
round(predictions['entropy'].mean(), 2)

In [None]:
predictions['preds'].value_counts()

In [None]:
predictions[predictions['preds'] == 3]

In [None]:
model