In [1]:
! ls ./files/bert/rubert_cased_L-12_H-768_A-12_pt/

config.json  pytorch_model.bin	vocab.txt


In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from transformers import BertForSequenceClassification, BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
from torch import optim

I0509 14:43:03.467619 140446016481088 file_utils.py:41] PyTorch version 1.2.0 available.


In [3]:
# Pretrained BERT for the Russian language (iPavlov)
BERT_PATH = './files/bert/rubert_cased_L-12_H-768_A-12_pt/'
# Directory the train / test / dev .npy splits are loaded from
DATA_PATH = './files/data/data_bert/'
# Directory the fine-tuned checkpoints are written to by the training loop
FINETUNE_MODEL_PATH = './files/bert/rubert_finetune/'

### Fine-tuning BERT

In [4]:
# NOTE(review): MAX_LEN is not referenced in the visible cells; presumably it is
# the token length used when the data was pre-tokenized — confirm or remove.
MAX_LEN = 102
# Mini-batch size passed to every DataLoader in this notebook.
batch_size = 32

In [5]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [6]:
class TextDataset(Dataset):
    """Map-style dataset over pre-tokenized sentences.

    Parameters
    ----------
    data : array-like of shape (n_samples,)
        Each element is a mapping that provides at least the keys 'target'
        and 'data' (records may also carry 'sent', which this class never
        reads). ``element['data']`` is itself a mapping with the features of
        one sample after BERT tokenization.
    target_mapping : dict
        Maps each raw target label to the integer class index that is
        returned to the caller under the 'target' key.
    """

    def __init__(self, data, target_mapping):
        self.sentences_features = data
        self.target_mapping = target_mapping

    def __len__(self):
        """Return the number of samples."""
        return len(self.sentences_features)

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` together with its mapped target.

        Returns
        -------
        dict
            A shallow copy of ``data[idx]['data']`` with an extra 'target'
            entry holding ``target_mapping[data[idx]['target']]``.

        Raises
        ------
        KeyError
            If the record's raw target is missing from ``target_mapping``.
        """
        record = self.sentences_features[idx]
        # Copy before adding 'target': writing into record['data'] directly
        # would silently mutate the caller's underlying data on every access.
        sample = dict(record['data'])
        sample['target'] = self.target_mapping[record['target']]
        return sample

In [7]:
# The splits are arrays of Python objects (elements are indexed like dicts
# downstream, e.g. x['target']). NumPy stores such arrays with pickle, and since
# NumPy 1.16.3 np.load refuses to read them unless allow_pickle=True is passed.
# SECURITY: unpickling can execute arbitrary code — only load .npy files that
# were produced by a trusted preprocessing step.
train_data = np.load(DATA_PATH + 'train.npy', allow_pickle=True)
test_data = np.load(DATA_PATH + 'test.npy', allow_pickle=True)
dev_data = np.load(DATA_PATH + 'dev.npy', allow_pickle=True)

Так как метки классов идут не по порядку, присвоим им последовательные значения от 0 до (количество классов − 1).

In [8]:
# Count how often each raw label occurs in the training split. value_counts()
# sorts by descending frequency, so the most frequent label gets index 0.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
target_counts = pd.Series([x['target'] for x in train_data]).value_counts()
# raw label -> consecutive class index in [0, num_classes)
target_mapping = dict(zip(target_counts.index, range(len(target_counts))))
num_classes = len(target_mapping)
print('Кол-во классов', num_classes)

Кол-во классов 80


In [9]:
# Class index of every training sample.
targets = [target_mapping[x['target']] for x in train_data]
# Relative frequency of each class, ordered by class index (0, 1, 2, ...).
class_freq = pd.Series(targets).value_counts(normalize=True).sort_index()
# The frequency vector is reversed before being used as loss weights, so class i
# receives the frequency of class (num_classes - 1 - i).
# NOTE(review): this only acts as an "inverse" weighting if class indices are
# ordered by descending frequency — confirm that is the intent, or switch to an
# explicit 1 / frequency weighting.
# dtype is set explicitly so the weights are float32 regardless of how torch
# infers the dtype of NumPy scalars. .copy() makes the reversed view contiguous.
weight = torch.tensor(class_freq.values[::-1].copy(), dtype=torch.float32)

In [10]:
# Wrap every split in a TextDataset that shares the same label -> index mapping.
trainset, testset, devset = (
    TextDataset(split, target_mapping)
    for split in (train_data, test_data, dev_data)
)

In [11]:
# Only the training loader reshuffles its samples; the evaluation loaders
# iterate in the stored order.
train_loader = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=testset, batch_size=batch_size, shuffle=False)
dev_loader = DataLoader(dataset=devset, batch_size=batch_size, shuffle=False)

In [12]:
# Load the pretrained encoder from the local checkpoint directory, then move
# it to the selected device.
initial_model = BertModel.from_pretrained(BERT_PATH)
initial_model = initial_model.to(device)


I0509 14:43:29.877933 140446016481088 configuration_utils.py:280] loading configuration file ./files/bert/rubert_cased_L-12_H-768_A-12_pt/config.json
I0509 14:43:29.878886 140446016481088 configuration_utils.py:318] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num

In [13]:
class FinetuneBert(nn.Module):
    """Encoder with a linear classification head on top of its pooled output.

    Parameters
    ----------
    initial_model : nn.Module
        Encoder to fine-tune. It must expose ``encoder.layer`` (indexable
        sequence of sub-modules) and, when called as
        ``initial_model(input, attention_mask=...)``, return an output whose
        element at index 1 is the pooled representation fed to the head.
    output_dim : int
        Number of output classes (size of the logits vector).
    freeze_layers : iterable of int
        Indices into ``initial_model.encoder.layer`` whose parameters get
        ``requires_grad = False``. Only those encoder layers are frozen; any
        other parameters of ``initial_model`` are left as they are.
    """

    def __init__(self, initial_model, output_dim, freeze_layers):
        super(FinetuneBert, self).__init__()
        self.bert = initial_model
        # Take the head's input width from the model config when it is
        # available instead of hard-coding it; 768 stays as the fallback.
        config = getattr(initial_model, 'config', None)
        hidden_size = getattr(config, 'hidden_size', 768)
        self.cls = nn.Linear(hidden_size, output_dim)
        for layer_idx in freeze_layers:
            print("Froze Layer: ", layer_idx)
            for param in self.bert.encoder.layer[layer_idx].parameters():
                param.requires_grad = False

    def forward(self, input, attention_mask):
        """Return class logits of shape ``(batch, output_dim)``."""
        outputs = self.bert(input, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: this works for plain tuples and
        # also for dict-like output objects, where unpacking would yield keys.
        pooled = outputs[1]
        return self.cls(pooled)
            

In [14]:
# Train only the last (12 - freeze_layers_num) encoder layers.
freeze_layers_num = 6

# FinetuneBert freezes parameters of initial_model in place, so switch every
# gradient back on first; re-running this cell then starts from a clean state.
for param in initial_model.parameters():
    param.requires_grad_(True)

frozen_layer_ids = range(freeze_layers_num)
model = FinetuneBert(initial_model, num_classes, frozen_layer_ids).to(device)

Froze Layer:  0
Froze Layer:  1
Froze Layer:  2
Froze Layer:  3
Froze Layer:  4
Froze Layer:  5


In [15]:
# Two parameter groups: the encoder uses its own small learning rate, the
# classification head falls back to the optimizer-wide default (1e-3).
# The extra 'name' key is read back by get_lr() when printing.
param_groups = [
    {'params': model.bert.parameters(), 'lr': 2e-5, 'name': 'Bert'},
    {'params': model.cls.parameters(), 'name': 'Cls'},
]
optimizer = optim.Adam(param_groups, lr=1e-3)

# Class-weighted cross-entropy; the weights must live on the model's device.
class_weights = weight.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Multiply every group's learning rate by 0.1 after each 10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [16]:
from sklearn.metrics import f1_score, roc_auc_score

def f1_metrics(pred, true):
    """Macro-averaged F1 between predicted and true class indices.

    Parameters
    ----------
    pred : torch.Tensor
        Per-class scores; the predicted class is the argmax over dimension 1.
    true : torch.Tensor
        True class indices, one per row of ``pred``.

    Returns
    -------
    float
        ``sklearn.metrics.f1_score(true, pred_labels, average='macro')``.
    """
    # detach() is the supported way to drop the autograd graph (instead of
    # .data), and cpu() lets the function accept tensors living on any device.
    pred_labels = pred.detach().argmax(1).cpu().numpy()
    true_labels = true.detach().cpu().numpy()
    return f1_score(true_labels, pred_labels, average='macro')

In [17]:
def get_lr(optimizer):
    """Print the current learning rate of every parameter group on one line.

    Each group in ``optimizer.param_groups`` must carry the keys 'name' and
    'lr'. Nothing is returned; the line looks like
    ``LR: <name>: <lr>, <name>: <lr>, ``.
    """
    parts = [
        f'{group["name"]}: {group["lr"]}, '
        for group in optimizer.param_groups
    ]
    print('LR: ' + ''.join(parts))

In [18]:
def train(model, optimizer, criterion, dataloader):
    """Run one optimisation pass over ``dataloader``.

    Every batch must be a dict of tensors with the keys 'input_ids',
    'attention_mask' and 'target'; all of its tensors are moved to the
    module-level ``device`` before the forward pass.

    Returns
    -------
    dict
        ``{'loss': ..., 'f1': ...}`` — the per-batch loss and per-batch macro
        F1, each averaged over the number of batches.
    """
    model.train()
    loss_sum, f1_sum = 0, 0
    for raw_batch in tqdm(dataloader, desc='train...'):
        batch = {key: tensor.to(device) for key, tensor in raw_batch.items()}

        optimizer.zero_grad()
        logits = model(batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = criterion(logits, batch['target'])
        loss.backward()
        optimizer.step()

        # Metrics are taken from the logits computed before this step's update.
        f1_sum = f1_sum + f1_metrics(logits.cpu(), batch['target'].cpu())
        loss_sum = loss_sum + loss.item()

    n_batches = len(dataloader)
    return {'loss': loss_sum / n_batches, 'f1': f1_sum / n_batches}

def test(model, criterion, dataloader):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Every batch must be a dict of tensors with the keys 'input_ids',
    'attention_mask' and 'target'; all of its tensors are moved to the
    module-level ``device`` before the forward pass.

    Returns
    -------
    dict
        ``{'loss': ..., 'f1': ...}`` — the per-batch loss and per-batch macro
        F1, each averaged over the number of batches. Note that an average of
        per-batch macro F1 values is not the same number as macro F1 computed
        once over the whole dataset.
    """
    metrics = {'loss': 0, 'f1': 0}
    model.eval()
    # No gradients are needed for evaluation; without no_grad() every forward
    # pass would build and keep an autograd graph, wasting memory and time.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='test...'):
            batch = {t: batch[t].to(device) for t in batch}
            output = model(batch['input_ids'], attention_mask=batch['attention_mask'])
            loss = criterion(output, batch['target'])
            f1 = f1_metrics(output.cpu(), batch['target'].cpu())
            for n, m in zip(['loss', 'f1'], [loss.item(), f1]):
                metrics[n] = metrics[n] + m
    return {n: l / len(dataloader) for n, l in metrics.items()}


In [19]:
from IPython.display import clear_output

def plot_metrics(metrics_train, metrics_test):
    """Plot the loss and F1 history of the train and test runs side by side.

    Parameters
    ----------
    metrics_train, metrics_test : list of dict
        One dict per finished epoch, each with the keys 'loss' and 'f1'.
    """
    fig = plt.figure(figsize=(15, 10))
    # (metric key, panel title, position in the 2x2 grid). The original called
    # plt.grid() before creating the first subplot, which implicitly created
    # an extra full-figure axes; drawing on explicit axes avoids that.
    panels = (('loss', 'LOSS', 1), ('f1', 'F1', 2))
    for key, title, position in panels:
        ax = fig.add_subplot(2, 2, position)
        ax.grid(True)
        ax.plot([x[key] for x in metrics_test], label='test')
        ax.plot([x[key] for x in metrics_train], label='train')
        ax.set_xlabel('epoch')
        ax.legend()
        ax.set_title(title)
    plt.show()

In [21]:
# Create the checkpoint directory up front: otherwise torch.save() would fail
# with FileNotFoundError only after a complete (expensive) epoch has finished.
os.makedirs(FINETUNE_MODEL_PATH, exist_ok=True)

n_epochs = 20
best_f1 = 0
# Despite their names, these lists hold one {'loss', 'f1'} dict per epoch.
train_loss = []
test_loss = []

# NOTE(review): the best checkpoint is selected on test_loader while dev_loader
# is never used in the visible cells — confirm which split is meant for model
# selection.
for epoch in range(n_epochs):
    train_metics = train(model, optimizer, criterion, train_loader)
    train_loss.append(train_metics)

    test_metics = test(model, criterion, test_loader)
    test_loss.append(test_metics)
    clear_output(wait=False)
    print(f"Epoch: {epoch+1:02}, Train Loss: {train_loss[-1]['loss']:.3f}, Train F1: {train_loss[-1]['f1']:.3f},  Test Loss: {test_loss[-1]['loss']:.3f}, Test F1: {test_loss[-1]['f1']:.3f}")
    # Print the learning rates used during this epoch, then advance the schedule.
    get_lr(optimizer)
    scheduler.step()

    # Always keep the latest weights; additionally keep the weights with the
    # best F1 seen so far.
    torch.save(model.state_dict(), FINETUNE_MODEL_PATH + 'current_model.pt')
    if best_f1 < test_loss[-1]['f1']:
        torch.save(model.state_dict(), FINETUNE_MODEL_PATH + 'best_model.pt')
        best_f1 = test_loss[-1]['f1']
    plot_metrics(train_loss, test_loss)

In [22]:
# model.load_state_dict(torch.load('files/bert/rubert_finetune/best_model.pt', map_location=torch.device('cpu')))
# model.eval();