In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
from transformers import BertForSequenceClassification, BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn


I0509 15:47:36.057192 139668169754432 file_utils.py:41] PyTorch version 1.2.0 available.


In [4]:
# Inputs: pretrained BERT for Russian (RuBERT, iPavlov), the prepared data files,
# and the directory holding the fine-tuned checkpoint.
BERT_PATH = './files/bert/rubert_cased_L-12_H-768_A-12_pt/'
FINETUNE_MODEL_PATH = './files/bert/rubert_finetune/'
DATA_PATH = './files/data/bert_data/'
# Output: directory the extracted logits are written to as CSV files.
SAVE_LOGITS_PATH = './files/data/logits/'

### Извлекаем логиты из дообученного (fine-tuned) RuBERT

In [5]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cpu')

In [6]:
class TextDataset(Dataset):
    """Dataset over pre-encoded samples that attaches a mapped target to each item.

    Args:
        data: indexable collection of records; each record is read as
            ``record['data']`` (the model inputs) and ``record['target']`` (the raw label).
        target_mapping: mapping from raw label to the value stored under ``'target'``.
    """

    def __init__(self, data, target_mapping):
        self.sentences_features = data
        self.target_mapping = target_mapping

    def __len__(self):
        """Return the number of records."""
        return len(self.sentences_features)

    def __getitem__(self, idx):
        """Return the inputs of record ``idx`` plus a ``'target'`` entry.

        Raises:
            KeyError: if the record's raw label is missing from ``target_mapping``.
        """
        record = self.sentences_features[idx]
        # Shallow-copy before adding the label: writing into record['data'] directly
        # would permanently modify the underlying dataset on every read.
        # NOTE(review): assumes record['data'] is a dict-like mapping -- confirm.
        sample = dict(record['data'])
        sample['target'] = self.target_mapping[record['target']]
        return sample

In [7]:
# Each sample is later indexed like a dict (sample['data']['input_ids'], sample['id']),
# so these files presumably hold object arrays; NumPy >= 1.16.3 refuses to load those
# unless allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load files you created yourself.
train_data = np.load(os.path.join(DATA_PATH, 'train.npy'), allow_pickle=True)
test_data = np.load(os.path.join(DATA_PATH, 'test.npy'), allow_pickle=True)
dev_data = np.load(os.path.join(DATA_PATH, 'dev.npy'), allow_pickle=True)

In [8]:
# Load the BERT encoder from the local directory, then move it to the selected device.
initial_model = BertModel.from_pretrained(
    pretrained_model_name_or_path=BERT_PATH,
    cache_dir=None,
)
initial_model = initial_model.to(device)


I0509 15:47:47.981318 139668169754432 configuration_utils.py:280] loading configuration file ./files/bert/rubert_cased_L-12_H-768_A-12_pt/config.json
I0509 15:47:47.982127 139668169754432 configuration_utils.py:318] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num

In [9]:
class FinetuneBert(nn.Module):
    """BERT-style encoder followed by a single linear classification head.

    Args:
        initial_model: encoder module, called as
            ``initial_model(input, attention_mask=...)``. Its output at position 1 is
            fed to the head (presumably the pooled output -- confirm for encoders other
            than ``BertModel``).
        output_dim: number of logits produced by the head.
        freeze_layers: iterable of indices into ``initial_model.encoder.layer``; every
            parameter of the listed layers gets ``requires_grad = False``.
    """

    def __init__(self, initial_model, output_dim, freeze_layers):
        super(FinetuneBert, self).__init__()
        self.bert = initial_model
        # Size the head from the encoder's config when it exposes one; 768 (the
        # previously hard-coded width) remains the fallback.
        hidden_size = getattr(getattr(initial_model, 'config', None), 'hidden_size', 768)
        self.cls = nn.Linear(hidden_size, output_dim)
        for layer_idx in freeze_layers:
            print("Froze Layer: ", layer_idx)
            for param in self.bert.encoder.layer[layer_idx].parameters():
                param.requires_grad = False

    def forward(self, input, attention_mask):
        """Return the head's logits for the given token ids and attention mask."""
        outputs = self.bert(input, attention_mask=attention_mask)
        # Index rather than tuple-unpack: unpacking fails when the encoder returns more
        # than two outputs, and iterating a dict-like model output yields its keys
        # instead of tensors. Integer indexing works for both return styles.
        pooled = outputs[1]
        return self.cls(pooled)
            

In [10]:
# Wrap the encoder with an 80-way head; the empty range means no layer is frozen.
model = FinetuneBert(
    initial_model,
    output_dim=80,
    freeze_layers=range(0),
).to(device)

In [11]:
# Restore the saved weights; map_location makes torch.load place the tensors on the CPU.
model.load_state_dict(
    torch.load(
        os.path.join(FINETUNE_MODEL_PATH, 'best_model.pt'),
        map_location=torch.device('cpu'),
    )
)
# Switch the model to evaluation mode; the trailing ';' suppresses the cell output.
model.eval();

In [12]:
def extract_embedding(model, data, output_dim=80):
    """Run ``model`` on every sample, one at a time, and collect its outputs in a DataFrame.

    Args:
        model: ``torch.nn.Module`` called as ``model(tokens, attention_mask=mask)``;
            row 0 of its output is stored for each sample.
        data: sized iterable of samples. Each sample is read as
            ``sample['data']['input_ids']``, ``sample['data']['attention_mask']``
            (tensors, reshaped to a batch of one) and ``sample['id']``.
        output_dim: width of the model output per sample (number of logit columns).
            Defaults to 80, the value this function previously hard-coded.

    Returns:
        pandas.DataFrame with ``output_dim`` columns (labelled 0..output_dim-1) plus an
        ``'id'`` column, one row per sample in input order. Ids pass through a float
        array, so they are stored as floats.
    """
    # Feed inputs on the device the model lives on; otherwise a model moved to the GPU
    # fails with a device mismatch on CPU inputs.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    logits = np.empty((len(data), output_dim))
    indexes = np.empty(len(data))
    for i, sample in enumerate(tqdm(data)):
        tokens = sample['data']['input_ids'].view(1, -1).to(model_device)
        mask = sample['data']['attention_mask'].view(1, -1).to(model_device)
        with torch.no_grad():
            output = model(tokens, attention_mask=mask)
        # .cpu() is required before .numpy() when the output tensor is on a GPU.
        logits[i] = output[0].detach().cpu().numpy()
        indexes[i] = sample['id']
    df = pd.DataFrame(logits)
    df['id'] = indexes
    return df

In [13]:
# Make sure the output directory exists; an already existing directory is not an error.
os.makedirs(name=SAVE_LOGITS_PATH, exist_ok=True)


In [14]:
# Compute the model outputs for the test split and save them as CSV without the index column.
test_data_logits = extract_embedding(model, test_data[:])
test_data_logits.to_csv(
    path_or_buf=os.path.join(SAVE_LOGITS_PATH, 'logits_test.csv'),
    index=False,
)


HBox(children=(FloatProgress(value=0.0, max=18570.0), HTML(value='')))




In [15]:
# Compute the model outputs for the dev split and save them as CSV without the index column.
dev_data_logits = extract_embedding(model, dev_data)
dev_data_logits.to_csv(
    path_or_buf=os.path.join(SAVE_LOGITS_PATH, 'logits_dev.csv'),
    index=False,
)


HBox(children=(FloatProgress(value=0.0, max=18571.0), HTML(value='')))




In [16]:
# Compute the model outputs for the train split and save them as CSV without the index column.
train_data_logits = extract_embedding(model, train_data)
train_data_logits.to_csv(
    path_or_buf=os.path.join(SAVE_LOGITS_PATH, 'logits_train.csv'),
    index=False,
)


HBox(children=(FloatProgress(value=0.0, max=55710.0), HTML(value='')))


