In [5]:
# %pip install transformers

In [14]:
# %pip install evaluate

In [15]:
import warnings
import re
import numpy as np
import pandas as pd

import torch
import gc

from tqdm.auto import tqdm

from sklearn.metrics import roc_auc_score

from lib.config import model_path, device

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AdamW
from transformers import BertForSequenceClassification
from transformers import pipeline
from transformers import Trainer
from transformers import AutoTokenizer
from transformers import TrainingArguments

from scipy.special import softmax
from sklearn.metrics import roc_auc_score, accuracy_score
import evaluate

import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

warnings.filterwarnings('ignore')

# Задача 1


В первой задаче необходимо оценить вероятность наличия в объявлении контактной информации.
Результатом работы модели является `pd.DataFrame` с колонками:
* `index`: `int`, положение записи в файле;
* `prediction`: `float` от 0 до 1.

## Загрузка данных и токенизация

In [28]:
dtype_dict = {
    'subcategory': 'category',
    'category': 'category',
    'price': 'float32',
    'region': 'category',
    'city': 'category',
    'is_bad': 'bool'
}

train = pd.read_csv('/content/gdrive/MyDrive/test/data/train.csv',
                    dtype=dtype_dict)

val = pd.read_csv('/content/gdrive/MyDrive/test/data/val.csv',
                  dtype=dtype_dict)

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader


class PrepareDataset:
    def __init__(self, texts, tokenizer, batch_size_split=10, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.batch_size_split = batch_size_split

    def pre_tokenizer(self, data):
        return self.tokenizer(data,
                              padding='max_length',
                              truncation=True,
                              return_tensors='pt',
                              max_length=self.max_length)

    def transform(self):
        N = len(self.texts)
        size_split = N // self.batch_size_split

        train_encodings = self.pre_tokenizer(self.texts[:size_split])
        input_ids = train_encodings['input_ids']
        attention_mask = train_encodings['attention_mask']
        token_type_ids = train_encodings['token_type_ids']

        for pos in tqdm(range(size_split, N, size_split)):
            train_encodings_2 = self.pre_tokenizer(self.texts[pos:pos +
                                                              size_split])
            input_ids = torch.cat((input_ids, train_encodings_2['input_ids']))
            attention_mask = torch.cat(
                (attention_mask, train_encodings_2['attention_mask']))
            token_type_ids = torch.cat(
                (token_type_ids, train_encodings_2['token_type_ids']))

        return {
            'input_ids': input_ids,
            'token_type_ids': token_type_ids,
            'attention_mask': attention_mask
        }

In [None]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2", do_lower_case=True)

Downloading:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
%%time
max_length = 512

clf = PrepareDataset(texts=train.description.tolist(),
                     tokenizer=tokenizer,
                     batch_size_split=15)
train_encodings = clf.transform()

In [None]:
clf = PrepareDataset(texts = val.description.tolist(), tokenizer=tokenizer)
test_encodings = clf.transform()

In [None]:
gc.collect()

46

In [None]:
class GroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = GroupsDataset(train_encodings, train.is_bad.astype(int).tolist())
test_dataset = GroupsDataset(test_encodings, val.is_bad.astype(int).tolist())

In [None]:
model = BertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny2', num_labels=2)

Downloading:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

## Обучение модели bert-tiny2

### Заморозка параметров

In [None]:
for param in model.bert.parameters():
    param.requires_grad = False

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, 

In [None]:
def compute_metrics(eval_preds):
    """Расчет метрики roc-auc"""

    metric = evaluate.load("roc_auc")
    logits, labels = eval_preds
    predictions = softmax(logits)[:, 1]
    res = metric.compute(prediction_scores=predictions, references=labels)

    return {'roc_auc': res['roc_auc']}

In [None]:
args = TrainingArguments("bert_is_bad_tiny2",
                         evaluation_strategy="epoch",
                         save_strategy="no",
                         learning_rate=2e-5,
                         num_train_epochs=10,
                         weight_decay=0.01,
                         push_to_hub=False,
                         per_device_train_batch_size=64)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("bert_is_bad_tiny2")

***** Running training *****
  Num examples = 984487
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 153830


Epoch,Training Loss,Validation Loss,Roc Auc
1,0.4531,0.470817,0.778622
2,0.4218,0.451201,0.796783
3,0.4143,0.441761,0.804391
4,0.4108,0.437265,0.808483
5,0.4058,0.434195,0.810692
6,0.4065,0.432223,0.812027
7,0.4038,0.430825,0.812823
8,0.4011,0.429934,0.813391
9,0.3994,0.429401,0.813716
10,0.3953,0.429509,0.813766


***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8


Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to bert_is_bad_tiny2
Configuration saved in bert_is_bad_tiny2/config.json
Model weights saved in bert_is_bad_tiny2/pytorch_model.bin


In [None]:
# model_path = "bert_is_bad_tiny2"
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)

### Разморозка параметров

In [None]:
for param in model.bert.parameters():
    param.requires_grad = True

In [None]:
args = TrainingArguments("bert_is_bad_tiny2",
                         evaluation_strategy="epoch",
                         save_strategy="no",
                         learning_rate=2e-5,
                         num_train_epochs=10,
                         weight_decay=0.01,
                         push_to_hub=False,
                         per_device_train_batch_size=64)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("bert_is_bad_tiny2")

***** Running training *****
  Num examples = 984487
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 153830


Epoch,Training Loss,Validation Loss,Roc Auc
1,0.1165,0.14005,0.981013
2,0.0982,0.140921,0.982256
3,0.0881,0.146579,0.982397
4,0.0826,0.146537,0.983247


***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16237
  Batch size = 8


KeyboardInterrupt: 

In [None]:
model_path = "bert_is_bad_tiny2"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in bert_is_bad_tiny2/config.json
Model weights saved in bert_is_bad_tiny2/pytorch_model.bin
tokenizer config file saved in bert_is_bad_tiny2/tokenizer_config.json
Special tokens file saved in bert_is_bad_tiny2/special_tokens_map.json


('bert_is_bad_tiny2/tokenizer_config.json',
 'bert_is_bad_tiny2/special_tokens_map.json',
 'bert_is_bad_tiny2/vocab.txt',
 'bert_is_bad_tiny2/added_tokens.json',
 'bert_is_bad_tiny2/tokenizer.json')

## Результаты

In [18]:
from transformers import pipeline as pipe
from typing import List


def predict_proba(text: str, pipeline) -> float:
    """Получение вероятности контакта в тексте"""
    predict = pipeline(text)[0]
    if predict["label"] == "LABEL_0":
        return 1 - predict["score"]
    return predict["score"]


def task1(description: pd.Series) -> List[float]:
    """Получение результатов предсказания модели поиска контактов в тексте"""
    clf = pipe("text-classification", model=model_path, device=device)
    dataset_pbar = tqdm(description)
    result = [predict_proba(data, clf) for data in dataset_pbar]
    return result

In [29]:
y_pred = pd.Series(task1(val.description), index=val.index)

  0%|          | 0/16237 [00:00<?, ?it/s]

In [30]:
y_test = val.is_bad
calegories = np.unique(val.category.tolist())
roc_auc_category = {}

for cat in calegories:
    idx = val[val.category == cat].index
    roc_auc = roc_auc_score(y_test[idx], y_pred[idx])
    roc_auc_category[cat] = roc_auc
    print(f'{cat} - {roc_auc:0.2f}')

print(f'\nROC_AUC = {np.mean(list(roc_auc_category.values())):.2f}')

Бытовая электроника - 0.97
Для бизнеса - 0.97
Для дома и дачи - 0.97
Животные - 0.96
Личные вещи - 0.89
Недвижимость - 0.99
Работа - 0.96
Транспорт - 1.00
Услуги - 0.95
Хобби и отдых - 0.95

ROC_AUC = 0.96


In [34]:
print(f'ROC-AUC validation = {roc_auc_score(y_test, y_pred):.3f}')

ROC-AUC validation = 0.985
