# Fine-tuning BERT модели под NER задачу



In [14]:
!pip install datasets > /dev/null
!pip install seqeval > /dev/null
!pip install transformers > /dev/null
!pip install ipymarkup > /dev/null

In [15]:
import numpy as np
import torch
import pandas as pd
from tqdm import tqdm
import string
import logging
from transformers.trainer import logger as noisy_logger
from datasets import Dataset, DatasetDict, load_metric
from ipymarkup import show_span_box_markup
import re

from transformers import (
    AutoModel,
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, 
    Trainer,
    pipeline
)

In [3]:
# Shared run configuration: base checkpoint to fine-tune, per-device batch
# size, and the device the model is moved to (first GPU when one is visible,
# otherwise the CPU).
model_checkpoint = "cointegrated/rubert-tiny-toxicity"
batch_size = 128

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [4]:
# Load the raw comments table from a CSV file in the working directory.
df = pd.read_csv('toxic_dataset.csv')

In [5]:
# Preview the first five rows to check the available columns.
df.head(5)

Unnamed: 0.1,Unnamed: 0,text,label
0,0,эт точно на кол всех путипидорови едропукиных,1.0
1,6,я не хочу сидеть сегодня дома :( давайте погул...,1.0
2,15,"Кого угодно, но не этих двух отбросов: свинью ...",1.0
3,18,Вот тебе НА... Jeff Hardy покидает TNA ...\nСм...,1.0
4,23,"Ударилась сегодня в творчество ,чтоб поднять н...",1.0


# Подготовка датасета под NER задачу

In [6]:
# Load the toxic-word lexicon: one entry per line of the file.
# The encoding is pinned so the result does not depend on the platform's
# default locale (assumes the file is stored as UTF-8 — TODO confirm).
with open('bad_words.csv', encoding='utf-8') as f:
    bad_words = f.read().splitlines()

In [7]:
# Substitutions applied by clean_text, in this exact order. Raw strings are
# used so that regex escapes such as \[ \S \w \d are not parsed as (invalid)
# Python string escapes, which triggers warnings on recent interpreters.
_CLEAN_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\[.*?\]',                              # text enclosed in square brackets
        r'https?://\S+|www\.\S+',                # URLs
        r'<.*?>+',                               # angle-bracket tags
        '[%s]' % re.escape(string.punctuation),  # ASCII punctuation characters
        r'\n',                                   # newlines
        r'\w*\d\w*',                             # any token containing a digit
    )
)


def clean_text(text):
    """
    Normalize a piece of text for lexicon lookup.

    The input is converted with ``str()`` and lower-cased, then bracketed
    fragments, URLs, angle-bracket tags, ASCII punctuation, newlines and
    tokens containing digits are removed (each replaced by the empty string).

    Args:
        text: value to clean; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned, lower-cased string. Whitespace other than newlines is
        left untouched, so removed fragments may leave double spaces.
    """
    text = str(text).lower()
    for pattern in _CLEAN_PATTERNS:
        text = pattern.sub('', text)
    return text

Здесь я обучаю модель не на всём количестве данных, чтобы ускорить процесс.

In [8]:
# Only the first N_SAMPLES texts are used, to keep the notebook fast.
# The first N_TRAIN of them form the train split; the remainder is the test split.
N_SAMPLES = 1200
N_TRAIN = 1100

# Membership is tested once per word, so use a set: looking a word up in the
# original list scans the whole lexicon every time.
bad_words_lookup = set(bad_words)

corpus = []

for line in tqdm(df['text'][:N_SAMPLES]):
    # Whitespace tokenization; a word is tagged TOXIC when its cleaned form
    # appears in the lexicon, otherwise O. The original (uncleaned) word is kept
    # as the token.
    texts = line.split()
    tags = ['TOXIC' if clean_text(word) in bad_words_lookup else 'O' for word in texts]
    corpus.append({'tokens': texts, 'tags': tags})


# The test split used to be corpus[1200:1100], a slice that is always empty,
# while the train split covered every example. Split at N_TRAIN instead so the
# two parts are disjoint and the test split is non-empty.
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(corpus[:N_TRAIN])),
    'test': Dataset.from_pandas(pd.DataFrame(corpus[N_TRAIN:]))
})

# !huggingface-cli login
# ner_data.push_to_hub("tesemnikov-av/toxic_dataset_ner")

100%|██████████| 1200/1200 [00:56<00:00, 21.10it/s]


In [9]:
# Show the tokens and tags of training example 876 side by side as a table.
pd.DataFrame(ner_data['train'][876])

Unnamed: 0,tokens,tags
0,быдло,TOXIC
1,оно,O
2,и,O
3,в,O
4,оше,O
5,быдло,TOXIC


In [10]:
# Load the tokenizer that belongs to the base checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Args:
        examples: a batch with ``"tokens"`` (one list of words per example)
            and ``"tags"`` (one list of tags per example, parallel to the
            words).
        label_all_tokens: when True, every sub-token of a word receives the
            word's label; when False, only the first sub-token does and the
            remaining ones get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of integer label ids per example. Positions with no word id (special
        tokens) are labelled -100.

    Raises:
        ValueError: if a string tag is not present in the module-level
            ``label_list``.
    """
    # No padding at this stage: with a batched map, padding=True would pad every
    # example to the longest one in the whole map batch. The notebook hands a
    # DataCollatorForTokenClassification to the Trainer, which pads inputs and
    # labels per training batch instead.
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, word_tags in enumerate(examples['tags']):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Token that maps to no word: -100 marks it as ignored.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or a continuation that should
                # inherit the word's label.
                label_ids.append(word_tags[word_idx])
            else:
                # Continuation sub-token that must not contribute a label.
                label_ids.append(-100)
            previous_word_idx = word_idx

        # Map string tags to their index in label_list; integers (-100 or
        # already-encoded labels) pass through unchanged.
        labels.append([label_list.index(tag) if isinstance(tag, str) else tag for tag in label_ids])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Label vocabulary. A tag's position in this list is its integer class id
# (tokenize_and_align_labels encodes tags with label_list.index). It is defined
# after that function, which is fine because the name is looked up at call time.
label_list = ['O', 'TOXIC']

In [11]:
# Tokenize every split and attach the aligned integer labels (batched map).
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

# Load the checkpoint as a bare encoder, save it locally, then reload that copy
# with a token-classification head of two labels. The warnings printed by this
# cell show the checkpoint's own classifier weights being dropped on the first
# load and a new classifier being initialised on the second.
# NOTE(review): presumably this round-trip avoids a shape mismatch with the
# checkpoint's original classifier head — confirm.
model = AutoModel.from_pretrained(model_checkpoint)
model.save_pretrained('tmp_model')
model = AutoModelForTokenClassification.from_pretrained('tmp_model', num_labels=2)

  0%|          | 0/2 [00:00<?, ?ba/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny-toxicity were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at tmp_model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Store the id <-> label mappings on the model config so they match label_list,
# then move the model to the selected device.
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}
model = model.to(device)

# Training hyper-parameters. "ner" is the first positional argument
# (the output directory in TrainingArguments). Evaluation runs once per epoch
# and external experiment reporting is switched off.
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    save_steps=1000,
    report_to='none',
)

# Collator that batches token-classification examples, and the seqeval metric
# used by compute_metrics.
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall precision / recall / F1 / accuracy.

    Args:
        p: a pair ``(predictions, labels)``; the class is chosen with an
            argmax over axis 2 of ``predictions``.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy`` taken from the ``overall_*`` entries of the metric result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the ignore index (-100).
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# Freeze the whole encoder first. This loop used to set requires_grad = True on
# every parameter, which made the selective loop below redundant: every encoder
# parameter stayed trainable regardless of its index.
for param in model.bert.parameters():
    param.requires_grad = False

# Re-enable gradients only for the encoder parameter tensors with index > 38.
# `num` counts parameter tensors in model.bert.parameters() order, not
# transformer layers. Parameters outside model.bert are not touched here.
for num, param in enumerate(model.bert.parameters()):
    if num > 38:
        print('requires_grad: True on {} layer'.format(num))
        param.requires_grad = True

# Raise the Trainer module's logger threshold to WARNING to cut down log output.
noisy_logger.setLevel(logging.WARNING)

# Wire the model, arguments, tokenized splits, collator and metric function
# into a Trainer.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start fine-tuning.
trainer.train()

requires_grad: True on 39 layer
requires_grad: True on 40 layer
requires_grad: True on 41 layer
requires_grad: True on 42 layer
requires_grad: True on 43 layer
requires_grad: True on 44 layer
requires_grad: True on 45 layer
requires_grad: True on 46 layer
requires_grad: True on 47 layer
requires_grad: True on 48 layer
requires_grad: True on 49 layer
requires_grad: True on 50 layer
requires_grad: True on 51 layer
requires_grad: True on 52 layer




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

# Инференс модели

In [None]:
# Load a model and tokenizer by the identifier 'tesemnikov-av/rubert-ner-toxicity'.
# Note that this rebinds `model` and `tokenizer`, so the objects created in the
# training cells above are no longer referenced by these names.
model = AutoModelForTokenClassification.from_pretrained('tesemnikov-av/rubert-ner-toxicity')
tokenizer = AutoTokenizer.from_pretrained('tesemnikov-av/rubert-ner-toxicity')

# NER pipeline; aggregation_strategy='average' makes it return grouped entities
# (the demo cell reads 'start', 'end' and 'entity_group' from each result).
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average')

In [16]:
# Tag a sample sentence (lower-cased before it is passed to the pipeline) and
# draw the predicted spans over the original text.
text = "Вот дурак то! И надо-же быть таким придурком!"
spans = pipe(text.lower())

# (start, end, label) triples in the form show_span_box_markup is called with.
spans_list = [
    (span['start'], span['end'], span['entity_group'])
    for span in spans
]

show_span_box_markup(text, spans_list)