Based on this notebook https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb

# Loading tokenized RuLegalNER datasets

Label types:

- **IND [2]** - Individual
- **LE [4]** - Legal Entity
- **PEN [9]** -Penalty
- **LAW [13]** - Law
- **CR [17]** - Crime

In [1]:
label_list=['B-CR', 'B-IND', 'B-LAW', 'B-LE', 'B-PEN', 'I-CR', 'I-IND', 'I-LAW', 'I-LE', 'I-PEN', 'O']

In [2]:
!pip install datasets transformers seqeval
!pip install accelerate -U
!pip install razdel



In [3]:
import pandas as pd
import numpy as np
import torch

In [4]:
from torch.utils.data import Dataset, DataLoader

# Preprocessing the data

In [5]:
model_checkpoint = "cointegrated/rubert-tiny"

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model.config.id2label

{0: 'B-CR',
 1: 'B-IND',
 2: 'B-LAW',
 3: 'B-LE',
 4: 'B-PEN',
 5: 'I-CR',
 6: 'I-IND',
 7: 'I-LAW',
 8: 'I-LE',
 9: 'I-PEN',
 10: 'O'}

In [8]:
def tokenize_and_align_labels(example, label_all_tokens=True):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    # print('!!!', tokenized_inputs)

    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        # Special tokens have a word id that is None. We set the label to -100 so they are automatically
        # ignored in the loss function.
        if word_idx is None:
            label_ids.append(-100)
        # We set the label for the first token of each word.
        elif word_idx != previous_word_idx:
            label_ids.append(example['labels'][word_idx])
        # For the other tokens in a word, we set the label to either the current label or -100, depending on
        # the label_all_tokens flag.
        else:
            label_ids.append(example.labels[word_idx] if label_all_tokens else -100)
        previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]


    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

In [9]:
example = pd.read_pickle("./pkl/RuLegalNER_example.pkl")
example

Unnamed: 0,tokens,labels
5,"[Решение, по, гражданскому, делу, Дело, №, 2-5...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6,"[Решение, по, гражданскому, делу, Дело, №, 2, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"[Решение, по, административному, делу, Адм, .,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [10]:
np.unique(tokenize_and_align_labels(example.loc[5])['labels'])

array([-100,    4,   10])

In [11]:
class MyDataset(Dataset):
    def __init__(self, data_path, size_percent=1):
        data = pd.read_pickle(data_path)
        self.data = data.loc[:len(data)*size_percent]

    def __getitem__(self, idx):
        item = tokenize_and_align_labels(self.data.loc[idx])
        return item

    def __len__(self):
        return len(self.data)

In [12]:
def get_data(batch_size):
    torch.manual_seed(0)
    np.random.seed(0)

    trainset = MyDataset('./pkl/RuLegalNER_train.pkl')
    testset = MyDataset('./pkl/RuLegalNER_test.pkl')
    valset = MyDataset('./pkl/RuLegalNER_validation.pkl')

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                               shuffle=True, num_workers=2)
    val_loader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                             shuffle=False, num_workers=2)
    test_loader = torch.utils.data.DataLoader(testset, # batch_size=batch_size,
                                              shuffle=False, num_workers=2)

    return train_loader, val_loader, test_loader

# Fine-tuning the model

In [13]:
batch_size = 64

# train_loader, val_loader, test_loader = get_data(batch_size)

In [14]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=9, # epoch count !!!
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)

In [15]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [16]:
from datasets import load_metric
metric = load_metric("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [17]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [18]:
# del trainer

trainer = Trainer(
    model,
    args,
    train_dataset=MyDataset('./pkl/RuLegalNER_train.pkl', size_percent=0.5),
    eval_dataset=MyDataset('./pkl/RuLegalNER_validation.pkl', size_percent=0.5),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [20]:
# trainer.evaluate()

В начале обучения заморозим все параметры в модели, кроме последнего слоя, и посмотрим, насколько хорошо она обучится.

In [19]:
for param in model.bert.parameters():
    param.requires_grad = True

In [20]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)
        print(param)

bert.embeddings.word_embeddings.weight
Parameter containing:
tensor([[ 0.0006, -0.0328, -0.0623,  ..., -0.0355, -0.0559, -0.0132],
        [ 0.0959, -0.0789, -0.0168,  ..., -0.0862, -0.0831, -0.0013],
        [ 0.0060, -0.0554,  0.0143,  ..., -0.0006,  0.0040,  0.0125],
        ...,
        [-0.0306, -0.1518, -0.1168,  ...,  0.0059, -0.0196, -0.0278],
        [ 0.0520, -0.1090,  0.0432,  ..., -0.0089, -0.1032, -0.0405],
        [ 0.0057, -0.2035, -0.0105,  ...,  0.0129,  0.0341, -0.0240]],
       device='mps:0', requires_grad=True)
bert.embeddings.position_embeddings.weight
Parameter containing:
tensor([[-0.0101,  0.0034,  0.0084,  ...,  0.0159,  0.0042, -0.0047],
        [-0.0154,  0.0136,  0.0322,  ..., -0.0268,  0.0189,  0.0112],
        [-0.0157,  0.0082,  0.0333,  ..., -0.0183,  0.0189,  0.0024],
        ...,
        [ 0.0158,  0.0536, -0.0011,  ...,  0.0021, -0.0024,  0.0041],
        [ 0.0124,  0.0313, -0.0068,  ...,  0.0091, -0.0067,  0.0017],
        [ 0.0153,  0.0755,  0.0060

We can now finetune our model by just calling the `train` method:

In [21]:
import logging
from transformers.trainer import logger as noisy_logger
noisy_logger.setLevel(logging.WARNING)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [None]:
trainer.evaluate()

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [23]:
predictions, labels, _ = trainer.predict(MyDataset('./pkl/RuLegalNER_test.pkl'))
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'CR': {'precision': 0.8013868715738182,
  'recall': 0.9293126867698995,
  'f1': 0.8606219180839288,
  'number': 18405},
 'IND': {'precision': 0.8646497283007784,
  'recall': 0.7956376020325423,
  'f1': 0.8287093731965147,
  'number': 36998},
 'LAW': {'precision': 0.755125,
  'recall': 0.8962908011869436,
  'f1': 0.8196743554952511,
  'number': 13480},
 'LE': {'precision': 0.5796178343949044,
  'recall': 0.5499104744852283,
  'f1': 0.5643734925921673,
  'number': 4468},
 'PEN': {'precision': 0.7235324947589099,
  'recall': 0.7692414849899004,
  'f1': 0.7456871813915803,
  'number': 14357},
 'overall_precision': 0.7935219108602611,
 'overall_recall': 0.8223195147535003,
 'overall_f1': 0.8076640966634752,
 'overall_accuracy': 0.9929460463459596}

In [24]:
from sklearn.metrics import confusion_matrix
import pandas as pd

In [25]:
cm = pd.DataFrame(
    confusion_matrix(sum(true_labels, []), sum(true_predictions, []), labels=label_list),
    index=label_list,
    columns=label_list
)
cm

Unnamed: 0,B-CR,B-IND,B-LAW,B-LE,B-PEN,I-CR,I-IND,I-LAW,I-LE,I-PEN,O
B-CR,17197,1,98,0,15,0,1,0,1,0,1092
B-IND,36,29470,1,5,4,0,615,0,9,0,6858
B-LAW,6,2,12099,0,0,0,0,617,0,0,756
B-LE,6,25,2,2695,2,0,1,0,120,0,1617
B-PEN,14,1,3,2,11195,0,0,2,0,0,3140
I-CR,148,0,0,0,3,116,0,0,9,0,139
I-IND,0,40,12,0,0,0,17733,0,0,0,561
I-LAW,0,1,12,0,0,0,0,7071,0,0,77
I-LE,108,17,0,80,1,0,35,0,1180,0,851
I-PEN,0,0,3,0,343,0,0,1,5,0,288


In [26]:
model.save_pretrained('./ner_bert9.bin')
tokenizer.save_pretrained('./ner_bert9.bin')

('./ner_bert7.bin/tokenizer_config.json',
 './ner_bert7.bin/special_tokens_map.json',
 './ner_bert7.bin/vocab.txt',
 './ner_bert7.bin/added_tokens.json',
 './ner_bert7.bin/tokenizer.json')

# Applying the model

In [None]:
import torch

In [None]:
text = ' '.join(ner_train[8]['tokens'])
text = ' '.join(ner_test[4]['tokens'])
text

In [None]:
from transformers import pipeline

In [None]:
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=0)

In [None]:
print(text)
print(pipe(text))

Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .
[{'entity_group': 'DI', 'score': 0.73669535, 'word': 'насморком', 'start': 33, 'end': 42}]
