Based on this notebook https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb

# Loading tokenized RuLegalNER datasets

Label types:

- **IND [2]** - Individual
- **LE [4]** - Legal Entity
- **PEN [9]** -Penalty
- **LAW [13]** - Law
- **CR [17]** - Crime

In [1]:
label_list=['B-CR', 'B-IND', 'B-LAW', 'B-LE', 'B-PEN', 'I-CR', 'I-IND', 'I-LAW', 'I-LE', 'I-PEN', 'O']

In [4]:
%%capture
!pip install datasets transformers seqeval
!pip install accelerate -U
!pip install razdel
!pip install colab

In [5]:
import colab

ImproperlyConfigured: Celery 5.x requires Django 1.11 or later.

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
from torch.utils.data import Dataset, DataLoader

# Preprocessing the data

In [None]:
model_checkpoint = "cointegrated/rubert-tiny"

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
model.config.id2label

{0: 'B-CR',
 1: 'B-IND',
 2: 'B-LAW',
 3: 'B-LE',
 4: 'B-PEN',
 5: 'I-CR',
 6: 'I-IND',
 7: 'I-LAW',
 8: 'I-LE',
 9: 'I-PEN',
 10: 'O'}

In [None]:
def tokenize_and_align_labels(example, label_all_tokens=True):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    # print('!!!', tokenized_inputs)

    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        # Special tokens have a word id that is None. We set the label to -100 so they are automatically
        # ignored in the loss function.
        if word_idx is None:
            label_ids.append(-100)
        # We set the label for the first token of each word.
        elif word_idx != previous_word_idx:
            label_ids.append(example['labels'][word_idx])
        # For the other tokens in a word, we set the label to either the current label or -100, depending on
        # the label_all_tokens flag.
        else:
            label_ids.append(example.labels[word_idx] if label_all_tokens else -100)
        previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]


    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

In [None]:
example = pd.read_pickle("/content/drive/MyDrive/coursework2024/RuLegalNER_example.pkl")
example

Unnamed: 0,tokens,labels
5,"[Решение, по, гражданскому, делу, Дело, №, 2-5...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6,"[Решение, по, гражданскому, делу, Дело, №, 2, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"[Решение, по, административному, делу, Адм, .,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [None]:
np.unique(tokenize_and_align_labels(example.loc[5])['labels'])

array([-100,    4,   10])

In [None]:
class MyDataset(Dataset):
    def __init__(self, data_path):
        data = pd.read_pickle(data_path)
        self.data = data

    def __getitem__(self, idx):
        item = tokenize_and_align_labels(self.data.loc[idx])
        return item

    def __len__(self):
        return len(self.data)

In [None]:
def get_data(batch_size):
    torch.manual_seed(0)
    np.random.seed(0)

    trainset = MyDataset('/content/drive/MyDrive/coursework2024/RuLegalNER_train.pkl')
    testset = MyDataset('/content/drive/MyDrive/coursework2024/RuLegalNER_test.pkl')
    valset = MyDataset('/content/drive/MyDrive/coursework2024/RuLegalNER_validation.pkl')

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                               shuffle=True, num_workers=2)
    val_loader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                             shuffle=False, num_workers=2)
    test_loader = torch.utils.data.DataLoader(testset, # batch_size=batch_size,
                                              shuffle=False, num_workers=2)

    return train_loader, val_loader, test_loader

# Fine-tuning the model

In [None]:
batch_size = 64

# train_loader, val_loader, test_loader = get_data(batch_size)

In [None]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3, # epoch count !!!
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from datasets import load_metric
metric = load_metric("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# del trainer

trainer = Trainer(
    model,
    args,
    train_dataset=MyDataset('/content/drive/MyDrive/coursework2024/RuLegalNER_train.pkl'),
    eval_dataset=MyDataset('/content/drive/MyDrive/coursework2024/RuLegalNER_validation.pkl'),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 2.6039135456085205,
 'eval_precision': 0.002253073298522289,
 'eval_recall': 0.12181721367024881,
 'eval_f1': 0.0044243165406702934,
 'eval_accuracy': 0.029910906025780025,
 'eval_runtime': 4014.1625,
 'eval_samples_per_second': 3.471,
 'eval_steps_per_second': 0.054}

В начале обучения заморозим все параметры в модели, кроме последнего слоя, и посмотрим, насколько хорошо она обучится.

In [None]:
for param in model.bert.parameters():
    param.requires_grad = False

In [None]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)
        print(param)

classifier.weight
Parameter containing:
tensor([[-9.0127e-03, -3.2471e-02, -1.9451e-02,  ..., -1.8803e-02,
          7.7132e-02, -1.5778e-02],
        [ 2.0753e-02,  2.5036e-02,  8.2198e-03,  ..., -4.7441e-02,
         -5.4137e-03,  5.5149e-03],
        [ 6.2620e-03,  3.2181e-02, -2.3803e-03,  ...,  1.4437e-02,
          3.5842e-02, -1.6756e-02],
        ...,
        [ 2.9666e-02, -2.4053e-02, -4.4269e-05,  ...,  3.1843e-03,
         -3.8098e-02, -7.2351e-03],
        [-1.3779e-02,  3.5150e-02, -1.8063e-02,  ...,  1.0355e-02,
         -1.3671e-02, -2.1359e-03],
        [ 2.6931e-02, -4.2387e-03, -3.6380e-03,  ..., -1.7359e-02,
         -1.8246e-02, -9.7428e-03]], requires_grad=True)
classifier.bias
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)


We can now finetune our model by just calling the `train` method:

In [None]:
import logging
from transformers.trainer import logger as noisy_logger
noisy_logger.setLevel(logging.WARNING)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [None]:
trainer.evaluate()

{'epoch': 20.0,
 'eval_accuracy': 0.9050170279923582,
 'eval_f1': 0.6448696700316409,
 'eval_loss': 0.3367559015750885,
 'eval_precision': 0.6370943733253944,
 'eval_recall': 0.652837095790116,
 'eval_runtime': 1.1185,
 'eval_samples_per_second': 860.049,
 'eval_steps_per_second': 54.535}

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [None]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

  _warn_prf(average, modifier, msg_start, len(result))


{'ADR': {'f1': 0.30279898218829515,
  'number': 446,
  'precision': 0.35,
  'recall': 0.26681614349775784},
 'DI': {'f1': 0.493963782696177,
  'number': 821,
  'precision': 0.4207369323050557,
  'recall': 0.5980511571254568},
 'Drugclass': {'f1': 0.7868852459016393,
  'number': 336,
  'precision': 0.7880597014925373,
  'recall': 0.7857142857142857},
 'Drugform': {'f1': 0.7922794117647058,
  'number': 565,
  'precision': 0.8240917782026769,
  'recall': 0.7628318584070797},
 'Drugname': {'f1': 0.8734309623430963,
  'number': 918,
  'precision': 0.8400402414486922,
  'recall': 0.9095860566448801},
 'Finding': {'f1': 0.0, 'number': 192, 'precision': 0.0, 'recall': 0.0},
 'overall_accuracy': 0.9050170279923582,
 'overall_f1': 0.6448696700316409,
 'overall_precision': 0.6370943733253944,
 'overall_recall': 0.652837095790116}

In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd

In [None]:
cm = pd.DataFrame(
    confusion_matrix(sum(true_labels, []), sum(true_predictions, []), labels=label_list),
    index=label_list,
    columns=label_list
)
cm

Unnamed: 0,O,B-ADR,B-DI,B-Drugclass,B-Drugform,B-Drugname,B-Finding,I-ADR,I-DI,I-Drugclass,I-Drugform,I-Drugname,I-Finding
O,19494,29,175,35,60,71,0,20,26,0,0,0,0
B-ADR,159,135,133,8,2,0,0,4,5,0,0,0,0
B-DI,242,21,525,0,17,10,0,3,3,0,0,0,0
B-Drugclass,50,1,17,264,0,4,0,0,0,0,0,0,0
B-Drugform,98,4,11,1,432,17,0,1,1,0,0,0,0
B-Drugname,44,1,16,1,8,848,0,0,0,0,0,0,0
B-Finding,56,32,87,5,3,3,0,1,5,0,0,0,0
I-ADR,180,51,40,0,1,0,0,47,30,0,0,0,0
I-DI,236,17,102,10,0,1,0,11,46,0,0,0,0
I-Drugclass,0,0,0,4,0,0,0,0,0,0,0,0,0


In [None]:
model.save_pretrained('/content/drive/MyDrive/coursework2024/ner_bert.bin')
tokenizer.save_pretrained('/content/drive/MyDrive/coursework2024/ner_bert.bin')

Configuration saved in ner_bert.bin/config.json
Model weights saved in ner_bert.bin/pytorch_model.bin
tokenizer config file saved in ner_bert.bin/tokenizer_config.json
Special tokens file saved in ner_bert.bin/special_tokens_map.json


('ner_bert.bin/tokenizer_config.json',
 'ner_bert.bin/special_tokens_map.json',
 'ner_bert.bin/vocab.txt',
 'ner_bert.bin/added_tokens.json',
 'ner_bert.bin/tokenizer.json')

# Applying the model

In [None]:
import torch

In [None]:
text = ' '.join(ner_train[8]['tokens'])
text = ' '.join(ner_test[4]['tokens'])
text

In [None]:
from transformers import pipeline

In [None]:
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=0)

In [None]:
print(text)
print(pipe(text))

Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .
[{'entity_group': 'DI', 'score': 0.73669535, 'word': 'насморком', 'start': 33, 'end': 42}]
