# Импорт библиотек и константы

In [None]:
!pip install datasets seqeval transformers --quiet
!pip install pandas numpy gdown sklearn 

[?25l[K     |█                               | 10 kB 17.3 MB/s eta 0:00:01[K     |██▏                             | 20 kB 15.0 MB/s eta 0:00:01[K     |███▏                            | 30 kB 10.5 MB/s eta 0:00:01[K     |████▎                           | 40 kB 9.3 MB/s eta 0:00:01[K     |█████▍                          | 51 kB 8.1 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 8.2 MB/s eta 0:00:01[K     |███████▌                        | 71 kB 7.6 MB/s eta 0:00:01[K     |████████▋                       | 81 kB 8.4 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 8.1 MB/s eta 0:00:01[K     |██████████▊                     | 102 kB 8.1 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 8.1 MB/s eta 0:00:01[K     |████████████▉                   | 122 kB 8.1 MB/s eta 0:00:01[K     |██████████████                  | 133 kB 8.1 MB/s eta 0:00:01[K     |███████████████                 | 143 kB 8.1 MB/s eta 0:00:01[K  

In [None]:
import pandas as pd 
import numpy as np 
import random
from collections import defaultdict, Counter
import re

In [None]:
import torch 
from torch import nn 
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader 
from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForPreTraining
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset, DatasetDict, load_metric
import torch.nn.functional as F

## Константы

In [None]:
# Tag inventory: a tag's position in this list is the integer class id used for the model
# config (id2label) and for decoding predictions before they are scored with seqeval.
# NOTE(review): the df_train preview shown in this notebook encodes B-PER as 6, I-PER as 7 and
# B-CHAR as 8 in `BIO_nums`, whereas here index 6 is 'B-FAC' and 'B-PER' sits at index 4.
# Confirm this order matches the encoding used to build the CSVs; if it does not, the
# per-tag report may attribute scores to the wrong entity types.
label_list = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-FAC', 'I-FAC', 'B-CHAR', 'I-CHAR', 'O']
# Checkpoint identifier passed to the tokenizer and model `from_pretrained` calls.
MODEL_PATH = "DeepPavlov/rubert-base-cased"

# Used as both `output_dir` and `logging_dir` of TrainingArguments.
OUTPUT_DIR = '.'
# Maps a run description (see get_key_to_results) to that run's per-tag metrics DataFrame.
res = {}
# Placeholder; rebound to the real tokenizer once AutoTokenizer.from_pretrained is called.
tokenizer = None 

In [None]:
def set_random_seed(seed):
    """Seed every random number generator used in this notebook with ``seed``.

    Switches cuDNN to deterministic mode, then seeds PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), NumPy's global generator and Python's ``random`` module.
    """
    torch.backends.cudnn.deterministic = True
    seeders = (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Seed all RNGs once, ahead of the data loading and model initialisation cells below.
set_random_seed(42)

# Загрузка данных 

In [None]:
def preprocess_labels(labels):
    """Parse a stringified list of integer label ids into a list of ints.

    The CSV stores each label sequence as text such as ``"[10, 10, 8]"``.

    Args:
        labels: either that textual form or an already parsed sequence of ids.
            Accepting parsed input keeps the conversion cell safe to re-run.

    Returns:
        list[int]: the label ids; ``"[]"`` yields an empty list.

    Raises:
        ValueError: if an element cannot be converted to ``int``.
    """
    if not isinstance(labels, str):
        # Already a sequence (e.g. the column was converted by a previous run of the cell).
        return [int(item) for item in labels]

    inner = labels.strip()
    # Drop the surrounding brackets only when present, so bare "1, 2" is not corrupted.
    if inner.startswith('['):
        inner = inner[1:]
    if inner.endswith(']'):
        inner = inner[:-1]

    # Splitting on ',' (not ', ') tolerates any spacing; empty pieces cover the "[]" case.
    return [int(piece) for piece in inner.split(',') if piece.strip()]

In [None]:
# Load the train / test / validation CSVs; the paths are relative to the current working directory.
df_train = pd.read_csv('prozhito/prozhito_data/df_train_prozhito.csv')
df_test = pd.read_csv('prozhito/prozhito_data/df_test_prozhito.csv')
df_val = pd.read_csv('prozhito/prozhito_data/df_val_prozhito.csv')

In [None]:
# `BIO_nums` is read from CSV as text; turn it into real lists of ints for every split.
df_train['BIO_nums'] = df_train['BIO_nums'].apply(preprocess_labels)
df_test['BIO_nums'] = df_test['BIO_nums'].apply(preprocess_labels)
df_val['BIO_nums'] = df_val['BIO_nums'].apply(preprocess_labels)

df_train.head()

Unnamed: 0.1,Unnamed: 0,tokens,BIO_str,BIO_nums,BIO_list
0,0,У меня большая симпатия к Лукьянину — человек ...,O O O O O O O O O O O O O O O O O O O O O,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,2,"> Каким приговором , указом каким > Ты здесь ,...",O O O O O O O O O O O O O O O O O,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,3,"Подумал , что летом ребята куда затащили .",O O O O B-CHAR O O O,"[10, 10, 10, 10, 8, 10, 10, 10]","['O', 'O', 'O', 'O', 'B-CHAR', 'O', 'O', 'O']"
3,4,Нашел потрясающие материалы о В . М . Брадисе ...,O O O O B-PER I-PER I-PER I-PER I-PER O O B-CH...,"[10, 10, 10, 10, 6, 7, 7, 7, 7, 10, 10, 8, 8, ...","['O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER'..."
4,5,[Без даты . ],O O O O,"[10, 10, 10, 10]","['O', 'O', 'O', 'O']"


In [None]:
# Record how many word-level labels each example has;
# the datasets are sorted by this column further down.

df_train['length'] = df_train['BIO_nums'].apply(len)
df_test['length'] = df_test['BIO_nums'].apply(len)
df_val['length'] = df_val['BIO_nums'].apply(len)

In [None]:
# Wrap the three frames into a single DatasetDict, keeping only the columns used downstream.
data = DatasetDict({
    split: Dataset.from_pandas(frame[['tokens', 'BIO_nums', 'length']])
    for split, frame in (('train', df_train), ('test', df_test), ('val', df_val))
})

data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'BIO_nums', 'length'],
        num_rows: 1258
    })
    test: Dataset({
        features: ['tokens', 'BIO_nums', 'length'],
        num_rows: 273
    })
    val: Dataset({
        features: ['tokens', 'BIO_nums', 'length'],
        num_rows: 147
    })
})

# Подготовка данных 

In [None]:
def tokenizer_and_align(texts, max_length=512):
    """Tokenize a batch and give every sub-token the label of the word it came from.

    Each text is split on whitespace and handed to the tokenizer as pre-split words
    (``is_split_into_words=True``). That keeps the word indices reported by
    ``word_ids()`` in step with the positions in ``BIO_nums``; letting the tokenizer
    split the raw string itself can produce extra "words" (for example around
    punctuation glued to a token) and shift every following label.

    Args:
        texts: batch mapping with ``'tokens'`` (space-joined strings, or sequences of
            words) and ``'BIO_nums'`` (one label sequence per example; entries are
            integer ids or tag names from ``label_list``).
        max_length: upper bound on the encoded length used together with
            ``truncation=True``, so truncation still happens when the tokenizer
            does not define its own maximum length.

    Returns:
        The tokenizer output with an added ``'labels'`` entry: one list per example,
        ``-100`` for positions without a source word and the word's label id for
        every sub-token of that word.
    """
    words_batch = [
        text.split() if isinstance(text, str) else list(text)
        for text in texts['tokens']
    ]
    # NOTE(review): byte-level BPE tokenizers (RoBERTa-style) presumably need to be
    # created with add_prefix_space=True to accept pre-split words; confirm before
    # switching MODEL_PATH to such a checkpoint.
    tokenized_input = tokenizer(
        words_batch,
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    outside_id = label_list.index('O')

    labels = []
    for i, label in enumerate(texts['BIO_nums']):
        label_ids = []
        for word_idx in tokenized_input.word_ids(batch_index=i):
            if word_idx is None:
                # No source word for this position: excluded via the -100 id.
                label_ids.append(-100)
            elif word_idx < len(label):
                tag = label[word_idx]
                # Labels may arrive as tag names; map them to their ids.
                label_ids.append(label_list.index(tag) if isinstance(tag, str) else tag)
            else:
                # More words than labels: fall back to the outside tag rather than fail.
                label_ids.append(outside_id)
        labels.append(label_ids)

    tokenized_input['labels'] = labels
    return tokenized_input

In [None]:
# Rebind the module-level `tokenizer` placeholder; tokenizer_and_align() reads this global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [None]:
# Tokenize every split in batches (adding aligned `labels`), then drop the raw text column.
tokenized_datasets = data.map(tokenizer_and_align, batched=True).remove_columns('tokens')

  0%|          | 0/2 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


dict_keys(['input_ids', 'attention_mask'])
dict_keys(['input_ids', 'attention_mask'])


  0%|          | 0/1 [00:00<?, ?ba/s]

dict_keys(['input_ids', 'attention_mask'])


  0%|          | 0/1 [00:00<?, ?ba/s]

dict_keys(['input_ids', 'attention_mask'])


## Сортировка примеров по длине последовательности

In [None]:
# Order the examples of every split by the `length` column computed earlier.
# NOTE(review): Trainer presumably shuffles the training split anyway, so this ordering
# likely only affects evaluation/prediction batches — confirm.
tokenized_datasets = tokenized_datasets.sort('length')

In [None]:
# `length` was only needed for sorting; drop it and display the resulting splits.
tokenized_datasets = tokenized_datasets.remove_columns('length')
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['BIO_nums', 'attention_mask', 'input_ids', 'labels'],
        num_rows: 1258
    })
    test: Dataset({
        features: ['BIO_nums', 'attention_mask', 'input_ids', 'labels'],
        num_rows: 273
    })
    val: Dataset({
        features: ['BIO_nums', 'attention_mask', 'input_ids', 'labels'],
        num_rows: 147
    })
})

# Обучение

In [None]:
def get_key_to_results(model_path, epochs, lr, weight_decay):
    """Build the human-readable key under which one run is stored in ``res``.

    Only the last ``/``-separated component of ``model_path`` is kept as the model
    name; the hyper-parameters are appended in a fixed ``name = value`` layout.
    """
    model_name = model_path.rsplit('/', 1)[-1]
    return f'{model_name}, epochs = {epochs}, lr = {lr}, weight_decay = {weight_decay}'

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the checkpoint with a token-classification head that has one output per entry of label_list.
# NOTE(review): train() loads its own model and the training loop rebinds `model` to None
# before training, so this instance appears to be unused by the visible cells.
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH, num_labels=len(label_list))
# Store the tag names in the config in both directions (id -> tag and tag -> id).
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should prob

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator passed to Trainer as `data_collator` in train().
# Presumably pads inputs and labels to a common length per batch — see the transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval metric object shared by compute_metrics() and predict().
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases in favour
# of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Score one evaluation pass with seqeval and return the overall numbers.

    Args:
        p: pair ``(predictions, labels)``; the predictions are arg-maxed over axis 2
            to obtain one class id per position.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from the
        ``overall_*`` entries of the seqeval result.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        pred_tags, gold_tags = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Ignored index (special tokens): excluded from both sequences.
                continue
            pred_tags.append(label_list[pred_id])
            gold_tags.append(label_list[gold_id])
        true_predictions.append(pred_tags)
        true_labels.append(gold_tags)

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [None]:
def predict(model, trainer, dataset=None):
    """Run the trainer on a dataset and tabulate seqeval scores per tag and overall.

    Args:
        model: unused; kept so existing ``predict(model, trainer)`` calls keep working.
        trainer: object whose ``predict(dataset)`` returns ``(predictions, labels, _)``.
        dataset: split to evaluate; defaults to ``tokenized_datasets["test"]``.

    Returns:
        pandas.DataFrame with columns ``tag``, ``f1``, ``precision``, ``recall`` and
        ``number of occurence``: one row per entity tag (sorted by name) followed by
        an ``Overall`` row whose count is the sum of the per-tag counts. Scores are
        rounded to 4 decimals.
    """
    if dataset is None:
        dataset = tokenized_datasets["test"]

    predictions, labels, _ = trainer.predict(dataset)
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens) from both the predicted and the gold sequences.
    true_predictions, true_labels = [], []
    for prediction, label in zip(predictions, labels):
        kept = [(label_list[p], label_list[l]) for p, l in zip(prediction, label) if l != -100]
        true_predictions.append([pred_tag for pred_tag, _gold in kept])
        true_labels.append([gold_tag for _pred, gold_tag in kept])

    # zero_division=0 matches compute_metrics().
    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    # Per-tag entries are dicts while the overall_* entries are plain numbers; building the
    # rows explicitly keeps every column aligned regardless of how the keys happen to sort.
    rows = [
        {
            'tag': tag,
            'f1': round(results[tag]['f1'], 4),
            'precision': round(results[tag]['precision'], 4),
            'recall': round(results[tag]['recall'], 4),
            'number of occurence': round(results[tag]['number'], 4),
        }
        for tag in sorted(key for key, value in results.items() if isinstance(value, dict))
    ]
    rows.append({
        'tag': 'Overall',
        'f1': round(results['overall_f1'], 4),
        'precision': round(results['overall_precision'], 4),
        'recall': round(results['overall_recall'], 4),
        'number of occurence': sum(row['number of occurence'] for row in rows),
    })

    to_add = pd.DataFrame(
        rows,
        columns=['tag', 'f1', 'precision', 'recall', 'number of occurence'],
    )
    return to_add


In [None]:
def train(model, config):
    """Fine-tune a token-classification model for one hyper-parameter configuration.

    Args:
        model: model to fine-tune. When ``None`` a fresh one is loaded from
            ``MODEL_PATH`` with a head sized to ``label_list``; a model that is
            passed in is used as is instead of being silently replaced.
        config: dict with the keys ``'epochs'``, ``'lr'``, ``'weight_decay'`` and
            ``'bs'`` (used as both the train and the eval per-device batch size).

    Returns:
        The ``Trainer`` after ``trainer.train()`` has finished. It trains on
        ``tokenized_datasets['train']`` and evaluates on ``tokenized_datasets['val']``
        with ``compute_metrics``; evaluation, logging and checkpoint saving are all
        configured per epoch, writing to ``OUTPUT_DIR``.
    """
    import logging
    from transformers.trainer import logger as noisy_logger

    if model is None:
        model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH, num_labels=len(label_list))
    model.config.id2label = dict(enumerate(label_list))
    model.config.label2id = {v: k for k, v in model.config.id2label.items()}

    epochs = config['epochs']
    lr = config['lr']
    weight_decay = config['weight_decay']
    batch_size = config['bs']

    # load_best_model_at_end is not enabled, so the returned trainer holds the weights
    # reached at the end of the last epoch.
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        report_to='none',
        save_strategy='epoch',
        logging_dir=OUTPUT_DIR,
        logging_strategy='epoch'
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['val'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Make sure every parameter is trainable (full fine-tuning, nothing frozen).
    for param in model.parameters():
        param.requires_grad = True

    # Silence the trainer's INFO-level messages during the run.
    noisy_logger.setLevel(logging.WARNING)

    trainer.train()

    return trainer


In [None]:
# Hyper-parameter sets to run; each dict is consumed by train().
# A second variant that was tried and then disabled: lr=1e-5, weight_decay=1e-5
# (same epochs and batch size).
configs = [
    dict(epochs=10, lr=2e-5, weight_decay=1e-4, bs=8),
]

In [None]:
# Collects `trainer.state.log_history` of every run, in the order of `configs`.
histories = [] 

In [None]:
import gc

for config in configs:
    # Release whatever an earlier cell or the previous iteration left behind *before*
    # the next model is loaded; otherwise two models are alive at the same time.
    # (Deleting a name bound to None, as was done before, frees nothing: the trained
    # network is referenced by the trainer.)
    model = None
    trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    trainer = train(model, config)
    model = trainer.model  # the network that was actually trained
    to_add = predict(model, trainer)
    new_key = get_key_to_results(MODEL_PATH, config['epochs'], config['lr'], config['weight_decay'])

    res[new_key] = to_add
    histories.append(trainer.state.log_history)
    # The last run's `trainer` (and its model) deliberately stay available after the loop.

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should prob

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.0052,0.538973,0.294931,0.831169,0.435374,0.922671
2,0.4523,0.889089,0.401274,0.818182,0.538462,0.947715
3,0.3956,0.805938,0.398773,0.844156,0.541667,0.945079
4,0.1962,0.92692,0.492188,0.818182,0.614634,0.961775
5,0.1039,1.004954,0.451852,0.792208,0.575472,0.956942


Configuration saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-158/config.json
Model weights saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-158/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-158/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-158/special_tokens_map.json
Configuration saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-316/config.json
Model weights saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-316/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-316/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpoint-316/special_tokens_map.json
Configuration saved in /content/drive/MyDrive/prozhito/ruroberta_checkpoints/checkpo

### Результаты

In [None]:
# Stack the per-run tables into one frame; the keys of `res` (run descriptions) become the
# outer index level, as seen in the rendered output.
tmp = pd.concat(res) 
tmp

Unnamed: 0,Unnamed: 1,tag,f1,precision,recall,number of occurence
"ruRoberta-large, epochs = 5, lr = 2e-05, weight_decay = 0.0001",0,CHAR,0.6746,0.5327,0.9194,62
"ruRoberta-large, epochs = 5, lr = 2e-05, weight_decay = 0.0001",1,FAC,0.375,0.2308,1.0,3
"ruRoberta-large, epochs = 5, lr = 2e-05, weight_decay = 0.0001",2,LOC,0.7143,0.5882,0.9091,33
"ruRoberta-large, epochs = 5, lr = 2e-05, weight_decay = 0.0001",3,ORG,0.1538,0.0909,0.5,2
"ruRoberta-large, epochs = 5, lr = 2e-05, weight_decay = 0.0001",4,PER,0.7457,0.6028,0.9773,132
"ruRoberta-large, epochs = 5, lr = 2e-05, weight_decay = 0.0001",5,Overall,0.7006,0.5556,0.9483,232
