# cointegrated/rubert-tiny2

In [173]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm
from sklearn.metrics import f1_score

In [174]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [175]:
def read_dataset(filename, splitter="\t"):
    data = []
    sentence = []
    tags = []
    with open(filename) as f:
        for line in f:
            if not line.isspace():
                word, tag = line.split(splitter)
                sentence.append(word)
                tags.append(tag.strip())
            else:
                data.append((sentence, tags))
                sentence = []
                tags = []
    return data

In [176]:
training_data = read_dataset("task1/train.tsv")

In [177]:
import pandas as pd
ner_data = pd.DataFrame(training_data, columns=['tokens', 'tags'])

In [178]:
ner_data

Unnamed: 0,tokens,tags
0,"[also, ,, i, have, recently, discovered, advil...","[O, O, O, O, O, O, B-Object, O, O, O, B-Predic..."
1,"[i, have, always, heard, that, motrin, is, bet...","[O, O, O, O, O, B-Object, O, B-Predicate, O, B..."
2,"[when, i, was, a, figure, skater, i, injuried,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[in, a, way, ,, halloween, is, even, better, t...","[O, O, O, O, B-Object, O, O, B-Predicate, O, B..."
4,"[i, think, halloween, is, actually, safer, tha...","[O, O, B-Object, O, O, B-Predicate, O, O, O, B..."
...,...,...
2329,"[isn, ', t, plastic, safer, than, wood, .]","[O, O, O, B-Object, B-Predicate, O, B-Object, O]"
2330,"[and, plastic, pallets, are, safer, than, wood...","[O, B-Object, O, O, B-Predicate, O, B-Object, ..."
2331,"[plastic, laminate, flooring, is, one, of, the...","[B-Object, O, B-Aspect, O, O, O, O, O, O, O, O..."
2332,"[plastic, has, long, been, considered, superio...","[B-Object, O, O, O, O, B-Predicate, O, B-Objec..."


In [179]:
label_list = []
for item in ner_data['tags']:
    label_list.extend(item)
label_list = list(set(label_list))
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list
label_list

['O',
 'B-Object',
 'I-Object',
 'B-Predicate',
 'B-Aspect',
 'I-Predicate',
 'I-Aspect']

In [295]:
from sklearn.model_selection import train_test_split
# ner_data = [extract_labels(item) for item in drugs]
ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)

In [296]:
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict

In [297]:
ner_dataset = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 1867
    })
    test: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 467
    })
})

In [506]:
batch_size = 16
from transformers import AutoTokenizer
model_checkpoint = "cointegrated/rubert-tiny2"
result_name = 'cointegrated-rubert-tiny2'

model_checkpoint = 'dslim/bert-base-NER'
result_name = 'dslim-bert-base-NER'

model_checkpoint = 'nsi319/distilbert-base-uncased-finetuned-app'
result_name = 'distilbert-base-uncased-finetuned-app'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=False)

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--nsi319--distilbert-base-uncased-finetuned-app/snapshots/0c04e35247420b8be70088d1b15897fcac0a25f3/config.json
Model config DistilBertConfig {
  "_name_or_path": "nsi319/distilbert-base-uncased-finetuned-app",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Education",
    "1": "Entertainment",
    "2": "Productivity",
    "3": "Sports",
    "4": "News & Magazines",
    "5": "Photography"
  },
  "initializer_range": 0.02,
  "label2id": {
    "Education": 0,
    "Entertainment": 1,
    "News & Magazines": 4,
    "Photography": 5,
    "Productivity": 2,
    "Sports": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "s

In [507]:
tokenizer("Hello, this is one sentence!")

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [508]:
# from transformers import RobertaTokenizerFast
# tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
# tokenizer("Hello world")['input_ids']
# tokenizer(" Hello world")['input_ids']

In [509]:
tokenized_input = tokenizer(ner_dataset['test']["tokens"][0], is_split_into_words=True,  return_offsets_mapping=True, )
print(tokenized_input)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
PRE_WORD = '##'
# SOS_TOKEN = "[CLS]"
# EOS_TOKEN = "[SEP]"
print(tokens)

{'input_ids': [101, 2057, 6160, 1037, 4045, 2817, 2008, 7860, 1996, 2801, 2008, 5509, 1998, 3886, 3121, 2024, 13726, 1999, 1037, 2543, 2084, 2216, 2007, 3536, 20241, 1010, 2758, 2585, 12110, 1010, 2343, 1998, 5766, 1997, 1042, 19498, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 2), (0, 7), (0, 1), (0, 10), (0, 5), (0, 4), (0, 10), (0, 3), (0, 4), (0, 4), (0, 8), (0, 3), (0, 5), (0, 9), (0, 3), (0, 5), (0, 2), (0, 1), (0, 4), (0, 4), (0, 5), (0, 4), (0, 4), (0, 7), (0, 1), (0, 4), (0, 5), (0, 7), (0, 1), (0, 9), (0, 3), (0, 3), (0, 2), (0, 1), (1, 4), (0, 1), (0, 0)]}
['[CLS]', 'we', 'welcome', 'a', 'scientific', 'study', 'that', 'challenges', 'the', 'idea', 'that', 'concrete', 'and', 'steel', 'buildings', 'are', 'safer', 'in', 'a', 'fire', 'than', 'those', 'with', 'wood', 'framing', ',', 'says', 'david', 'lindsay', ',', 'president', 'and', 'ceo', 'of', 'f'

In [510]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [511]:
tokenize_and_align_labels(ner_dataset['train'][22:23])

{'input_ids': [[101, 1998, 8224, 7164, 2049, 6960, 20874, 1011, 1002, 2753, 2566, 5310, 2566, 2095, 1011, 2003, 16325, 1998, 16269, 2084, 7513, 1005, 1055, 2436, 2230, 3872, 13202, 2030, 4007, 16375, 3454, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 4, 6, 0, -100]]}

In [512]:
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True)
# tokenized_datasets['train'][0]

Map:   0%|          | 0/1867 [00:00<?, ? examples/s]

Map:   0%|          | 0/467 [00:00<?, ? examples/s]

In [513]:
label_list

['O',
 'B-Object',
 'I-Object',
 'B-Predicate',
 'B-Aspect',
 'I-Predicate',
 'I-Aspect']

In [514]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, 
                                                        num_labels=len(label_list),
                                                       ignore_mismatched_sizes=True)
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--nsi319--distilbert-base-uncased-finetuned-app/snapshots/0c04e35247420b8be70088d1b15897fcac0a25f3/config.json
Model config DistilBertConfig {
  "_name_or_path": "nsi319/distilbert-base-uncased-finetuned-app",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_po

In [515]:
batch_size = 16

args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

PyTorch: setting up devices


In [516]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [517]:
metric = load_metric("seqeval")

In [518]:
example = ner_dataset['train'][4]
labels = example['tags']
metric.compute(predictions=[labels], references=[labels])

{'Aspect': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Object': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Predicate': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [519]:
import numpy as np

def compute_metrics(p):
    predictions, labels, inputs = p.predictions, p.label_ids, p.inputs
    predictions = np.argmax(p.predictions, axis=2)

    # send only the first token of each word to the evaluation
    true_predictions = []
    true_labels = []
    for prediction, label, tokens in zip(predictions, labels, inputs):
        true_predictions.append([])
        true_labels.append([])
        # print(prediction, label, tokens)
        for (p, l, t) in zip(prediction, label, tokens):
            # print(l, p, tokenizer.convert_ids_to_tokens(int(t)))
            if l != -100 and not tokenizer.convert_ids_to_tokens(int(t)).startswith(PRE_WORD):
                # print('append')
                true_predictions[-1].append(label_list[p])
                true_labels[-1].append(label_list[l])
    # print(true_predictions, true_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [520]:
# разморозка
for param in model.parameters():
    param.requires_grad = True

In [521]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.05,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

PyTorch: setting up devices


In [522]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [523]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, tags, __index_level_0__. If tokens, tags, __index_level_0__ are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1867
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1170
  Number of trainable parameters = 66368263
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.259329,0.785192,0.805298,0.795118,0.924525
2,No log,0.206013,0.78648,0.842384,0.813473,0.932606
3,No log,0.187274,0.822979,0.853863,0.838137,0.93952
4,No log,0.183297,0.815789,0.875938,0.844795,0.941103
5,0.271300,0.177583,0.820066,0.873289,0.845841,0.940603
6,0.271300,0.179964,0.819564,0.880353,0.848872,0.941686
7,0.271300,0.18136,0.812576,0.884327,0.846934,0.940437
8,0.271300,0.181294,0.818294,0.880795,0.848395,0.94127
9,0.113900,0.181786,0.819679,0.879029,0.848317,0.940853
10,0.113900,0.183474,0.814029,0.881236,0.846301,0.940103


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, tags, __index_level_0__. If tokens, tags, __index_level_0__ are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, tags, __index_level_0__. If tokens, tags, __index_level_0__ are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, tags, __index_level_0__. If tokens, tags, __index_level_0__ are not

TrainOutput(global_step=1170, training_loss=0.17901803367158287, metrics={'train_runtime': 40.8213, 'train_samples_per_second': 457.359, 'train_steps_per_second': 28.661, 'total_flos': 267985057282668.0, 'train_loss': 0.17901803367158287, 'epoch': 10.0})

In [524]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, tags, __index_level_0__. If tokens, tags, __index_level_0__ are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16


{'eval_loss': 0.18347392976284027,
 'eval_precision': 0.8140293637846656,
 'eval_recall': 0.8812362030905078,
 'eval_f1': 0.8463006147975408,
 'eval_accuracy': 0.9401032989003666,
 'eval_runtime': 0.4529,
 'eval_samples_per_second': 1031.037,
 'eval_steps_per_second': 66.234,
 'epoch': 10.0}

In [532]:
DEV = False
# if DEV:
#     test_data = read_dataset("task1/dev_no_answers.tsv", splitter="\n")
# else:
test_data = read_dataset("task1/test_no_answers.tsv", splitter="\n")

In [533]:
def predict_sentence(sentence):
    inputs = tokenizer(sentence,
                        is_split_into_words=True, 
                        return_offsets_mapping=True, 
                        padding='max_length', 
                        truncation=True, 
                        return_tensors="pt")
    # move to gpu
    ids = inputs["input_ids"].to('cuda:0')
    mask = inputs["attention_mask"].to('cuda:0')
    # forward pass
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [label_list[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

    prediction = []
    # print(inputs["offset_mapping"].squeeze().tolist())
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
      #only predictions on first word pieces are important
        if mapping[0] == 0 and mapping[1] != 0:
            prediction.append(token_pred[1])
        else:
            continue
    return prediction

sentence = ner_dataset['train'][15]['tokens'] #"@HuggingFace is a company based in New York, but is also has employees working in Paris"
print(sentence)
print(predict_sentence(sentence))

print(test_data[2][0])
print(predict_sentence(test_data[2][0]))


['(', 'of', 'course', ',', 'fox', 'may', 'be', 'even', 'worse', 'than', 'cnn', '.)']
['O', 'O', 'O', 'O', 'B-Object', 'O', 'O', 'O', 'B-Predicate', 'O', 'B-Object', 'O']
['the', 'version', 'we', 'showed', 'here', 'is', 'ios', 'only', ',', 'because', 'the', 'ios', 'code', 'supported', 'ibeacons', 'earlier', 'than', 'android', ',', 'but', 'we', 'are', 'almost', 'finished', 'with', 'an', 'android', 'one', 'as', 'well', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Predicate', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [534]:
with open(f"task1/{result_name}.tsv", "w") as f:
    with torch.no_grad():
        for sentence in tqdm(test_data):
            # print(sentence[0])
            prediction = predict_sentence(sentence[0])
            for w,t in zip(sentence[0], prediction):
                # print(w, '\t', t)
                f.write(w+'\t'+t+'\n')
            f.write('\n')


100%|██████████| 360/360 [00:02<00:00, 162.38it/s]


In [535]:
def fix_bio_in_file(fn, fnout):
    words = []
    tags = []
    prev_tags = []
    with open(fn, "r") as f:
        for line in f:
            line = line.strip("\r\n").split("\t")
            if len(line) > 1:
                words.append(line[0])
                tags.append(line[1])
            else:
                words.append(line[0])
                tags.append('')

    with open(fnout, "w") as f:
        prev_tags = tags.copy()
        prev_tags.insert(0, 'O')
        # print(prev_tags)
        for w, t, p in zip(words, tags, prev_tags):
            # print(w,t,p)
            if p.startswith('O') and t.startswith('I'):
                print("correct I->B")
                print(w, t, p)
                t = 'B' + t[1:]
                print(w, t, p)
            if t.startswith('B') and t == p:
                print("correct B->I")
                t = 'I' + t[1:]
            if w == '':
                f.write('\n')
            else:
                f.write(w + '\t'+ t + '\n')

fix_bio_in_file(f"task1/{result_name}.tsv", f"task1/{result_name}-fix.tsv")             

correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct I->B
friendly I-Predicate O
friendly B-Predicate O
correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct I->B
up I-Aspect O
up B-Aspect O
correct I->B
college I-Aspect O
college B-Aspect O
correct B->I
correct I->B
carbon I-Aspect O
carbon B-Aspect O
correct B->I
correct B->I
correct B->I
correct B->I
correct B->I
correct I->B
challenging I-Predicate O
challenging B-Predicate O
correct B->I
correct I->B
benz I-Object O
benz B-Object O
correct I->B
force I-Aspect O
force B-Aspect O
correct I->B
force I-Aspect O
force B-Aspect O
correct I->B
- I-Aspect O
- B-Aspect O
correct B->I
correct I->B
room I-Aspect O
room B-Aspect O
correct I->B
t I-Predicate O
t B-Predicate O
correct I->B
as I-Predicate O
as B-Predicate O
correct B->I
correct I->B
t I-Predicate O
t B-Predicate O
correct B->I
correct B->I
correct B->I
correct B->I
correct B->

In [536]:
from task1.evaluation.evaluate_f1_partial import main

In [530]:
main('task1/dev.tsv', f'task1/{result_name}-fix.tsv', f'task1/{result_name}_report.txt')

In [538]:
trainer.save_model('task1/models/distilbert-base-uncased-finetuned-app')

Saving model checkpoint to task1/models/distilbert-base-uncased-finetuned-app
Configuration saved in task1/models/distilbert-base-uncased-finetuned-app/config.json
Model weights saved in task1/models/distilbert-base-uncased-finetuned-app/pytorch_model.bin
tokenizer config file saved in task1/models/distilbert-base-uncased-finetuned-app/tokenizer_config.json
Special tokens file saved in task1/models/distilbert-base-uncased-finetuned-app/special_tokens_map.json


# sberbank-ai/ruRoberta-large

In [353]:
# from sklearn.model_selection import train_test_split
# # ner_data = [extract_labels(item) for item in drugs]
# ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)

In [354]:
# from datasets import load_dataset, load_metric
# from datasets import Dataset, DatasetDict

In [355]:
# ner_dataset = DatasetDict({
#     'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
#     'test': Dataset.from_pandas(pd.DataFrame(ner_test))
# })
# ner_dataset

In [408]:
from transformers import AutoTokenizer
model_checkpoint = "sberbank-ai/ruRoberta-large"
result_name = "sberbank-ai-ruRoberta-large"

# model_checkpoint = "liaad/srl-en_xlmr-large"
# result_name = "liaad-srl-en_xlmr-large"

batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--sberbank-ai--ruRoberta-large/snapshots/29b46edec511391c384dfd0bbd3892cb72495c5f/config.json
Model config RobertaConfig {
  "_name_or_path": "sberbank-ai/ruRoberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 5026

In [409]:
tokenizer("Hello, this is one sentence!")

{'input_ids': [1, 3449, 9691, 83, 16, 34952, 16710, 38237, 2466, 4271, 15922, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [398]:
example = ner_dataset['train'][6]
print(example["tokens"])

['when', 'sony', 'announced', 'at', 'e3', 'how', 'much', 'better', 'it', 'was', 'than', 'microsoft', ',', 'it', 'reaffirmed', 'that', 'the', 'ps4', 'would', 'be', 'region', 'free', '.']


In [407]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
PRE_WORD = 'Ġ' # '##'
# PRE_WORD = '_'
SOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
print(tokens)

['<pad>', 'Ð»ÐµÐ½Ð°', 'ĠÐ·Ð°Ð¿', 'Q', None, '¡', '8', 'Ð½Ð°', 'ĠÐ¿ÑĢÐ¾ÑģÑĤÑĢÐ°Ð½', 'ĠÐ²Ð¸Ð´ÐµÐ»Ð¸', 'ÑĥÐ±Ð»Ð¸Ñĩ', 'ÐºÐ¾', 'Ð¾Ð»ÑĮÐºÐ¾', 'ĠÑģÐ¼Ð¾ÑĤÑĢÐµÐ»', '"', None, '"', '<mask>', 'ÐºÐ¾', 'Ð°ÐµÑĤ', None, 'F', 'ĠÐ³Ð¾Ð²Ð¾ÑĢÐ¸Ð»Ð°', 'ĠÐ¾Ð½', 'b', 'ĠÐ³Ð»Ð¸Ð½Ñı', 'ĠÐĹ', 'ÂłÐ£', 'ù', 'ĠÐ²ÑĭÐºÑĢÐ¸Ðº', 'Ð¸Ð»Ð¾ÑģÐ¾ÑĦ', '"', '!', '</s>']


In [418]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [419]:
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1867 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/467 [00:00<?, ? examples/s]

In [420]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--sberbank-ai--ruRoberta-large/snapshots/29b46edec511391c384dfd0bbd3892cb72495c5f/config.json
Model config RobertaConfig {
  "_name_or_path": "sberbank-ai/ruRoberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num

In [421]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

PyTorch: setting up devices


In [422]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [423]:
metric = load_metric("seqeval")

In [424]:
example = ner_dataset['train'][4]
labels = example['tags']
metric.compute(predictions=[labels], references=[labels])

{'Aspect': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Object': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Predicate': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [425]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [426]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, tags, __index_level_0__. If tokens, tags, __index_level_0__ are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


ZeroDivisionError: division by zero

In [None]:
# разморозка
for param in model.parameters():
    param.requires_grad = True

In [None]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.05,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [390]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, tags, __index_level_0__. If tokens, tags, __index_level_0__ are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16


{'eval_loss': 0.2424028217792511,
 'eval_precision': 0.855748833262622,
 'eval_recall': 0.8905077262693156,
 'eval_f1': 0.8727823453050628,
 'eval_accuracy': 0.9490169943352216,
 'eval_runtime': 1.8615,
 'eval_samples_per_second': 250.868,
 'eval_steps_per_second': 16.116,
 'epoch': 10.0}

In [391]:
test_data = read_dataset("task1/dev_no_answers.tsv", splitter="\n")
with open(f"task1/{result_name}.tsv", "w") as f:
    with torch.no_grad():
        for sentence in tqdm(test_data):
            # print(sentence[0])
            prediction = predict_sentence(sentence[0])
            for w,t in zip(sentence[0], prediction):
                # print(w, '\t', t)
                f.write(w+'\t'+t+'\n')
            f.write('\n')

100%|██████████| 283/283 [00:09<00:00, 29.73it/s]


In [394]:
fix_bio_in_file(f"task1/{result_name}.tsv", f"task1/{result_name}_fix.tsv")

correct B->I
correct B->I
correct B->I
correct B->I
correct I->B
former I-Object O
former B-Object O
correct I->B
nearly I-Aspect O
nearly B-Aspect O
correct B->I
correct I->B
is I-Aspect O
is B-Aspect O
correct I->B
carolina I-Object O
carolina B-Object O
correct B->I
correct I->B
normal I-Predicate O
normal B-Predicate O
correct B->I
correct I->B
than I-Aspect O
than B-Aspect O
correct I->B
rights I-Aspect O
rights B-Aspect O
correct I->B
mitigating I-Aspect O
mitigating B-Aspect O
correct I->B
carolina I-Object O
carolina B-Object O
correct I->B
than I-Aspect O
than B-Aspect O
correct I->B
at I-Object O
at B-Object O
correct I->B
unemployment I-Aspect O
unemployment B-Aspect O
correct I->B
my I-Aspect O
my B-Aspect O
correct B->I
correct B->I
correct B->I
correct I->B
, I-Object O
, B-Object O
correct I->B
is I-Object O
is B-Object O
correct I->B
due I-Aspect O
due B-Aspect O
correct I->B
still I-Object O
still B-Object O
correct I->B
is I-Object O
is B-Object O
correct I->B
timber 

In [395]:
main('task1/dev.tsv', f'task1/{result_name}_fix.tsv', f'task1/{result_name}_report.txt')

In [86]:
def read_res_file(fn):
    words = []
    tags = []
    with open(fn, "r") as f:
        for line in f:
            line = line.strip("\r\n").split("\t")
            if len(line) > 1:
                words.append(line[0])
                tags.append(line[1])
            else:
                words.append(line[0])
                tags.append('')
    return words, tags
    
    
words, tags1 = read_res_file("task1/cointegrated-rubert-tiny2-fix.tsv")
words, tags2 = read_res_file("task1/out_test_roberta_large_fix.tsv")

In [89]:
label2id = {v:k for k,v in enumerate(label_list)}
label2id

{'O': 0,
 'B-Object': 1,
 'I-Object': 2,
 'B-Predicate': 3,
 'B-Aspect': 4,
 'I-Predicate': 5,
 'I-Aspect': 6}

In [94]:
with open("task1/majority-class.tsv", "w") as f:
    for w, t1, t2 in zip(words, tags1, tags2):
        if w != '':
            token_win = max(label2id[t1], label2id[t2])
            print(w, t1, t2, '-->', label_list[token_win])
            f.write(w+"\t"+label_list[token_win]+"\n")
        else:
            f.write("\n")
            print('empty line')

meanwhile O O --> O
, O O --> O
though O O --> O
windows O B-Object --> B-Object
8 O O --> O
is O O --> O
significantly O O --> O
at O O --> O
greater B-Predicate B-Predicate --> B-Predicate
risk B-Aspect B-Aspect --> B-Aspect
( O O --> O
1 O O --> O
. O O --> O
73 O O --> O
percent O O --> O
) O O --> O
compared O O --> O
to O O --> O
windows O B-Object --> B-Object
8 O O --> O
. O O --> O
1 O O --> O
, O O --> O
according O O --> O
to O O --> O
redmond O O --> O
' O O --> O
s O O --> O
report O O --> O
, O O --> O
it O O --> O
' O O --> O
s O O --> O
still O O --> O
significantly O O --> O
safer B-Predicate B-Predicate --> B-Predicate
than O O --> O
windows B-Object B-Object --> B-Object
7 O O --> O
, O O --> O
windows B-Object B-Object --> B-Object
xp I-Object O --> I-Object
, O O --> O
or O O --> O
windows B-Object B-Object --> B-Object
vista O O --> O
. O O --> O
empty line
windows O B-Object --> B-Object
7 O O --> O
is O O --> O
still O O --> O
going O O --> O
strong O O --> O
ev

In [None]:
trainer.save_model("path/to/model")

In [150]:
!zip out.zip task1/out_test_roberta_large.tsv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: zip: command not found
