# cointegrated/rubert-tiny2

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm
from sklearn.metrics import f1_score

In [2]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [3]:
def read_dataset(filename, splitter="\t"):
    data = []
    sentence = []
    tags = []
    with open(filename) as f:
        for line in f:
            if not line.isspace():
                word, tag = line.split(splitter)
                sentence.append(word)
                tags.append(tag.strip())
            else:
                data.append((sentence, tags))
                sentence = []
                tags = []
    return data

In [4]:
training_data = read_dataset("task1/train.tsv")

In [5]:
import pandas as pd
ner_data = pd.DataFrame(training_data, columns=['tokens', 'tags'])

In [6]:
ner_data

Unnamed: 0,tokens,tags
0,"[also, ,, i, have, recently, discovered, advil...","[O, O, O, O, O, O, B-Object, O, O, O, B-Predic..."
1,"[i, have, always, heard, that, motrin, is, bet...","[O, O, O, O, O, B-Object, O, B-Predicate, O, B..."
2,"[when, i, was, a, figure, skater, i, injuried,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[in, a, way, ,, halloween, is, even, better, t...","[O, O, O, O, B-Object, O, O, B-Predicate, O, B..."
4,"[i, think, halloween, is, actually, safer, tha...","[O, O, B-Object, O, O, B-Predicate, O, O, O, B..."
...,...,...
2329,"[isn, ', t, plastic, safer, than, wood, .]","[O, O, O, B-Object, B-Predicate, O, B-Object, O]"
2330,"[and, plastic, pallets, are, safer, than, wood...","[O, B-Object, O, O, B-Predicate, O, B-Object, ..."
2331,"[plastic, laminate, flooring, is, one, of, the...","[B-Object, O, B-Aspect, O, O, O, O, O, O, O, O..."
2332,"[plastic, has, long, been, considered, superio...","[B-Object, O, O, O, O, B-Predicate, O, B-Objec..."


In [7]:
label_list = []
for item in ner_data['tags']:
    label_list.extend(item)
label_list = list(set(label_list))
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list
label_list

['O',
 'I-Object',
 'I-Aspect',
 'I-Predicate',
 'B-Object',
 'B-Predicate',
 'B-Aspect']

In [8]:
from sklearn.model_selection import train_test_split
# ner_data = [extract_labels(item) for item in drugs]
ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)

In [9]:
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict

In [10]:
ner_dataset = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 1867
    })
    test: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 467
    })
})

In [12]:
from transformers import AutoTokenizer
model_checkpoint = "cointegrated/rubert-tiny2"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=False)

In [13]:
tokenizer("Hello, this is one sentence!")

{'input_ids': [2, 9944, 16, 881, 550, 835, 15503, 5, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [35]:
i = 14

In [73]:
i = i + 1
example = ner_dataset['test'][i]
print(example["tokens"])

['as', 'scala', 'reduces', 'a', 'lot', 'of', 'code', 'overhead', 'from', 'other', 'languages', 'it', 'might', 'even', 'be', 'easier', 'to', 'learn', 'concepts', 'with', 'scala', 'than', 'with', 'java', '/', 'c', '++.']


In [15]:
# from transformers import RobertaTokenizerFast
# tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
# tokenizer("Hello world")['input_ids']
# tokenizer(" Hello world")['input_ids']

In [77]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True,  return_offsets_mapping=True, )
print(tokenized_input)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
PRE_WORD = '##'
SOS_TOKEN = "[CLS]"
EOS_TOKEN = "[SEP]"
print(tokens)

{'input_ids': [2, 571, 20031, 9480, 533, 68, 5680, 534, 4354, 856, 5894, 610, 979, 6341, 683, 6167, 2513, 747, 23667, 540, 13645, 16031, 594, 20031, 1236, 594, 622, 759, 19, 70, 15, 15, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 2), (0, 5), (0, 6), (6, 7), (0, 1), (0, 3), (0, 2), (0, 4), (0, 4), (4, 8), (0, 4), (0, 5), (0, 9), (0, 2), (0, 5), (0, 4), (0, 2), (0, 6), (0, 2), (0, 5), (0, 8), (0, 4), (0, 5), (0, 4), (0, 4), (0, 2), (2, 4), (0, 1), (0, 1), (0, 1), (1, 2), (2, 3), (0, 0)]}
['[CLS]', 'as', 'scala', 'reduce', '##s', 'a', 'lot', 'of', 'code', 'over', '##head', 'from', 'other', 'languages', 'it', 'might', 'even', 'be', 'easier', 'to', 'learn', 'concepts', 'with', 'scala', 'than', 'with', 'ja', '##va', '/', 'c', '+', '+', '.', '[SEP]']


In [78]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [79]:
tokenize_and_align_labels(ner_dataset['train'][22:23])

{'input_ids': [[2, 537, 13195, 17645, 844, 25194, 2051, 5350, 17, 8, 833, 602, 9243, 602, 1136, 17, 550, 4014, 555, 537, 675, 18312, 1236, 16931, 1622, 2365, 11, 86, 3069, 600, 3873, 1589, 8926, 647, 746, 3938, 2881, 13161, 5441, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 4, 0, 0, 0, 6, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, -100, 0, 5, -100, 0, 4, -100, -100, 0, 0, 0, 0, 0, 6, -100, -100, 0, 0, 6, -100, 2, 0, -100]]}

In [80]:
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_datasets['train'][0]

Map:   0%|          | 0/1867 [00:00<?, ? examples/s]

Map:   0%|          | 0/467 [00:00<?, ? examples/s]

{'tokens': ['i',
  'also',
  'preferred',
  'the',
  'psp',
  'controls',
  'to',
  'the',
  'wii',
  'controls',
  ',',
  'since',
  'it',
  'was',
  'pretty',
  'much',
  'identical',
  'to',
  'the',
  'original',
  'ps2',
  'release',
  '.'],
 'tags': ['O',
  'O',
  'O',
  'O',
  'B-Object',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Predicate',
  'O',
  'O',
  'O',
  'B-Object',
  'O',
  'O'],
 '__index_level_0__': 485,
 'input_ids': [2,
  76,
  772,
  18513,
  531,
  26945,
  795,
  18426,
  540,
  531,
  6989,
  542,
  18426,
  16,
  1682,
  683,
  560,
  29049,
  2476,
  17932,
  540,
  531,
  1424,
  26945,
  1012,
  2959,
  18,
  3],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [81]:
label_list

['O',
 'I-Object',
 'I-Aspect',
 'I-Predicate',
 'B-Object',
 'B-Predicate',
 'B-Aspect']

In [82]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized 

In [83]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

In [84]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [85]:
metric = load_metric("seqeval")

  """Entry point for launching an IPython kernel.


In [86]:
example = ner_dataset['train'][4]
labels = example['tags']
metric.compute(predictions=[labels], references=[labels])

{'Aspect': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Object': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Predicate': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [87]:
import numpy as np

def compute_metrics(p):
    predictions, labels, inputs = p.predictions, p.label_ids, p.inputs
    predictions = np.argmax(p.predictions, axis=2)

    # send only the first token of each word to the evaluation
    true_predictions = []
    true_labels = []
    for prediction, label, tokens in zip(predictions, labels, inputs):
        true_predictions.append([])
        true_labels.append([])
        # print(prediction, label, tokens)
        for (p, l, t) in zip(prediction, label, tokens):
            # print(l, p, tokenizer.convert_ids_to_tokens(int(t)))
            if l != -100 and not tokenizer.convert_ids_to_tokens(int(t)).startswith(PRE_WORD):
                # print('append')
                true_predictions[-1].append(label_list[p])
                true_labels[-1].append(label_list[l])
    # print(true_predictions, true_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [88]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [89]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 2.2178430557250977,
 'eval_precision': 0.047325679081489776,
 'eval_recall': 0.14922737306843267,
 'eval_f1': 0.07186137982353566,
 'eval_accuracy': 0.055314895034988334,
 'eval_runtime': 1.103,
 'eval_samples_per_second': 423.385,
 'eval_steps_per_second': 27.198}

In [90]:
# разморозка
for param in model.parameters():
    param.requires_grad = True

In [91]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

PyTorch: setting up devices


In [92]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [93]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1867
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2340
  Number of trainable parameters = 29098303


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.630375,0.775281,0.060927,0.112976,0.800816
2,No log,0.438474,0.721236,0.556291,0.628116,0.867544
3,No log,0.372815,0.718182,0.697572,0.707727,0.89037
4,No log,0.338667,0.745389,0.731567,0.738414,0.899783
5,0.609000,0.31674,0.766712,0.744371,0.755376,0.906365
6,0.609000,0.298807,0.779144,0.755408,0.767093,0.911696
7,0.609000,0.289747,0.771316,0.778808,0.775044,0.914279
8,0.609000,0.285354,0.756281,0.797351,0.776273,0.915362
9,0.309200,0.273402,0.781826,0.786313,0.784063,0.91886
10,0.309200,0.268957,0.777298,0.795143,0.78612,0.919943


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `BertForTokenClas

TrainOutput(global_step=2340, training_loss=0.34122982188167733, metrics={'train_runtime': 34.0532, 'train_samples_per_second': 1096.518, 'train_steps_per_second': 68.716, 'total_flos': 32565434271084.0, 'train_loss': 0.34122982188167733, 'epoch': 20.0})

In [94]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16


{'eval_loss': 0.25202637910842896,
 'eval_precision': 0.7758037225042301,
 'eval_recall': 0.8097130242825608,
 'eval_f1': 0.7923957658241522,
 'eval_accuracy': 0.9224425191602799,
 'eval_runtime': 0.3318,
 'eval_samples_per_second': 1407.466,
 'eval_steps_per_second': 90.415,
 'epoch': 20.0}

In [95]:
test_data = read_dataset("task1/dev_no_answers.tsv", splitter="\n")

In [96]:
i_pred = [3, 4, 6]
print(label_list[i_pred[2]])

B-Aspect


In [195]:
def predict_sentence(sentence):
    inputs = tokenizer(sentence,
                        is_split_into_words=True, 
                        return_offsets_mapping=True, 
                        padding='max_length', 
                        truncation=True, 
                        return_tensors="pt")
    # move to gpu
    ids = inputs["input_ids"].to('cuda:0')
    mask = inputs["attention_mask"].to('cuda:0')
    # forward pass
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [label_list[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

    prediction = []
    # print(inputs["offset_mapping"].squeeze().tolist())
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
      #only predictions on first word pieces are important
        if mapping[0] == 0 and mapping[1] != 0:
            prediction.append(token_pred[1])
        else:
            continue
    return prediction

sentence = ner_dataset['train'][15]['tokens'] #"@HuggingFace is a company based in New York, but is also has employees working in Paris"
print(sentence)
print(predict_sentence(sentence))

print(test_data[2][0])
print(predict_sentence(test_data[2][0]))


['(', 'of', 'course', ',', 'fox', 'may', 'be', 'even', 'worse', 'than', 'cnn', '.)']
['O', 'O', 'O', 'O', 'B-Object', 'O', 'O', 'O', 'B-Predicate', 'O', 'B-Object', 'O']
['i', 'have', 'tried', 'windows', '8', 'and', 'it', "'", 's', 'lighter', 'than', 'windows', 'xp', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Predicate', 'O', 'B-Object', 'B-Object', 'O']


In [199]:
with open("task1/cointegrated-rubert-tiny2.tsv", "w") as f:
    with torch.no_grad():
        for sentence in tqdm(test_data):
            # print(sentence[0])
            prediction = predict_sentence(sentence[0])
            for w,t in zip(sentence[0], prediction):
                # print(w, '\t', t)
                f.write(w+'\t'+t+'\n')
            f.write('\n')
            # # print(sentence[0])
            # tokens = tokenizer(sentence[0], is_split_into_words=True, return_tensors='pt').to(model.device)
            # pred = model(**tokens)
            # indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()
            # token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
            # print((indices), (token_text))
            # word_buffer  = ''
            # token_buffer = 0
            # current_token = 0
            # for token, idx in zip(token_text, indices):
            #     if idx  in i_pred:
            #         print("---> ", label_list[idx])
            #     # print(token, idx)
            #     if token == SOS_TOKEN:
            #         continue
            #     if word_buffer != '' and not token.startswith(PRE_WORD):
            #         print(word_buffer + "\t" + label_list[token_buffer])
            #         f.write(word_buffer + "\t" + label_list[token_buffer] + "\n")
            #         current_token = token_buffer
            #         word_buffer = ''
            #         token_buffer = 0
            #     if token == EOS_TOKEN:
            #         continue
            #         # print(wb[1:] + "\t" + label_list[tb])
            #         # f.write(word_buffer + "\t" + label_list[token_buffer] + "\n")
            #     if token.startswith(PRE_WORD):
            #         word_buffer += token[2:]
            #     else:
            #         word_buffer += token
            #     if token_buffer == 0:
            #         token_buffer = idx
            # print("\n")    
            # f.write("\n")
                # print(f'{t:15s} {label_list[idx]:10s}')
            # inputs = tokenizer(prepare_sequence(sentence[0], word_to_ix)
            # tag_scores = model(inputs)
            # tags = [idx_to_tag[int(i)] for i in tag_scores.argmax(dim=-1)]
            # for i, y in zip(sentence[0], tags):
            #     w.write(f"{i}\t{y}\n")
            # w.write("\n")

100%|██████████| 283/283 [00:02<00:00, 96.23it/s] 


In [200]:
previos_bio = 'O'

with open("task1/cointegrated-rubert-tiny2.tsv", "r") as f:
    with open("task1/cointegrated-rubert-tiny2-fix.tsv", "w") as fw:
        for line in f:
            line = line.strip('\r\n').split('\t')
            if len(line) > 1:
                word, bio = line
                current_bio = bio
                if current_bio[1:] == previos_bio[1:] and (previos_bio[:1] == 'B' or previos_bio[:1] == 'I'):
                    current_bio = 'I'+current_bio[1:]
                fw.write(word + '\t'+current_bio+'\n')
                previos_bio = current_bio
            else:
                fw.write("\n")
            

In [150]:
!zip out.zip task1/out_test_roberta_large.tsv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: zip: command not found


# sberbank-ai/ruRoberta-large

In [201]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [202]:
def read_dataset(filename, splitter="\t"):
    data = []
    sentence = []
    tags = []
    with open(filename) as f:
        for line in f:
            if not line.isspace():
                word, tag = line.split(splitter)
                sentence.append(word)
                tags.append(tag.strip())
            else:
                data.append((sentence, tags))
                sentence = []
                tags = []
    return data

In [203]:
training_data = read_dataset("task1/train.tsv")

In [204]:
import pandas as pd
ner_data = pd.DataFrame(training_data, columns=['tokens', 'tags'])

In [205]:
ner_data

Unnamed: 0,tokens,tags
0,"[also, ,, i, have, recently, discovered, advil...","[O, O, O, O, O, O, B-Object, O, O, O, B-Predic..."
1,"[i, have, always, heard, that, motrin, is, bet...","[O, O, O, O, O, B-Object, O, B-Predicate, O, B..."
2,"[when, i, was, a, figure, skater, i, injuried,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[in, a, way, ,, halloween, is, even, better, t...","[O, O, O, O, B-Object, O, O, B-Predicate, O, B..."
4,"[i, think, halloween, is, actually, safer, tha...","[O, O, B-Object, O, O, B-Predicate, O, O, O, B..."
...,...,...
2329,"[isn, ', t, plastic, safer, than, wood, .]","[O, O, O, B-Object, B-Predicate, O, B-Object, O]"
2330,"[and, plastic, pallets, are, safer, than, wood...","[O, B-Object, O, O, B-Predicate, O, B-Object, ..."
2331,"[plastic, laminate, flooring, is, one, of, the...","[B-Object, O, B-Aspect, O, O, O, O, O, O, O, O..."
2332,"[plastic, has, long, been, considered, superio...","[B-Object, O, O, O, O, B-Predicate, O, B-Objec..."


In [206]:
label_list = []
for item in ner_data['tags']:
    label_list.extend(item)
label_list = list(set(label_list))
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list
label_list

['O',
 'I-Object',
 'I-Aspect',
 'I-Predicate',
 'B-Object',
 'B-Predicate',
 'B-Aspect']

In [207]:
from sklearn.model_selection import train_test_split
# ner_data = [extract_labels(item) for item in drugs]
ner_train, ner_test = train_test_split(ner_data, test_size=0.2, random_state=1)

In [208]:
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict

In [209]:
ner_dataset = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 1867
    })
    test: Dataset({
        features: ['tokens', 'tags', '__index_level_0__'],
        num_rows: 467
    })
})

In [210]:
from transformers import AutoTokenizer
# model_checkpoint = "cointegrated/rubert-tiny2"
model_checkpoint = "sberbank-ai/ruRoberta-large"
# model_checkpoint = "liaad/srl-en_xlmr-large"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--sberbank-ai--ruRoberta-large/snapshots/29b46edec511391c384dfd0bbd3892cb72495c5f/config.json
Model config RobertaConfig {
  "_name_or_path": "sberbank-ai/ruRoberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 5026

In [211]:
tokenizer("Hello, this is one sentence!")

{'input_ids': [1, 3449, 9691, 83, 16, 34952, 16710, 38237, 2466, 4271, 15922, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [212]:
example = ner_dataset['train'][6]
print(example["tokens"])

['when', 'sony', 'announced', 'at', 'e3', 'how', 'much', 'better', 'it', 'was', 'than', 'microsoft', ',', 'it', 'reaffirmed', 'that', 'the', 'ps4', 'would', 'be', 'region', 'free', '.']


In [213]:
# from transformers import RobertaTokenizerFast
# tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
# tokenizer("Hello world")['input_ids']
# tokenizer(" Hello world")['input_ids']

In [214]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
PRE_WORD = 'Ġ' # '##'
# PRE_WORD = '_'
SOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
print(tokens)

['<s>', 'Ġwh', 'en', 'Ġs', 'ony', 'Ġan', 'n', 'ou', 'n', 'c', 'ed', 'Ġat', 'Ġe', '3', 'Ġh', 'ow', 'Ġm', 'uch', 'Ġb', 'et', 'ter', 'Ġit', 'Ġwas', 'Ġth', 'an', 'Ġm', 'ic', 'rosoft', 'Ġ,', 'Ġit', 'Ġre', 'a', 'ff', 'ir', 'm', 'ed', 'Ġthat', 'Ġthe', 'Ġp', 's', '4', 'Ġwould', 'Ġbe', 'Ġre', 'g', 'ion', 'Ġf', 'ree', 'Ġ.', '</s>']


In [215]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [216]:
tokenize_and_align_labels(ner_dataset['train'][22:23])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[1, 4810, 6340, 14519, 4118, 3267, 5694, 49019, 14753, 87, 20498, 3893, 15808, 26514, 21145, 2595, 3633, 406, 7061, 4996, 33684, 33958, 1188, 33684, 10623, 15446, 406, 16710, 2466, 4458, 20433, 1188, 4810, 3157, 1892, 7420, 1188, 7064, 1429, 3525, 2595, 31963, 8442, 2466, 4179, 74, 18844, 7822, 42491, 5747, 73, 4394, 2595, 11535, 3633, 28347, 2466, 23744, 91, 15964, 225, 14076, 4140, 24520, 22300, 75, 19405, 87, 3026, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 4, -100, 0, -100, -100, -100, 0, -100, 0, -100, -100, -100, 6, -100, -100, 0, 0, 0, 0, 0, -100, 0, 0, -100, 0, 0, 5, -100, -100, -100, 0, 5, -100, -100, -100, 0, -100, 4, -100, -100, 0, 0, 0, -100, -100, 0, 0, -100, -100, 6, -100, -100, -100, 0, 0, -100, -100, -100, 6, -100, -100, -100, 2, -100, -100, -1

In [217]:
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1867 [00:00<?, ? examples/s]

Map:   0%|          | 0/467 [00:00<?, ? examples/s]

In [218]:
label_list

['O',
 'I-Object',
 'I-Aspect',
 'I-Predicate',
 'B-Object',
 'B-Predicate',
 'B-Aspect']

In [219]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--sberbank-ai--ruRoberta-large/snapshots/29b46edec511391c384dfd0bbd3892cb72495c5f/config.json
Model config RobertaConfig {
  "_name_or_path": "sberbank-ai/ruRoberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num

In [220]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

PyTorch: setting up devices


In [221]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [222]:
metric = load_metric("seqeval")

In [223]:
example = ner_dataset['train'][4]
labels = example['tags']
metric.compute(predictions=[labels], references=[labels])

{'Aspect': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Object': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Predicate': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [224]:
import numpy as np

def compute_metrics(p):
    predictions, labels, inputs = p.predictions, p.label_ids, p.inputs
    predictions = np.argmax(p.predictions, axis=2)

    # send only the first token of each word to the evaluation
    true_predictions = []
    true_labels = []
    for prediction, label, tokens in zip(predictions, labels, inputs):
        true_predictions.append([])
        true_labels.append([])
        # print(prediction, label, tokens)
        for (p, l, t) in zip(prediction, label, tokens):
            # print(l, p, tokenizer.convert_ids_to_tokens(int(t)))
            if l != -100 and tokenizer.convert_ids_to_tokens(int(t)).startswith(PRE_WORD):
                # print('append')
                true_predictions[-1].append(label_list[p])
                true_labels[-1].append(label_list[l])
    # print(true_predictions, true_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [225]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [226]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.6836447715759277,
 'eval_precision': 0.05716265373289451,
 'eval_recall': 0.1456953642384106,
 'eval_f1': 0.08210997760636975,
 'eval_accuracy': 0.33063978673775407,
 'eval_runtime': 2.6374,
 'eval_samples_per_second': 177.069,
 'eval_steps_per_second': 11.375}

In [227]:
# разморозка
for param in model.parameters():
    param.requires_grad = True

In [228]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    include_inputs_for_metrics=True,
)

PyTorch: setting up devices


In [229]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [230]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1867
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2340
  Number of trainable parameters = 354317319


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.19654,0.816631,0.845475,0.830803,0.934439
2,No log,0.171772,0.835448,0.871965,0.853316,0.939687
3,No log,0.171554,0.84599,0.875497,0.86049,0.943685
4,No log,0.193692,0.813897,0.884327,0.847651,0.936688
5,0.174200,0.192969,0.845601,0.882561,0.863685,0.945685
6,0.174200,0.209315,0.838994,0.883444,0.860645,0.943519
7,0.174200,0.233944,0.864133,0.873289,0.868687,0.947018
8,0.174200,0.251467,0.826069,0.886976,0.85544,0.942019
9,0.047500,0.268532,0.832849,0.886534,0.858854,0.943269
10,0.047500,0.296044,0.837034,0.882119,0.858985,0.944269


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `R

TrainOutput(global_step=2340, training_loss=0.05253812930522821, metrics={'train_runtime': 673.7028, 'train_samples_per_second': 55.425, 'train_steps_per_second': 3.473, 'total_flos': 7060346142497208.0, 'train_loss': 0.05253812930522821, 'epoch': 20.0})

In [234]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: __index_level_0__, tokens, tags. If __index_level_0__, tokens, tags are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16


{'eval_loss': 0.3769207000732422,
 'eval_precision': 0.8477523324851569,
 'eval_recall': 0.882560706401766,
 'eval_f1': 0.864806402768765,
 'eval_accuracy': 0.946767744085305,
 'eval_runtime': 2.6211,
 'eval_samples_per_second': 178.168,
 'eval_steps_per_second': 11.445,
 'epoch': 20.0}

In [235]:
test_data = read_dataset("task1/dev_no_answers.tsv", splitter="\n")

In [236]:
with open("task1/out_test_roberta_large.tsv", "w") as f:
    with torch.no_grad():
        for sentence in tqdm(test_data):
            # print(sentence[0])
            prediction = predict_sentence(sentence[0])
            for w,t in zip(sentence[0], prediction):
                # print(w, '\t', t)
                f.write(w+'\t'+t+'\n')
            f.write('\n')
#     with torch.no_grad():
#         for sentence in tqdm(test_data):
#             # print(sentence[0])
#             tokens = tokenizer(sentence[0], is_split_into_words=True, return_tensors='pt').to(model.device)
#             pred = model(**tokens)
#             indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()
#             token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
#             # print((indices), (token_text))
#             wb = ''
#             tb = 0
#             for t, idx in zip(token_text, indices):
#                 # print(t, idx)
#                 if t == SOS_TOKEN:
#                     continue
#                 if wb != '' and t.startswith(PRE_WORD):
#                     # print(wb[1:] + "\t" + label_list[tb])
#                     f.write(wb[1:] + "\t" + label_list[tb] + "\n")
#                     wb = ''
#                     tb = 0
#                 if t == EOS_TOKEN:
#                     # print(wb[1:] + "\t" + label_list[tb])
#                     f.write(wb[1:] + "\t" + label_list[tb] + "\n")
#                 wb += t
#                 if tb == 0:
#                     tb = idx
                
#             f.write("\n")
#                 # print(f'{t:15s} {label_list[idx]:10s}')
#             # inputs = tokenizer(prepare_sequence(sentence[0], word_to_ix)
#             # tag_scores = model(inputs)
#             # tags = [idx_to_tag[int(i)] for i in tag_scores.argmax(dim=-1)]
#             # for i, y in zip(sentence[0], tags):
#             #     w.write(f"{i}\t{y}\n")
#             # w.write("\n")

100%|██████████| 283/283 [00:03<00:00, 90.55it/s]


In [264]:
words = []
tags = []
prev_tags = []
with open("task1/out_test_roberta_large.tsv", "r") as f:
    for line in f:
        line = line.strip("\r\n").split("\t")
        if len(line) > 1:
            words.append(line[0])
            tags.append(line[1])
        else:
            words.append(line[0])
            tags.append('')

with open("task1/out_test_roberta_large_fix.tsv", "w") as f:
    prev_tags = tags.copy()
    prev_tags.insert(0, 'O')
    # print(prev_tags)
    for w, t, p in zip(words, tags, prev_tags):
        # print(w,t,p)
        if p.startswith('O') and t.startswith('I'):
            print("correct I->B")
            print(w, t, p)
            t = 'B' + t[1:]
            print(w, t, p)
        if t.startswith('B') and t == p:
            print("correct B->I")
            t = 'I' + t[1:]
        if w == '':
            f.write('\n')
        else:
            f.write(w + '\t'+ t + '\n')



correct I->B
greater I-Predicate O
greater B-Predicate O
correct B->I
correct B->I
correct B->I
correct I->B
drops I-Aspect O
drops B-Aspect O
correct B->I
correct I->B
out I-Aspect O
out B-Aspect O
correct I->B
of I-Aspect O
of B-Aspect O
correct I->B
carolina I-Object O
carolina B-Object O
correct B->I
correct B->I
correct B->I
correct B->I
correct I->B
prices I-Aspect O
prices B-Aspect O
correct B->I
correct I->B
email I-Aspect O
email B-Aspect O
correct B->I
correct I->B
ipad I-Object O
ipad B-Object O
correct I->B
fast I-Predicate O
fast B-Predicate O
correct B->I
correct I->B
faster I-Predicate O
faster B-Predicate O
correct B->I
correct I->B
erection I-Aspect O
erection B-Aspect O
correct I->B
- I-Object O
- B-Object O
correct B->I
correct I->B
drive I-Predicate O
drive B-Predicate O
correct B->I
correct I->B
starliner I-Object O
starliner B-Object O
correct B->I
correct B->I
correct I->B
money I-Aspect O
money B-Aspect O
correct I->B
shoe I-Aspect O
shoe B-Aspect O
correct B->I

In [150]:
!zip out.zip task1/out_test_roberta_large.tsv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: zip: command not found
