# Data creation

In [None]:
!wget https://raw.githubusercontent.com/skoltech-nlp/semantic-role-labelling/main/train.tsv

--2022-07-17 11:22:24--  https://raw.githubusercontent.com/skoltech-nlp/semantic-role-labelling/main/train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 599430 (585K) [text/plain]
Saving to: ‘train.tsv’


2022-07-17 11:22:24 (14.2 MB/s) - ‘train.tsv’ saved [599430/599430]



In [None]:
def make_dataset(filename):
  """Read a two-column ``token<TAB>tag`` file into a list of sentences.

  Sentences are separated by blank (or whitespace-only) lines. Each sentence
  is returned as ``{'tokens': [...], 'tags': [...]}`` with both lists aligned
  position by position.

  Args:
    filename: path to the UTF-8 encoded TSV file.

  Returns:
    A list of sentence dicts, in file order. Empty sentences (produced by
    consecutive blank lines) are skipped, and the final sentence is kept even
    when the file does not end with a blank line.

  Raises:
    ValueError: if a non-blank line does not contain exactly one tab.
  """
  train_dataset = []
  tokens, tags = [], []
  with open(filename, 'r', encoding='utf-8') as f:
    # Iterate lazily instead of materialising the whole file with readlines().
    for line in f:
      stripped = line.strip()
      if stripped:
        token, tag = stripped.split('\t')
        tokens.append(token)
        tags.append(tag)
      elif tokens:
        train_dataset.append({'tokens': tokens, 'tags': tags})
        tokens, tags = [], []
  # Flush the last sentence when the file has no trailing blank line.
  if tokens:
    train_dataset.append({'tokens': tokens, 'tags': tags})
  return train_dataset

In [None]:
# Parse the downloaded training file into a list of {'tokens', 'tags'} sentence dicts.
train_dataset = make_dataset('train.tsv')

# Dataset

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer
import numpy as np
import pandas as pd

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Hold out 20% of the sentences for evaluation; the fixed random_state keeps
# the split identical across runs.
train, test = train_test_split(train_dataset, test_size=0.2, random_state=77)

In [None]:
# Wrap both splits as datasets.Dataset objects (via a DataFrame built from the
# list of sentence dicts) so they can be processed with DatasetDict.map.
data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(train)),
    'test': Dataset.from_pandas(pd.DataFrame(test))
})

In [None]:
# Sanity check: show the column names and the number of rows in each split.
data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1867
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 467
    })
})

In [None]:
# add_prefix_space=True because the sentences are fed to the tokenizer
# pre-split into words (is_split_into_words=True in tokenize_and_align_labels).
tokenizer = AutoTokenizer.from_pretrained('roberta-base', add_prefix_space=True)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
label_list = ['O', 'B-Object', 'I-Object', 'B-Aspect', 'I-Aspect', 'B-Predicate', 'I-Predicate']
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize pre-split sentences and align word-level tags with sub-tokens.

    Args:
        examples: a batch with ``"tokens"`` (list of word lists) and ``'tags'``
            (list of per-word tag lists, as strings from ``label_list`` or as
            integer ids).
        label_all_tokens: when true, every sub-token of a word receives the
            word's label; otherwise only the first sub-token does and the
            remaining ones get -100.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding one list
        of integer label ids per sentence. Positions labelled -100 (special
        tokens and, by default, non-initial sub-tokens) are meant to be
        ignored by the loss.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples['tags']):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Special token: no source word behind it.
                row_labels.append(-100)
            elif word != last_word or label_all_tokens:
                # First sub-token of a word, or any sub-token when every
                # piece is to be labelled.
                row_labels.append(word_tags[word])
            else:
                # Continuation sub-token that should not be scored.
                row_labels.append(-100)
            last_word = word

        # String tags become their index in label_list; ints pass through.
        aligned.append(
            [label_list.index(item) if isinstance(item, str) else item for item in row_labels]
        )

    encoded["labels"] = aligned
    return encoded

In [None]:
# Apply tokenization and label alignment to both splits, one batch at a time.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import set_seed

# Seed every RNG *before* building the model: the token-classification head
# (classifier.weight / classifier.bias) is not part of the roberta-base
# checkpoint and is initialised randomly, so without a seed here the starting
# weights differ on every run. 42 matches the seed given to TrainingArguments.
set_seed(42)
model = AutoModelForTokenClassification.from_pretrained('roberta-base', num_labels=len(label_list))
# Store the tag names in the config so predictions are reported by name.
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [None]:
# Seed the torch, stdlib and NumPy random number generators.
# NOTE(review): this cell runs after the model has already been instantiated,
# so these seeds cannot influence the random initialisation of the classifier
# head — confirm whether the cell should move above the model cell.
import torch
torch.manual_seed(0)

import random
random.seed(0)

import numpy as np
np.random.seed(0)

In [None]:
# Per-device batch size, used for both training and evaluation in TrainingArguments.
batch_size=64

In [None]:
# Training configuration: evaluate once per epoch, never write checkpoints,
# and do not report to any external tracker.
args = TrainingArguments(
    # Output directory name.
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.005,
    save_strategy='no',
    report_to='none',
    # compute_metrics reads p.inputs, so the input ids must be passed along.
    include_inputs_for_metrics=True,
    seed=42, 
    data_seed=42
)

In [None]:
# Collator that batches the tokenized examples (inputs and labels) for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval metric; compute_metrics reads its overall precision/recall/F1/accuracy.
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Compute seqeval precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        p: evaluation output exposing ``predictions`` (per-token class scores
            with the classes on axis 2) and ``label_ids`` (integer labels,
            -100 at positions that must be ignored).

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"`` taken from seqeval's overall scores.

    Only positions whose label is not -100 are scored. Under the default
    ``label_all_tokens=False`` alignment these are exactly the first sub-token
    of each word. The previous ``startswith('##')`` filter targeted WordPiece
    continuation markers, which RoBERTa's byte-level BPE tokenizer is not
    expected to produce at those positions; it only added a tokenizer lookup
    per token, so it has been removed.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, p.label_ids):
        # Keep only the (prediction, gold) pairs at scored positions.
        scored = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, label_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in scored])
        true_labels.append([label_list[gold] for _, gold in scored])

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Assemble the Trainer: the 80% split is used for training and the held-out
# 20% split for the per-epoch evaluation scored by compute_metrics.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Explicitly mark every parameter of the RoBERTa encoder as trainable, i.e.
# fine-tune the whole backbone rather than only the classification head.
# NOTE(review): parameters are presumably trainable already after loading,
# which would make this a no-op — confirm.
for param in model.roberta.parameters():
    param.requires_grad = True

In [None]:
# Fine-tune for the configured number of epochs; metrics are logged after each epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1867
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 300


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.228591,0.750405,0.827895,0.787248,0.927843
2,No log,0.166832,0.819777,0.856057,0.837525,0.943093
3,No log,0.154412,0.822341,0.891819,0.855672,0.947267
4,No log,0.161278,0.819287,0.903889,0.859511,0.947267
5,No log,0.175207,0.823218,0.903442,0.861466,0.948872
6,No log,0.1831,0.849935,0.881091,0.865233,0.950959
7,No log,0.198998,0.825711,0.895843,0.859348,0.947749
8,No log,0.200487,0.829682,0.897184,0.862113,0.94815
9,No log,0.209649,0.833751,0.892266,0.862017,0.948712
10,No log,0.214251,0.832428,0.890478,0.860475,0.94807


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num e

TrainOutput(global_step=300, training_loss=0.12734495798746745, metrics={'train_runtime': 238.1541, 'train_samples_per_second': 78.395, 'train_steps_per_second': 1.26, 'total_flos': 632400930629886.0, 'train_loss': 0.12734495798746745, 'epoch': 10.0})

In [None]:
# Score the final model on the held-out split.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 64


{'epoch': 10.0,
 'eval_accuracy': 0.9480696685127217,
 'eval_f1': 0.860475161987041,
 'eval_loss': 0.214250847697258,
 'eval_precision': 0.8324279147513581,
 'eval_recall': 0.8904783191774698,
 'eval_runtime': 2.4178,
 'eval_samples_per_second': 193.151,
 'eval_steps_per_second': 3.309}

# Prediction

In [None]:
!wget https://raw.githubusercontent.com/skoltech-nlp/semantic-role-labelling/main/test_no_answers.tsv

--2022-07-17 11:28:07--  https://raw.githubusercontent.com/skoltech-nlp/semantic-role-labelling/main/test_no_answers.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58177 (57K) [text/plain]
Saving to: ‘test_no_answers.tsv’


2022-07-17 11:28:07 (4.54 MB/s) - ‘test_no_answers.tsv’ saved [58177/58177]



In [None]:
def make_dataset(filename):
  """Read a one-token-per-line file into a list of sentence strings.

  NOTE: this definition shadows the earlier two-column ``make_dataset`` used
  to build the training set. Re-run that earlier cell before rebuilding the
  training data in the same kernel.

  Sentences are separated by blank (or whitespace-only) lines. Each sentence
  is returned as its tokens joined by single spaces, followed by one trailing
  space (the format the downstream cells were run with).

  Args:
    filename: path to the UTF-8 encoded file with one token per line.

  Returns:
    A list of sentence strings, in file order. Empty sentences are skipped,
    and the final sentence is kept even when the file does not end with a
    blank line.
  """
  train_dataset = []
  tokens = []
  with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
      token = line.strip()
      if token:
        tokens.append(token)
      elif tokens:
        train_dataset.append(' '.join(tokens) + ' ')
        tokens = []
  # Flush the last sentence when the file has no trailing blank line.
  if tokens:
    train_dataset.append(' '.join(tokens) + ' ')
  return train_dataset

In [None]:
# Load the unlabelled test sentences as plain strings for the inference pipeline.
dev_dataset = make_dataset('test_no_answers.tsv')

In [None]:
from transformers import TokenClassificationPipeline

In [None]:
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so this cell also runs on machines without CUDA.
inference_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# ignore_labels=[] keeps the 'O' spans in the output; the post-processing cell
# needs them to rebuild a tag for every word.
token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer,
                                               device=inference_device,
                                               aggregation_strategy="max", ignore_labels=[])

In [None]:
# Run the pipeline over every test sentence; yields one list of aggregated spans per sentence.
outputs = token_classifier(dev_dataset)



In [None]:
# Expand each aggregated span back into per-word (word, tag) pairs: the first
# word of a non-'O' span is tagged 'B-<group>', the following words
# 'I-<group>', and words of 'O' spans stay 'O'.
result = []
for sentence_spans in outputs:
  tagged_words = []
  for span in sentence_spans:
    # Splitting on single spaces can yield empty strings; drop them.
    pieces = [piece for piece in span['word'].split(' ') if piece != '']
    if not pieces:
      continue
    group = span['entity_group']
    for position, piece in enumerate(pieces):
      if group == 'O':
        tag = group
      elif position == 0:
        tag = 'B-' + group
      else:
        tag = 'I-' + group
      tagged_words.append((piece, tag))
  result.append(tagged_words)

In [None]:
# Preview only the first few tagged sentences; dumping the whole list floods
# the notebook output.
result[:3]

[[('plus', 'O'),
  (',', 'O'),
  ('android', 'B-Object'),
  ('is', 'O'),
  ('developing', 'B-Aspect'),
  ('a', 'O'),
  ('way', 'O'),
  ('faster', 'B-Predicate'),
  ('than', 'O'),
  ('ios', 'B-Object'),
  ('so', 'O'),
  ('it', 'O'),
  ('has', 'O'),
  ('chances', 'O'),
  ('to', 'O'),
  ('become', 'O'),
  ('a', 'O'),
  ('laptop', 'O'),
  ('replacement', 'O'),
  ('earlier', 'B-Predicate'),
  ('than', 'O'),
  ('ios', 'B-Object'),
  ('.', 'O')],
 [('went', 'O'),
  ('to', 'O'),
  ('android', 'B-Object'),
  ('earlier', 'O'),
  ('this', 'O'),
  ('year', 'O'),
  ('after', 'O'),
  ('being', 'O'),
  ('convinced', 'O'),
  ('its', 'O'),
  ('better', 'B-Predicate'),
  ('then', 'O'),
  ('ios', 'O'),
  ('apple', 'B-Object'),
  ('.', 'O')],
 [('the', 'O'),
  ('version', 'O'),
  ('we', 'O'),
  ('showed', 'O'),
  ('here', 'O'),
  ('is', 'O'),
  ('ios', 'B-Object'),
  ('only', 'O'),
  (',', 'O'),
  ('because', 'O'),
  ('the', 'O'),
  ('ios', 'B-Object'),
  ('code', 'O'),
  ('supported', 'B-Aspect'),
  ('ib