# Installing required packages

In [None]:
# Environment setup: install the libraries used by this notebook.
# NOTE(review): only transformers is pinned (4.27.0); accelerate, datasets,
# evaluate and seqeval resolve to whatever is latest at run time, so a fresh
# run may not reproduce the environment that produced the outputs below.
!pip install --upgrade accelerate
!pip install datasets
!pip install transformers==4.27.0
!pip install evaluate seqeval

In [None]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, Features, Value, ClassLabel, Sequence, load_dataset
import evaluate
from seqeval.metrics import classification_report
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline, Pipeline
from nltk import wordpunct_tokenize
from tqdm import tqdm
from collections import Counter
import warnings

# Mount Google Drive at /content/drive: the pretrained checkpoint archive is
# unzipped from there and the fine-tuned checkpoint is zipped back to it.
from google.colab import drive
drive.mount('/content/drive')

# Processing source file

In [None]:
# Load the annotated corpus. The 'tokens' and 'tags' cells appear to hold
# stringified lists, so the list punctuation (quotes, brackets, commas) is
# removed and the remainder is split on whitespace.
# NOTE(review): any token that itself contains one of these characters loses
# it here as well -- confirm this is acceptable for the corpus.
enm = pd.read_excel('/content/enm1930_ner.xlsx')
_list_punctuation = str.maketrans('', '', "'[],")
for _column in ('tokens', 'tags'):
    enm[_column] = enm[_column].apply(lambda cell: cell.translate(_list_punctuation))
enm['split_split_sent'] = enm['tokens'].apply(str.split)
enm['split_ner'] = enm['tags'].apply(str.split)

In [None]:
def to_json(x, y):
  """Append one sentence record to the module-level ``enm_json`` list.

  x -- the sentence's token list; y -- the tag list for the same sentence.
  Retained so existing callers keep working; the list below no longer needs it.
  """
  enm_json.append({'sentence': x, 'tags': y})

# One {'sentence': tokens, 'tags': tags} record per corpus row, in row order.
# Built directly instead of through DataFrame.apply, which was used only for
# its side effect and displayed a Series of None as the cell result.
enm_json = [
  {'sentence': sentence, 'tags': tags}
  for sentence, tags in zip(enm['split_split_sent'], enm['split_ner'])
]

0      None
1      None
2      None
3      None
4      None
       ... 
463    None
464    None
465    None
466    None
467    None
Length: 468, dtype: object

In [None]:
# Tabular view of the records. The column names have to match the record keys
# ('sentence', 'tags'); names that are not keys yield columns filled with NaN.
ds_IOB2 = pd.DataFrame(enm_json, columns=['sentence', 'tags'])

In [None]:
# Every distinct tag in the corpus, in order of first appearance
# (dict keys preserve insertion order and discard repeats).
tag_ner = list(dict.fromkeys(tag for record in enm_json for tag in record['tags']))

In [None]:
# Rewrites applied to individual raw tags; any tag not listed is kept as-is.
# NOTE(review): 'U-SYNT' is mapped to 'I-SYNT' -- confirm that is the intended target.
_tag_fixes = {'B-MORH': 'B-MORPH', 'U-SYNT': 'I-SYNT', 'D-MORPH': 'B-MORPH'}

tags_clean = [
  {
    'sentence': record['sentence'],
    'tags': [_tag_fixes.get(tag, tag) for tag in record['tags']],
  }
  for record in enm_json
]

ds_IOB2 = pd.DataFrame(tags_clean, columns = ['sentence', 'tags'])

In [None]:
# Label inventory handed to ClassLabel: a tag's list position is its class id.
# NOTE(review): there is no 'I-PHON' entry -- confirm the corpus never uses it.
ner_tags_labels = [
    'O',
    'B-PHON',
    'B-MORPH', 'I-MORPH',
    'B-LEX', 'I-LEX',
    'B-SYNT', 'I-SYNT',
]

# Define the model

In [None]:
# Tokenizer of the `xlm-roberta-base` checkpoint; the same object is reused for
# dataset encoding, the data collator, every Trainer and the inference pipeline.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

# Defining a function that transforms the input

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    `examples["sentence"]` is passed to the tokenizer as pre-split words and
    `examples["tags"]` supplies one label per word. In the resulting "labels"
    field only the first sub-token of each word carries the word's label;
    tokens without a word id and continuation sub-tokens receive -100.

    Returns the tokenizer output with the added "labels" entry.
    """
    tokenized_inputs = tokenizer(examples["sentence"], truncation=True, is_split_into_words=True)

    def _align(batch_index, word_labels):
        # Walk the sub-tokens of one sentence, remembering the previous word id
        # so that only a change of word id emits a real label.
        aligned = []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=batch_index):
            opens_word = word is not None and word != last_word
            aligned.append(word_labels[word] if opens_word else -100)
            last_word = word
        return aligned

    tokenized_inputs["labels"] = [
        _align(row, word_labels) for row, word_labels in enumerate(examples["tags"])
    ]
    return tokenized_inputs

In [None]:
# Batch collator built from the shared tokenizer; passed to every Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Prepare function for evaluating

In [None]:
# seqeval metric loaded through `evaluate`; compute_metrics calls its .compute().
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Turn a (predictions, labels) pair into overall seqeval scores.

    The class with the highest score along axis 2 is taken for every token,
    positions whose gold label is -100 are dropped, and the remaining ids are
    mapped to tag names through the module-level `label_list`.

    Returns a dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores_per_token, gold_ids = p
    predicted_ids = np.argmax(scores_per_token, axis=2)

    decoded_predictions = []
    decoded_references = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real gold label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        decoded_predictions.append([label_list[pred] for pred, _ in kept])
        decoded_references.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=decoded_predictions, references=decoded_references)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Suppress all Python warnings from here on.
# NOTE(review): a blanket 'ignore' also hides deprecation and data-quality warnings.
warnings.filterwarnings('ignore')

# Create dataset from our data

In [None]:
# Schema: each row holds a sequence of string tokens and a parallel sequence of
# class labels whose names (and therefore ids) come from ner_tags_labels.
ds_features = Features({'sentence':  Sequence(Value("string")),
                        'tags': Sequence(ClassLabel(names=ner_tags_labels))})

# Build the dataset from the cleaned frame, then split it 50/50 with a fixed seed.
dataset = Dataset.from_pandas(ds_IOB2, features=ds_features)
dataset_splitted = dataset.train_test_split(test_size=0.5, seed=22)

In [None]:
# Encode both splits, then derive the id <-> name maps from the dataset schema
# so they follow the same order as the encoded tags.
ds_tokenized = dataset_splitted.map(tokenize_and_align_labels, batched=True)
label_list = ds_tokenized["train"].features["tags"].feature.names
id2label = dict(enumerate(label_list))
label2id = {name: index for index, name in enumerate(label_list)}

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

# Load the model which was trained on Zapadnodvisk data

In [None]:
# Unpack the previously trained checkpoint archive from Drive into ./model_roberta.
!unzip /content/drive/MyDrive/model_roberta.zip -d model_roberta

Archive:  /content/drive/MyDrive/model_roberta.zip
   creating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/tokenizer.json  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/rng_state.pth  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/scheduler.pt  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/optimizer.pt  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/special_tokens_map.json  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/tokenizer_config.json  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/trainer_state.json  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/config.json  
  inflating: model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/training_args.bin  
  inflating: model_roberta/content/xlm_roberta_bas

In [None]:
# Load the unpacked checkpoint-3000 as a token classifier, overriding its label
# count and label maps with the ones derived from this notebook's dataset.
model = AutoModelForTokenClassification.from_pretrained('model_roberta/content/xlm_roberta_base_dial/checkpoint-3000/', num_labels=len(label_list), id2label=id2label, label2id=label2id)

# Evaluate the model before it is fine-tuned on the Opochka data

In [None]:
# Baseline: score the loaded checkpoint on the whole (unsplit) dataset.
trainer = Trainer(model, tokenizer=tokenizer, data_collator=data_collator)
ds_tokenized_full = dataset.map(tokenize_and_align_labels, batched=True)
output = trainer.predict(ds_tokenized_full.remove_columns(['sentence', 'tags']))
predictions, labels, metrics = output
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag names, skipping positions whose gold label is -100.
true_predictions, true_labels = [], []
for predicted_row, gold_row in zip(predictions, labels):
    scored = [
        (label_list[p], label_list[l])
        for p, l in zip(predicted_row, gold_row)
        if l != -100
    ]
    true_predictions.append([predicted for predicted, _ in scored])
    true_labels.append([gold for _, gold in scored])
print(classification_report(true_labels, true_predictions))

Map:   0%|          | 0/468 [00:00<?, ? examples/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


              precision    recall  f1-score   support

         LEX       0.00      0.00      0.00        42
       MORPH       0.48      0.05      0.10       206
        PHON       0.48      0.05      0.09       248
        SYNT       0.00      0.00      0.00        24

   micro avg       0.48      0.05      0.08       520
   macro avg       0.24      0.03      0.05       520
weighted avg       0.42      0.05      0.08       520



# Train

In [None]:
# Fine-tuning configuration: evaluate and checkpoint every 100 steps and
# reload the best checkpoint into `model` once training finishes.
training_args = TrainingArguments(
    output_dir="xlm_roberta_base_dial_V1_opochka",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=25,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    # Without an explicit metric, "best" means lowest validation loss. Select
    # on the "f1" value returned by compute_metrics instead, so the reloaded
    # model is the checkpoint with the highest F1.
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Train on one half of the data and validate on the other half.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tokenized["train"],
    eval_dataset=ds_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.5014,0.574359,0.459016,0.510251,0.851391
200,No log,0.491076,0.625616,0.520492,0.568233,0.866032
300,No log,0.542506,0.540856,0.569672,0.55489,0.850659
400,No log,0.576486,0.562753,0.569672,0.566191,0.854319
500,0.344800,0.597016,0.606195,0.561475,0.582979,0.864568
600,0.344800,0.648644,0.609865,0.557377,0.582441,0.8653
700,0.344800,0.665447,0.612335,0.569672,0.590234,0.866032


TrainOutput(global_step=750, training_loss=0.2651380106608073, metrics={'train_runtime': 542.8235, 'train_samples_per_second': 10.777, 'train_steps_per_second': 1.382, 'total_flos': 60183173783040.0, 'train_loss': 0.2651380106608073, 'epoch': 25.0})

# Evaluate

In [None]:
# Reload the chosen checkpoint from disk and score it on the held-out half.
# NOTE(review): the checkpoint step is hard-coded; it has to be updated by hand
# if a re-run ends up with a different best step.
best_model_from_training_testing = '/content/xlm_roberta_base_dial_V1_opochka/checkpoint-700'
best_model = AutoModelForTokenClassification.from_pretrained(
    best_model_from_training_testing,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
trainer = Trainer(best_model, tokenizer=tokenizer, data_collator=data_collator)
output = trainer.predict(ds_tokenized['test'].remove_columns(['sentence', 'tags']))
predictions, labels, metrics = output
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag names, skipping positions whose gold label is -100.
true_predictions, true_labels = [], []
for predicted_row, gold_row in zip(predictions, labels):
    scored = [
        (label_list[p], label_list[l])
        for p, l in zip(predicted_row, gold_row)
        if l != -100
    ]
    true_predictions.append([predicted for predicted, _ in scored])
    true_labels.append([gold for _, gold in scored])
print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

         LEX       0.57      0.44      0.50        18
       MORPH       0.69      0.69      0.69        98
        PHON       0.56      0.53      0.54       119
        SYNT       0.00      0.00      0.00         9

   micro avg       0.61      0.57      0.59       244
   macro avg       0.46      0.42      0.43       244
weighted avg       0.59      0.57      0.58       244



# Save model to disk

In [None]:
# Archive the checkpoint-700 directory straight onto the mounted Drive.
!zip -r /content/drive/MyDrive/model_roberta_one_opochka_all.zip /content/xlm_roberta_base_dial_V1_opochka/checkpoint-700

  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/ (stored 0%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/config.json (deflated 52%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/rng_state.pth (deflated 28%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/training_args.bin (deflated 48%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/trainer_state.json (deflated 71%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/tokenizer_config.json (deflated 49%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/scheduler.pt (deflated 49%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/tokenizer.json (deflated 76%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/pytorch_model.bin (deflated 27%)
  adding: content/xlm_roberta_base_dial_V1_opochka/checkpoint-700/special_tokens_map.json (deflated 52%)
  adding: content/xlm_roberta_base_di

# Creating a pipeline that will accept transcriptions and generate a list of tags for each token

In [None]:
class MyPipeline(Pipeline):
    """Tag every word of a raw transcription with a single label.

    Calling the pipeline on a string lower-cases it, splits it with
    ``wordpunct_tokenize``, runs the token-classification model and merges the
    sub-token predictions back into one label per word.

    The result is a dict ``{position: {word: label}}`` where ``word`` is the
    text rebuilt from the word's sub-tokens. Input that yields no words
    produces an empty dict.
    """

    # Sentence delimiter tokens in the tokenizer output; never part of a word.
    _BOUNDARY_TOKENS = ('<s>', '</s>')
    # Marker found at the start of the first sub-token of each word.
    _WORD_START = '▁'

    def _sanitize_parameters(self, **kwargs):
        """Route an optional ``tokenizer`` keyword to ``preprocess``."""
        preprocess_kwargs = {}
        if "tokenizer" in kwargs:
            preprocess_kwargs["tokenizer"] = kwargs["tokenizer"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, text, tokenizer=None):
        """Split ``text`` into words and encode them as PyTorch tensors.

        ``tokenizer`` optionally replaces the pipeline's own tokenizer for
        encoding. ``postprocess`` still decodes ids with ``self.tokenizer``,
        so an override has to share its vocabulary.
        """
        active_tokenizer = self.tokenizer if tokenizer is None else tokenizer
        words = wordpunct_tokenize(text.lower())
        return active_tokenizer(words, is_split_into_words=True, return_tensors='pt')

    def _forward(self, model_inputs):
        """Run the model on whatever device it currently lives on.

        The input ids are returned next to the logits so that ``postprocess``
        does not depend on state stored on the pipeline instance.
        """
        device = self.model.device
        inputs = {name: tensor.to(device) for name, tensor in model_inputs.items()}
        outputs = self.model(**inputs)
        return {"logits": outputs.logits, "input_ids": inputs["input_ids"]}

    def postprocess(self, model_outputs):
        """Merge sub-token predictions into ``{position: {word: label}}``."""
        token_ids = model_outputs["input_ids"][0].tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
        label_ids = torch.argmax(model_outputs["logits"], dim=-1)[0].tolist()
        # Use the label names stored with the model rather than a notebook global.
        id2label = self.model.config.id2label

        # Each group gathers the text pieces and predicted labels of one word.
        groups = []
        for token, label_id in zip(tokens, label_ids):
            if token in self._BOUNDARY_TOKENS:
                continue
            # The first real token always opens a word; afterwards a new word
            # begins at every token carrying the word-start marker.
            if not groups or token.startswith(self._WORD_START):
                groups.append(([], []))
            pieces, votes = groups[-1]
            pieces.append(token.replace(self._WORD_START, ''))
            votes.append(id2label[label_id])

        return {
            position: {''.join(pieces): self._vote(votes)}
            for position, (pieces, votes) in enumerate(groups)
        }

    @staticmethod
    def _vote(labels):
        """Pick one label for a word from the labels of its sub-tokens.

        'O' wins only when no sub-token predicted anything else; otherwise the
        most frequent non-'O' label is used, ties going to the one seen first.
        """
        counts = Counter(label for label in labels if label != 'O')
        if not counts:
            return 'O'
        return max(counts, key=counts.get)

In [None]:
# Tag every sentence of the corpus and keep only the per-word labels.
# NOTE(review): this rebinds `pipeline`, shadowing the function imported from
# transformers, and runs `model` rather than the `best_model` evaluated above --
# confirm both are intended.
pipeline = MyPipeline(model=model.to('cuda'), tokenizer=tokenizer)
labels = []
for sentence in tqdm(list(enm['sentence'])):
    res = pipeline(sentence)
    # res maps word position -> {word: label}; flatten it to the labels in order.
    labels_sent = [label for word_entry in res.values() for label in word_entry.values()]
    labels.append(labels_sent)

100%|██████████| 468/468 [00:12<00:00, 37.31it/s]


In [None]:
# Attach the per-sentence label lists as a new column and export the frame to
# an Excel file in the current working directory.
enm['predictions'] = labels
enm.to_excel('enm_ner_predictions.xlsx') 