In [1]:
import torch
import datasets
import transformers
import numpy as np
from seqeval.metrics import f1_score
import pandas as pd
import os
from torch.utils.data import DataLoader

In [2]:
from train_datasets import BPEDropoutTrainDataset

In [3]:
if torch.cuda.is_available():
    print("GPU is enabled.")
    print("device count: {}, current device: {}".format(torch.cuda.device_count(), torch.cuda.current_device()))
else:
    print("GPU is not enabled.")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

GPU is enabled.
device count: 2, current device: 0


In [4]:
cache_dir = "./cache"

# Load Data

In [5]:
dataset_path = "masakhaner"
language = "amh"

model_path = "Davlan/afro-xlmr-mini"

In [6]:
dataset = datasets.load_dataset(dataset_path, language, cache_dir=cache_dir) 

Found cached dataset masakhaner (/atlas2/u/xiluo/temp/cache/masakhaner/amh/1.0.0/e61b24903076a3af7682855beebb820ec64edad0d6787b148c473694592d10b3)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
tags = dataset['train'].features["ner_tags"].feature
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE'], id=None)

In [8]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, use_fast=True, cache_dir=cache_dir)

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Loading cached processed dataset at /atlas2/u/xiluo/temp/cache/masakhaner/amh/1.0.0/e61b24903076a3af7682855beebb820ec64edad0d6787b148c473694592d10b3/cache-d966af7eee5cc661.arrow


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Loading cached processed dataset at /atlas2/u/xiluo/temp/cache/masakhaner/amh/1.0.0/e61b24903076a3af7682855beebb820ec64edad0d6787b148c473694592d10b3/cache-727d76e42bfacd12.arrow


In [11]:
pd.DataFrame(
    [tokenizer.convert_ids_to_tokens(tokenized_dataset['train'][0]['input_ids']), tokenized_dataset['train'][0]['labels']],
    index=["tokens", "ner_tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
tokens,<s>,▁,ቀዳሚ,ው,▁የ,ሶማሌ,▁ክልል,▁በአ,ወ,ዳይ,...,▁ስነ,▁ስርዓት,ን,▁የተ,መለከተ,▁ዘገባ,▁ነው,▁,፡፡,</s>
ner_tags,-100,0,-100,-100,5,-100,6,6,-100,-100,...,0,0,-100,0,-100,0,0,0,-100,-100


# Training

In [12]:
# Make debugging easier
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [13]:
training_args = transformers.TrainingArguments(
    output_dir = "./checkpoints/xlm-roberta-ner-swa-noswreg",
    log_level = "error",
    num_train_epochs = 50,
    per_device_train_batch_size = 12,
    per_device_eval_batch_size = 12,
    evaluation_strategy = "epoch",
    fp16 = True,
    logging_steps = len(train_dataset),
    push_to_hub = False,
)

NameError: name 'train_dataset' is not defined

In [188]:
def metrics_func(eval_arg):
    preds = np.argmax(eval_arg.predictions, axis=2)
    batch_size, seq_len = preds.shape
    y_true, y_pred = [], []
    for b in range(batch_size):
        true_label, pred_label = [], []
        for s in range(seq_len):
            if eval_arg.label_ids[b, s] != -100:  # -100 must be ignored
                true_label.append(index2tag[eval_arg.label_ids[b][s]])
                pred_label.append(index2tag[preds[b][s]])
        y_true.append(true_label)
        y_pred.append(pred_label)
    return {"f1": f1_score(y_true, y_pred)}

In [189]:
data_collator = transformers.DataCollatorForTokenClassification(
    tokenizer,
    return_tensors="pt")

In [190]:
xlmr_config = transformers.AutoConfig.from_pretrained(
    model_path,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index
)

In [191]:
model = (transformers.RobertaForTokenClassification
         .from_pretrained(model_path, config=xlmr_config, cache_dir=cache_dir)
         .to(device))

In [192]:
trainer = transformers.Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    compute_metrics = metrics_func,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['test']
)

In [193]:
trainer.train()



{'eval_loss': 1.4632463455200195, 'eval_f1': 0.0, 'eval_runtime': 0.9429, 'eval_samples_per_second': 530.302, 'eval_steps_per_second': 7.424, 'epoch': 1.0}
{'eval_loss': 1.3581286668777466, 'eval_f1': 0.0, 'eval_runtime': 0.9428, 'eval_samples_per_second': 530.309, 'eval_steps_per_second': 7.424, 'epoch': 2.0}
{'eval_loss': 1.2802386283874512, 'eval_f1': 0.0, 'eval_runtime': 0.9443, 'eval_samples_per_second': 529.467, 'eval_steps_per_second': 7.413, 'epoch': 3.0}
{'eval_loss': 1.2069284915924072, 'eval_f1': 0.0, 'eval_runtime': 1.0123, 'eval_samples_per_second': 493.917, 'eval_steps_per_second': 6.915, 'epoch': 4.0}
{'eval_loss': 1.136922836303711, 'eval_f1': 0.0, 'eval_runtime': 1.011, 'eval_samples_per_second': 494.542, 'eval_steps_per_second': 6.924, 'epoch': 5.0}
{'eval_loss': 1.0697129964828491, 'eval_f1': 0.0, 'eval_runtime': 1.0171, 'eval_samples_per_second': 491.614, 'eval_steps_per_second': 6.883, 'epoch': 6.0}
{'eval_loss': 1.00645112991333, 'eval_f1': 0.0, 'eval_runtime': 1.



{'eval_loss': 0.42189621925354004, 'eval_f1': 0.608096468561585, 'eval_runtime': 1.0157, 'eval_samples_per_second': 492.262, 'eval_steps_per_second': 6.892, 'epoch': 20.0}
{'eval_loss': 0.40212559700012207, 'eval_f1': 0.6194539249146759, 'eval_runtime': 1.0081, 'eval_samples_per_second': 495.963, 'eval_steps_per_second': 6.943, 'epoch': 21.0}
{'eval_loss': 0.39058101177215576, 'eval_f1': 0.6153846153846154, 'eval_runtime': 0.9431, 'eval_samples_per_second': 530.142, 'eval_steps_per_second': 7.422, 'epoch': 22.0}
{'eval_loss': 0.3792569935321808, 'eval_f1': 0.6286672254819782, 'eval_runtime': 0.9439, 'eval_samples_per_second': 529.735, 'eval_steps_per_second': 7.416, 'epoch': 23.0}
{'eval_loss': 0.36768636107444763, 'eval_f1': 0.6614583333333334, 'eval_runtime': 0.944, 'eval_samples_per_second': 529.686, 'eval_steps_per_second': 7.416, 'epoch': 24.0}
{'eval_loss': 0.3646329939365387, 'eval_f1': 0.6568712186689714, 'eval_runtime': 1.0198, 'eval_samples_per_second': 490.3, 'eval_steps_per



{'eval_loss': 0.2995493710041046, 'eval_f1': 0.7101827676240209, 'eval_runtime': 0.9531, 'eval_samples_per_second': 524.583, 'eval_steps_per_second': 7.344, 'epoch': 40.0}
{'eval_loss': 0.30170440673828125, 'eval_f1': 0.7008547008547009, 'eval_runtime': 0.9758, 'eval_samples_per_second': 512.382, 'eval_steps_per_second': 7.173, 'epoch': 41.0}
{'eval_loss': 0.29742953181266785, 'eval_f1': 0.7062876830318691, 'eval_runtime': 1.0062, 'eval_samples_per_second': 496.935, 'eval_steps_per_second': 6.957, 'epoch': 42.0}
{'eval_loss': 0.29559406638145447, 'eval_f1': 0.7120689655172414, 'eval_runtime': 1.0114, 'eval_samples_per_second': 494.342, 'eval_steps_per_second': 6.921, 'epoch': 43.0}
{'eval_loss': 0.2955131530761719, 'eval_f1': 0.7005163511187608, 'eval_runtime': 1.0084, 'eval_samples_per_second': 495.812, 'eval_steps_per_second': 6.941, 'epoch': 44.0}
{'eval_loss': 0.29284873604774475, 'eval_f1': 0.7063903281519861, 'eval_runtime': 0.9429, 'eval_samples_per_second': 530.255, 'eval_steps

TrainOutput(global_step=1250, training_loss=0.528118701171875, metrics={'train_runtime': 470.5249, 'train_samples_per_second': 185.963, 'train_steps_per_second': 2.657, 'train_loss': 0.528118701171875, 'epoch': 50.0})