In [1]:
import torch
import datasets
import transformers
import numpy as np
from seqeval.metrics import f1_score
import pandas as pd
import os
from torch.utils.data import DataLoader

In [70]:
if torch.cuda.is_available():
    print("GPU is enabled.")
    print("device count: {}, current device: {}".format(torch.cuda.device_count(), torch.cuda.current_device()))
else:
    print("GPU is not enabled.")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

GPU is enabled.
device count: 2, current device: 0


In [71]:
cache_dir = "./cache"

In [118]:
!pip install protobuf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting protobuf
  Downloading protobuf-4.23.2-cp37-abi3-manylinux2014_x86_64.whl (304 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.5/304.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: protobuf
Successfully installed protobuf-4.23.2


# Load Data

In [291]:
# English
# dataset_path = "conll2003"
# language = None
# split=['train[:25%]','test[:25%]']
# ds = datasets.load_dataset(dataset_path, language, cache_dir=cache_dir, split=split)
# dataset = datasets.DatasetDict()
# dataset['train'] = ds[0]
# dataset['test'] = ds[1]

# Spanish
# dataset_path = "conll2002"
# language = "es"
# split=['train[:40%]','test[:40%]']
# ds = datasets.load_dataset(dataset_path, language, cache_dir=cache_dir, split=split)
# dataset = datasets.DatasetDict()
# dataset['train'] = ds[0]
# dataset['test'] = ds[1]

# Dutch
# dataset_path = "conll2002"
# language = "nl"
# split=['train[:22%]','test[:22%]']
# ds = datasets.load_dataset(dataset_path, language, cache_dir=cache_dir, split=split)
# dataset = datasets.DatasetDict()
# dataset['train'] = ds[0]
# dataset['test'] = ds[1]

# German
# dataset_path = "germaner"
# language = None
# split=['train[:25%]']
# ds = datasets.load_dataset(dataset_path, language, cache_dir=cache_dir, split=split)
# train_testvalid = ds[0].train_test_split(test_size=0.5)
# # Split the 10% test + valid in half test, half valid
# test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# # gather everyone if you want to have a single DatasetDict
# dataset = datasets.DatasetDict({
#     'train': train_testvalid['train'],
#     'test': test_valid['test'],
#     'validation': test_valid['train']})

model_path = "Davlan/xlm-roberta-base-ner-hrl"

Found cached dataset germaner (/atlas2/u/xiluo/temp/cache/germaner/default/0.9.1/98610f255094d6f67f37c379e5e9f0800322705df916299ddd09ac6dab80bbe8)


  0%|          | 0/1 [00:00<?, ?it/s]

In [294]:
print(len(dataset['train']))
print(len(dataset['test']))

3275
1638


In [216]:
tags = dataset['train'].features["ner_tags"].feature
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [217]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, use_fast=False, sp_model_kwargs={'enable_sampling': True, 'alpha': 0.5}, cache_dir=cache_dir)

In [224]:
tokenizer(ex, is_split_into_words=True)

{'input_ids': [0, 65589, 6, 31894, 6, 11253, 6, 5053, 6, 11072, 6, 11934, 6, 13849, 6, 43, 6, 7242, 6, 3895, 6, 14258, 6, 26578, 6, 2803, 6, 3846, 6, 2797, 6, 1589, 6, 4, 6, 4965, 6, 5294, 6, 29738, 6, 3024, 6, 219288, 6, 13797, 6, 2803, 6, 7149, 6, 1898, 6, 4, 54721, 6, 1906, 6, 465, 6, 6633, 6, 19691, 6, 14498, 6, 10502, 6, 43, 6, 9243, 73675, 6, 7064, 6, 11795, 6, 4, 6, 3924, 6, 635, 6, 887, 6, 10050, 6, 994, 6, 127066, 6, 562, 65589, 6, 9224, 711, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [219]:
def tokenize_and_align_labels(examples):

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, max_length=512, is_split_into_words=True)
    label_old = examples["ner_tags"]

    new_labels = []

    for idx in range(len(tokenized_inputs['input_ids'])):
        word_idx = 0
        label_new = []

        for id in tokenized_inputs['input_ids'][idx]:
            token = tokenizer.convert_ids_to_tokens(id)
            if token == "<s>":
                label_new.append(-100) #assign <s> to dummy token
            elif ord(token[0]) == 9601:
                label_new.append(label_old[idx][word_idx]) #only label first token of a word
                word_idx += 1
            else:
                label_new.append(-100) #assign non-first token of word to dummy token
    
        new_labels.append(label_new)


    tokenized_inputs["labels"] = new_labels

    # print(tokenized_inputs)

    return tokenized_inputs

In [220]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2081 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [170]:
pd.DataFrame(
    [tokenizer.convert_ids_to_tokens(tokenized_dataset['train'][0]['input_ids']), tokenized_dataset['train'][0]['labels']],
    index=["tokens", "ner_tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,88,89,90,91,92,93,94,95,96,97
tokens,<s>,▁当,▁,希,▁,望,▁,工,▁,程,...,▁,悔,▁,不,▁当,▁,初,▁,!,</s>
ner_tags,-100,0,0,-100,0,-100,0,-100,0,-100,...,0,-100,0,-100,0,0,-100,0,-100,-100


# Training

In [149]:
# Make debugging easier
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [154]:
training_args = transformers.TrainingArguments(
    output_dir = "./checkpoints/xlm-roberta-ner-swa-noswreg",
    log_level = "error",
    num_train_epochs = 50,
    per_device_train_batch_size = 12,
    per_device_eval_batch_size = 12,
    evaluation_strategy = "epoch",
    fp16 = True,
    logging_steps = len(dataset['train']),
    push_to_hub = False,
)

In [155]:
def metrics_func(eval_arg):
    preds = np.argmax(eval_arg.predictions, axis=2)
    batch_size, seq_len = preds.shape
    y_true, y_pred = [], []
    for b in range(batch_size):
        true_label, pred_label = [], []
        for s in range(seq_len):
            if eval_arg.label_ids[b, s] != -100:  # -100 must be ignored
                true_label.append(index2tag[eval_arg.label_ids[b][s]])
                pred_label.append(index2tag[preds[b][s]])
        y_true.append(true_label)
        y_pred.append(pred_label)
    return {"f1": f1_score(y_true, y_pred)}

In [156]:
data_collator = transformers.DataCollatorForTokenClassification(
    tokenizer,
    return_tensors="pt")

In [157]:
xlmr_config = transformers.AutoConfig.from_pretrained(
    model_path,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/980 [00:00<?, ?B/s]

In [158]:
model = (transformers.RobertaForTokenClassification
         .from_pretrained(model_path, config=xlmr_config, cache_dir=cache_dir)
         .to(device))

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [159]:
trainer = transformers.Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    compute_metrics = metrics_func,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['test']
)

In [160]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,No log,0.165555,0.892435
2,No log,0.177335,0.89272


KeyboardInterrupt: 