In [98]:
import torch
import datasets
import transformers
import numpy as np
from seqeval.metrics import f1_score
import pandas as pd
import os
from torch.utils.data import DataLoader

In [99]:
from train_datasets import BPEDropoutTrainDataset

In [3]:
if torch.cuda.is_available():
    print("GPU is enabled.")
    print("device count: {}, current device: {}".format(torch.cuda.device_count(), torch.cuda.current_device()))
else:
    print("GPU is not enabled.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GPU is not enabled.


In [4]:
cache_dir = "./cache"

# Load Data

In [100]:
dataset_path = "Davlan/conll2003_noMISC"
#dataset_path = "conll2003"
language = None

# subword regularization params:
bpe_dropout_p = 0.0
model_path = "Davlan/xlm-roberta-base-ner-hrl"

In [101]:
train_dataset = BPEDropoutTrainDataset(dataset_path, model_path, dataset_language=language, bpe_dropout_p=bpe_dropout_p, cache_dir=cache_dir, train=True)
test_dataset = BPEDropoutTrainDataset(dataset_path, model_path, dataset_language=language, bpe_dropout_p=0.0, cache_dir=cache_dir, train=False)

Found cached dataset json (/Users/timwu0/Documents/CS224U/cs224u_subwordreg/cache/Davlan___json/Davlan--conll2003_noMISC-f8291b43d0280c02/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset json (/Users/timwu0/Documents/CS224U/cs224u_subwordreg/cache/Davlan___json/Davlan--conll2003_noMISC-f8291b43d0280c02/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/3 [00:00<?, ?it/s]

In [102]:
train_dataset.dset['train'][0]['ner_tags'][0]

'B-ORG'

In [92]:
tags = train_dataset.dset['train'].features["ner_tags"].feature
tags

Value(dtype='string', id=None)

In [68]:
index2tag = {0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}
tag2index = {'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6}

In [69]:
tokenizer = train_dataset.tokenizer

In [70]:
pd.DataFrame(
    [tokenizer.convert_ids_to_tokens(train_dataset[0]['input_ids']), train_dataset[0]['labels']],
    index=["tokens", "ner_tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
tokens,<s>,▁EU,▁re,ject,s,▁German,▁call,▁to,▁boy,cot,t,▁British,▁la,mb,▁,.,</s>
ner_tags,-100,B-ORG,O,-100,-100,O,O,O,O,-100,-100,O,O,-100,O,-100,-100


# Training

In [72]:
# Make debugging easier
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [73]:
training_args = transformers.TrainingArguments(
    output_dir = "./checkpoints/xlm-roberta-ner-en-2",
    log_level = "error",
    num_train_epochs = 50,
    per_device_train_batch_size = 12,
    per_device_eval_batch_size = 12,
    evaluation_strategy = "epoch",
    fp16 = torch.cuda.is_available(),
    logging_steps = len(train_dataset),
    push_to_hub = False
)

In [74]:
def metrics_func(eval_arg):
    preds = np.argmax(eval_arg.predictions, axis=2)
    batch_size, seq_len = preds.shape
    y_true, y_pred = [], []
    for b in range(batch_size):
        true_label, pred_label = [], []
        for s in range(seq_len):
            if eval_arg.label_ids[b, s] != -100:  # -100 must be ignored
                true_label.append(index2tag[eval_arg.label_ids[b][s]])
                pred_label.append(index2tag[preds[b][s]])
        y_true.append(true_label)
        y_pred.append(pred_label)
    return {"f1": f1_score(y_true, y_pred)}

In [75]:
data_collator = transformers.DataCollatorForTokenClassification(
    tokenizer,
    return_tensors="pt")

In [76]:
xlmr_config = transformers.AutoConfig.from_pretrained(
    model_path,
    num_labels=7,
    id2label=index2tag,
    label2id=tag2index
)

In [43]:
model = (transformers.RobertaForTokenClassification
         .from_pretrained(model_path, config=xlmr_config, cache_dir=cache_dir, ignore_mismatched_sizes=True)
         .to(device))

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-ner-hrl and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
trainer = transformers.Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    compute_metrics = metrics_func,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)

In [45]:
trainer.train()



TypeError: an integer is required (got type str)