# Data downloader


In [1]:
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install evaluate -q
!pip install datasets -q

[0m

In [2]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
labels_names = dataset["train"].features["ner_tags"].feature.names

In [4]:
labels_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
dataset["train"][:2]

{'id': ['0', '1'],
 'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn']],
 'pos_tags': [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]],
 'chunk_tags': [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]],
 'ner_tags': [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]}

# Data preparing

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["tokens"], padding = "max_length", truncation = True, is_split_into_words = True)

In [8]:
tokenized_datasets_ = dataset.map(tokenize_function, batched = True)

  0%|          | 0/15 [00:00<?, ?ba/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [9]:
len(tokenized_datasets_["train"][0]["input_ids"]) == len(tokenized_datasets_["train"][0]["ner_tags"])

False

Adjusting labels to fit input

In [10]:
def tokenize_adjust_labels(samples):
    tokenized_samples = tokenizer.batch_encode_plus(samples["tokens"], is_split_into_words = True, truncation = True)

    total_adjusted_labels = []

    for k in range(len(tokenized_samples["input_ids"])):
        prev_wid = -1

        word_ids_list = tokenized_samples.word_ids(batch_index = k)

        existing_label_ids = samples["ner_tags"][k]

        i = -1

        adjusted_label_ids = []

        for word_idx in word_ids_list:
            if word_idx is None:
                adjusted_label_ids.append(-100)
            elif word_idx != prev_wid:
                i += 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = word_idx
            else:
                label_name = labels_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        
        total_adjusted_labels.append(adjusted_label_ids)
            
    tokenized_samples["labels"] = total_adjusted_labels

    return tokenized_samples


In [11]:
out = tokenizer("Fine tune NER with XLNET")
out

{'input_ids': [9678, 6811, 17, 23897, 33, 17, 20545, 11756, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
out.word_ids(0)

[0, 1, 2, 2, 3, 4, 4, 4, None, None]

In [13]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched = True, remove_columns = ["tokens", "ner_tags", "pos_tags", "chunk_tags"])

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3454
    })
})

In [15]:
tokenized_dataset["train"][:2]

{'id': ['0', '1'],
 'input_ids': [[1534, 8006, 23, 871, 547, 22, 8569, 559, 15174, 17, 9, 4, 3],
  [1656, 19769, 4, 3]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [0, 0, 0, 2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[3, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100, -100], [1, 2, -100, -100]]}

Padding token to a constant length

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

# Fine-tuning XLNet for NER task

In [17]:
from transformers import AutoModelForTokenClassification

In [18]:
model = AutoModelForTokenClassification.from_pretrained("xlnet-base-cased", num_labels = len(labels_names))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForTokenClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Computing metrics 

In [19]:
import numpy as np
import evaluate

In [20]:
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [21]:
def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis = 2)

    true_predictions = [
        [labels_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [labels_names[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions = true_predictions, references = true_labels)

    return {
        "precision" : results["overall_precision"],
        "recall" : results["overall_recall"],
        "f1" : results["overall_f1"],
        "accuracy" : results["overall_accuracy"]
    }

## Using Trainer API to fine-tune

In [22]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy

In [23]:
batch_size = 16
logging_steps = len(tokenized_dataset["train"]) // batch_size
epochs = 7
weight_decay = .01
eval_steps = 100
learning_rate = 2e-5
early_stopping_patience = 3

In [24]:
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    learning_rate = learning_rate,
    logging_steps=logging_steps,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = eval_steps,
    save_total_limit = 5,
    metric_for_best_model = "eval_loss",
    load_best_model_at_end = True
) 

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
)

In [26]:
trainer.train()

The following columns in the evaluation set don't have a corresponding argument in `XLNetForTokenClassification.forward` and have been ignored: id. If id are not expected by `XLNetForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3251
  Batch size = 16
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-500/tokenizer_config.json
Special tokens file saved in results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `XLNetForTokenClassification.forward` and have been ignored: id. If id are not expected by `XLNetForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3251
  Batch size = 16
The following columns in the evalu

TrainOutput(global_step=1900, training_loss=0.09672070879685252, metrics={'train_runtime': 597.314, 'train_samples_per_second': 164.56, 'train_steps_per_second': 10.289, 'total_flos': 874227373099560.0, 'train_loss': 0.09672070879685252, 'epoch': 2.16})

## Testing

In [27]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)

true_predictions = [
    [labels_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels = [
    [labels_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)

The following columns in the test set don't have a corresponding argument in `XLNetForTokenClassification.forward` and have been ignored: id. If id are not expected by `XLNetForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3454
  Batch size = 16


In [28]:
results

{'LOC': {'precision': 0.9236058059587471,
  'recall': 0.9148694665153235,
  'f1': 0.9192168789203573,
  'number': 2643},
 'MISC': {'precision': 0.7229845626072041,
  'recall': 0.725473321858864,
  'f1': 0.7242268041237112,
  'number': 1162},
 'ORG': {'precision': 0.8743849493487699,
  'recall': 0.9396578538102643,
  'f1': 0.9058470764617691,
  'number': 3215},
 'PER': {'precision': 0.9583793738489871,
  'recall': 0.9566176470588236,
  'f1': 0.9574977000919964,
  'number': 2720},
 'overall_precision': 0.8925055254169179,
 'overall_recall': 0.9121149897330596,
 'overall_f1': 0.9022037168680817,
 'overall_accuracy': 0.9744028319005266}

In [29]:
true_predictions

[['O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  

In [30]:
true_labels

[['O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER', 'B-PER', 'I-PER', 'I-PER'],
 ['B-LOC',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
 

In [31]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [32]:
y_pred = []
y = []
for sent in true_predictions:
    for tag in sent:
        y_pred.append(tag)
for sent in true_labels:
    for tag in sent:
        y.append(tag)


In [33]:
print(classification_report(y_pred, y))

              precision    recall  f1-score   support

       B-LOC       0.92      0.93      0.93      2599
      B-MISC       0.75      0.77      0.76      1133
       B-ORG       0.95      0.89      0.92      3429
       B-PER       0.96      0.96      0.96      2708
       I-LOC       0.88      0.83      0.86       382
      I-MISC       0.69      0.46      0.55       471
       I-ORG       0.92      0.85      0.88      1320
       I-PER       1.00      0.98      0.99      2533
           O       0.99      1.00      0.99     52659

    accuracy                           0.97     67234
   macro avg       0.89      0.85      0.87     67234
weighted avg       0.97      0.97      0.97     67234



In [38]:
print(f"Accuracy = {accuracy_score(y_pred, y):.6f}, \
        Precision (weighted) = {precision_score(y_pred, y, average = 'weighted'):.6f}, \
        Recall (weighted) = {recall_score(y_pred, y, average = 'weighted'):.6f}, \
        F1 (weighted) = {f1_score(y_pred, y, average = 'weighted'):.6f}")

Accuracy = 0.974403,         Precision (weighted) = 0.973616,         Recall (weighted) = 0.974403,         F1 (weighted) = 0.973741


In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
print(confusion_matrix(y_pred, y))

[[ 2427    50    54    30     1     0     2     0    35]
 [   68   869    38     1     0     4     0     0   153]
 [  108    96  3050    34     0     4    10     0   127]
 [    7    16    23  2606     0     0     0     1    55]
 [    7     0     0     0   318    10    24     0    23]
 [    3    25     2     0     4   218    11     0   208]
 [   11    11    18     0    31    41  1116    10    82]
 [    0     0     0    22     5     3    17  2478     8]
 [   12    95    30    27     1    35    28     0 52431]]
