# Evaluating a Hugging Face NER model on Hugging Face datasets: ORG detection performance

In [1]:
import torch
import numpy as np
from tqdm import tqdm

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
seqeval = evaluate.load("seqeval")

## HF Model

In [4]:
# Hub identifier of the token-classification checkpoint evaluated in this notebook.
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Put the model in evaluation mode; as the cell's last expression this also
# displays the module structure.
model.eval()
# nlp = pipeline(
#     'ner',
#     model=model,
#     tokenizer=tokenizer,
#     aggregation_strategy="simple"
# )

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
    

In [5]:
model.config.label2id

{'LOC': 3, 'MISC': 4, 'O': 0, 'ORG': 2, 'PER': 1}

In [6]:
model.config.id2label

{0: 'O', 1: 'PER', 2: 'ORG', 3: 'LOC', 4: 'MISC'}

In [7]:
# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

#     labels = []
#     for i, label in enumerate(examples[f"ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [8]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split examples and label every sub-token with its word's tag.

    Args:
        examples: a `datasets.Dataset` or a batch dict exposing a "tokens"
            column (one list of words per example) and a parallel "ner_tags"
            column (one integer tag per word).

    Returns:
        The output of the module-level `tokenizer` with an added "labels"
        entry. Each label sequence has the same length as the example's
        token sequence: sub-tokens carry the tag of the word they belong to,
        and tokens that map to no word (special tokens) carry -100.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        add_special_tokens=True,
    )

    labels = []
    for idx, word_labels in enumerate(examples["ner_tags"]):
        # word_ids has one entry per produced token: the index of the source
        # word, or None for tokens that do not come from any word.
        word_ids = tokenized_inputs.word_ids(batch_index=idx)

        # Looking the tag up per token keeps labels aligned even when the
        # sequence was truncated or a word produced no sub-tokens, and it does
        # not depend on where the special tokens sit.
        label_ids = [
            -100 if word_id is None else int(word_labels[word_id])
            for word_id in word_ids
        ]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

## Dataset: conll2003

In [9]:
# Only the "test" split of CoNLL-2003 is loaded for evaluation.
conll_dataset = load_dataset(
    "conll2003", 
    split="test", 
    # features=["tokens", "ner_tags"]
)

Found cached dataset conll2003 (/home/base/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


In [10]:
# Tag names of the "ner_tags" feature, keyed by their integer id.
id2label = {
    tag_id: tag_name
    for tag_id, tag_name in enumerate(conll_dataset.features["ner_tags"].feature.names)
}
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [11]:
# Invert the id -> tag mapping so tags can be looked up by name.
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [12]:
# Binary relabelling of the CoNLL-2003 tag set: ORG tags map to 1, all others to 0.
label2id = {
    tag: 1 if tag in ("B-ORG", "I-ORG") else 0
    for tag in (
        "O",
        "B-PER",
        "I-PER",
        "B-ORG",
        "I-ORG",
        "B-LOC",
        "I-LOC",
        "B-MISC",
        "I-MISC",
    )
}

In [13]:
# Rebuild each CoNLL example with only the columns needed for evaluation and
# with its NER tag ids translated (id -> tag name -> binary ORG flag) through
# the `id2label` and `label2id` mappings currently in scope.
conll_examples = []
for row in conll_dataset:
    for unused_column in ("pos_tags", "chunk_tags"):
        row.pop(unused_column)
    row["ner_tags"] = [label2id[id2label[tag_id]] for tag_id in row["ner_tags"]]
    conll_examples.append(row)

In [14]:
conll_examples[374]

{'id': '374',
 'tokens': ['NY', 'ISLANDERS', '7', '11', '8', '65', '72', '22'],
 'ner_tags': [1, 1, 0, 0, 0, 0, 0, 0]}

In [15]:
conll_dataset = Dataset.from_list(conll_examples)

In [16]:
conll_dataset_tokenized = tokenize_and_align_labels(conll_dataset)

In [17]:
conll_dataset_tokenized = Dataset.from_dict(dict(conll_dataset_tokenized))

## Dataset: ontonotes5

In [18]:
from datasets import load_dataset

ontonotes_dataset = load_dataset("tner/ontonotes5", split="test")

Found cached dataset ontonotes5 (/home/base/.cache/huggingface/datasets/tner___ontonotes5/ontonotes5/1.0.0/58d8410f24e689c113094eef1d1686365ba9155c66b57bdf8fa4273307c37612)


In [19]:
ontonotes_dataset.features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tags': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}

In [20]:
# Tag name -> integer id for the OntoNotes 5 tag set; ids follow list order.
label2id = {
    tag: tag_id
    for tag_id, tag in enumerate(
        [
            "O",
            "B-CARDINAL",
            "B-DATE",
            "I-DATE",
            "B-PERSON",
            "I-PERSON",
            "B-NORP",
            "B-GPE",
            "I-GPE",
            "B-LAW",
            "I-LAW",
            "B-ORG",
            "I-ORG",
            "B-PERCENT",
            "I-PERCENT",
            "B-ORDINAL",
            "B-MONEY",
            "I-MONEY",
            "B-WORK_OF_ART",
            "I-WORK_OF_ART",
            "B-FAC",
            "B-TIME",
            "I-CARDINAL",
            "B-LOC",
            "B-QUANTITY",
            "I-QUANTITY",
            "I-NORP",
            "I-LOC",
            "B-PRODUCT",
            "I-TIME",
            "B-EVENT",
            "I-EVENT",
            "I-FAC",
            "B-LANGUAGE",
            "I-PRODUCT",
            "I-ORDINAL",
            "I-LANGUAGE",
        ]
    )
}

In [21]:
# Invert the tag -> id mapping so tag ids can be turned back into tag names.
id2label = {tag_id: tag_name for tag_name, tag_id in label2id.items()}
id2label

{0: 'O',
 1: 'B-CARDINAL',
 2: 'B-DATE',
 3: 'I-DATE',
 4: 'B-PERSON',
 5: 'I-PERSON',
 6: 'B-NORP',
 7: 'B-GPE',
 8: 'I-GPE',
 9: 'B-LAW',
 10: 'I-LAW',
 11: 'B-ORG',
 12: 'I-ORG',
 13: 'B-PERCENT',
 14: 'I-PERCENT',
 15: 'B-ORDINAL',
 16: 'B-MONEY',
 17: 'I-MONEY',
 18: 'B-WORK_OF_ART',
 19: 'I-WORK_OF_ART',
 20: 'B-FAC',
 21: 'B-TIME',
 22: 'I-CARDINAL',
 23: 'B-LOC',
 24: 'B-QUANTITY',
 25: 'I-QUANTITY',
 26: 'I-NORP',
 27: 'I-LOC',
 28: 'B-PRODUCT',
 29: 'I-TIME',
 30: 'B-EVENT',
 31: 'I-EVENT',
 32: 'I-FAC',
 33: 'B-LANGUAGE',
 34: 'I-PRODUCT',
 35: 'I-ORDINAL',
 36: 'I-LANGUAGE'}

In [22]:
# Binary relabelling of the OntoNotes 5 tag set: ORG tags map to 1, all others to 0.
label2id = {
    tag: 1 if tag in ("B-ORG", "I-ORG") else 0
    for tag in (
        "O",
        "B-CARDINAL",
        "B-DATE",
        "I-DATE",
        "B-PERSON",
        "I-PERSON",
        "B-NORP",
        "B-GPE",
        "I-GPE",
        "B-LAW",
        "I-LAW",
        "B-ORG",
        "I-ORG",
        "B-PERCENT",
        "I-PERCENT",
        "B-ORDINAL",
        "B-MONEY",
        "I-MONEY",
        "B-WORK_OF_ART",
        "I-WORK_OF_ART",
        "B-FAC",
        "B-TIME",
        "I-CARDINAL",
        "B-LOC",
        "B-QUANTITY",
        "I-QUANTITY",
        "I-NORP",
        "I-LOC",
        "B-PRODUCT",
        "I-TIME",
        "B-EVENT",
        "I-EVENT",
        "I-FAC",
        "B-LANGUAGE",
        "I-PRODUCT",
        "I-ORDINAL",
        "I-LANGUAGE",
    )
}

In [23]:
# Replace each OntoNotes example's "tags" column with a binary "ner_tags"
# column (tag id -> tag name -> ORG flag) using the `id2label` and `label2id`
# mappings currently in scope.
ontonotes5_examples = []
for row in ontonotes_dataset:
    tag_ids = row.pop("tags")
    row["ner_tags"] = [label2id[id2label[tag_id]] for tag_id in tag_ids]
    ontonotes5_examples.append(row)

In [24]:
ontonotes_dataset = Dataset.from_list(ontonotes5_examples)

In [25]:
ontonotes_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 8262
})

In [26]:
ontonotes_dataset_tokenized = tokenize_and_align_labels(ontonotes_dataset)

In [27]:
ontonotes_dataset_tokenized = Dataset.from_dict(dict(ontonotes_dataset_tokenized))

## Eval

In [28]:
from datasets import concatenate_datasets

In [29]:
tokenized_dataset = concatenate_datasets([ontonotes_dataset_tokenized, conll_dataset_tokenized])

In [30]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11715
})

In [38]:
from transformers import DataCollatorForTokenClassification

# The collator assembles the variable-length examples into batches for the loader.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Iterate over the combined tokenized dataset in batches of 64 examples.
dataloader = DataLoader(tokenized_dataset, collate_fn=data_collator, batch_size=64)

In [39]:
# batch = next(iter(dataloader))

In [40]:
seqeval = evaluate.load("seqeval")

In [41]:
label_list = ["O", "ORG"]

In [42]:
def _to_seqeval_tag(label_name):
    """Return `label_name` in a form seqeval can read as an entity tag.

    "O" and names that already carry a one-letter scheme prefix (for example
    "B-ORG") are returned unchanged; a bare class name such as "ORG" becomes
    "I-ORG".
    """
    if label_name == "O" or label_name[1:2] == "-":
        return label_name
    return f"I-{label_name}"


def compute_metrics(predictions, labels, label_list=label_list):
    """Score binarised ORG predictions against reference labels with seqeval.

    Args:
        predictions: per-token class scores; the predicted class is taken as
            the argmax over axis 2.
        labels: per-token reference label ids indexing into `label_list`;
            positions equal to -100 are ignored.
        label_list: label names indexed by binary label id.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    class_ids = np.argmax(predictions, axis=2)
    # Class id 2 is hard-coded as the ORG class of the evaluated model:
    # ORG -> 1, every other class -> 0.
    binary_predictions = np.where(class_ids == 2, 1, 0)

    # seqeval works on scheme-prefixed tags (e.g. "I-ORG"); a bare class name
    # like "ORG" is not recognised as an entity tag, so names are normalised
    # before scoring.
    tag_names = [_to_seqeval_tag(name) for name in label_list]

    true_predictions = [
        [tag_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(binary_predictions, labels)
    ]
    true_labels = [
        [tag_names[l] for (_, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(binary_predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [43]:
preds = []
labels = []

In [44]:
# Run the model over every batch without tracking gradients and collect, per
# batch, the binarised predictions and the reference labels as numpy arrays.
with torch.no_grad():
    for batch in tqdm(dataloader):
        logits = model(**batch).logits
        # Predicted class id 2 is treated as ORG: ORG -> 1, anything else -> 0.
        is_org = (logits.argmax(dim=2) == 2).long()

        preds.append(is_org.numpy())
        labels.append(batch.labels.numpy())

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 184/184 [21:04<00:00,  6.87s/it]


In [45]:
from sklearn.metrics import classification_report

In [46]:
# preds = [p.numpy() for p in preds]
# labels = [l.numpy() for l in labels]

In [47]:
# Flatten every per-batch array into one dimension.
preds = [batch_preds.reshape(-1) for batch_preds in preds]
labels = [batch_labels.reshape(-1) for batch_labels in labels]

In [48]:
# Join the flattened per-batch arrays into a single array each.
preds = np.concatenate(preds)
labels = np.concatenate(labels)

In [49]:
len(preds), len(labels)

(721988, 721988)

In [50]:
# Keep only (prediction, reference) pairs whose reference label is not -100.
filtered = [pair for pair in zip(preds, labels) if pair[1] != -100]

In [51]:
len(filtered)

228284

In [52]:
import pandas as pd

In [53]:
preds_df = pd.DataFrame(filtered, columns=["pred", "true"])

In [54]:
preds_df

Unnamed: 0,pred,true
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
228279,0,0
228280,0,0
228281,0,0
228282,0,0


In [55]:
print(classification_report(preds_df.true, preds_df.pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    218273
           1       0.86      0.90      0.88     10011

    accuracy                           0.99    228284
   macro avg       0.93      0.95      0.94    228284
weighted avg       0.99      0.99      0.99    228284

