In [1]:
import re
import string
import csv
import string
import numpy as np
from itertools import groupby
import emoji
import pandas as pd

In [2]:
def read_file_as_lists(filename, delimiter='\t'):
    with open(filename) as stream:
        reader = csv.reader(stream, delimiter=delimiter, quoting=csv.QUOTE_NONE)
        labeled_tokens = [zip(*g) for k, g in groupby(reader, lambda x: not [s for s in x if s.strip()]) if not k]
        tokens, labels = zip(*labeled_tokens)
        return [list(t) for t in tokens], [list(l) for l in labels]

In [3]:
train_tokens, train_labels = read_file_as_lists("/kaggle/input/w-net-data/wnut17train.conll")
dev_tokens, dev_labels = read_file_as_lists("/kaggle/input/w-net-data/emerging.dev.conll")
test_tokens, test_labels = read_file_as_lists("/kaggle/input/w-net-data/emerging.test.annotated")

In [4]:
def clean_tokens(token_list):
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
#     spell = Speller(lang='en')
    for tokens in token_list:
        for i in range(len(tokens)):
            if re.match(url_pattern, tokens[i]):
                tokens[i] = '<URL>' 
            elif emoji.emoji_count(tokens[i]) > 0:
                tokens[i] = '<emoji>'
#             else:
#                 tokens[i] = spell(tokens[i])
        
    return token_list

In [5]:
train_tokens = clean_tokens(train_tokens)
dev_tokens = clean_tokens(dev_tokens)
test_tokens = clean_tokens(test_tokens)

In [6]:
print("Train")
print(len(train_tokens))
print('*'*50)
print("Dev")
print(len(dev_tokens))
print('*'*50)
print("Test")
print(len(test_tokens))

Train
3394
**************************************************
Dev
1009
**************************************************
Test
1287


In [7]:
print("Train")
print(len([token for sublist in train_tokens for token in sublist]))
print('*'*50)
print("Dev")
print(len([token for sublist in dev_tokens for token in sublist]))
print('*'*50)
print("Test")
print(len([token for sublist in test_tokens for token in sublist]))

Train
62730
**************************************************
Dev
15733
**************************************************
Test
23394


In [8]:
df_train_entities = pd.DataFrame({"train_entities": [entity for sublist in train_labels for entity in sublist if entity != 'O']})
df_dev_entities = pd.DataFrame({"dev_entities": [entity for sublist in dev_labels for entity in sublist if entity != 'O']})
df_test_entities = pd.DataFrame({"test_entities": [entity for sublist in test_labels for entity in sublist if entity != 'O']})

In [9]:
df_train_entities_count = df_train_entities["train_entities"].value_counts().sum()
df_dev_entities_count = df_dev_entities["dev_entities"].value_counts().sum()
df_test_entities_count = df_test_entities["test_entities"].value_counts().sum()
print(f"df_train_entities_count: {df_train_entities_count}")
print(f"df_dev_entities_count: {df_dev_entities_count}")
print(f"df_test_entities_count: {df_test_entities_count}")

df_train_entities_count: 3160
df_dev_entities_count: 1250
df_test_entities_count: 1740


In [10]:
types = ["person", "location", "corporation", "product", "creative-work", "group"]
for data in [df_train_entities, df_dev_entities, df_test_entities]:
    print(data.columns[0])
    for type_ in types:
        print(f"{type_}: {data[data.columns[0]].str.endswith(type_).sum()}")
    print("*"*50)

train_entities
person: 995
location: 793
corporation: 267
product: 345
creative-work: 346
group: 414
**************************************************
dev_entities
person: 587
location: 107
corporation: 46
product: 208
creative-work: 238
group: 64
**************************************************
test_entities
person: 560
location: 244
corporation: 88
product: 253
creative-work: 360
group: 235
**************************************************


In [11]:
def get_label2index(data):
    labels = list(set([label for sublist in data for label in sublist]))
    index2label = dict(enumerate(labels))
    label2index = {value: key for key, value in index2label.items()}
    return label2index

label2index = get_label2index(train_labels)

In [12]:
index2label = {value: key for key, value in label2index.items()}

In [13]:
train_labels = [[label2index.get(i) for i in sublist] for sublist in train_labels]
dev_labels = [[label2index.get(i) for i in sublist] for sublist in dev_labels]
test_labels = [[label2index.get(i) for i in sublist] for sublist in test_labels]

In [14]:
from datasets import Dataset

train_data = Dataset.from_dict({'tokens': train_tokens, 'labels': train_labels})
dev_data = Dataset.from_dict({'tokens': dev_tokens, 'labels': dev_labels})
test_data = Dataset.from_dict({'tokens': test_tokens, 'labels': test_labels})

In [15]:
from datasets import DatasetDict

dataset = DatasetDict({
    'train': train_data,
    'dev': dev_data,
    'test': test_data
})

In [16]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [17]:
B2I = {
    2:10,
    4:0,
    5:8,
    6:1,
    7:12,
    11:9
}

In [18]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            if label in B2I.keys():
                label = B2I.get(label)
            new_labels.append(label)
    return new_labels

In [19]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [20]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [21]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)



In [22]:
train = data_collator(tokenized_datasets["train"])
dev = data_collator(tokenized_datasets["dev"])
test = data_collator(tokenized_datasets["test"])                  

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [23]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16165 sha256=b0f8874baa4bce47abe940307214b6293c650b54a512af2b174bde3c6d04c974
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [24]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1


In [25]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [26]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[index2label.get(l) for l in label if l != -100] for label in labels]
    true_predictions = [
        [index2label.get(p) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [27]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=index2label,
    label2id=label2index,
)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [29]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.370723,0.599493,0.277582,0.379462,0.912219
2,No log,0.390677,0.630662,0.318662,0.423392,0.917312
3,0.193500,0.364942,0.609971,0.366197,0.457646,0.921834


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=639, training_loss=0.17325312654737016, metrics={'train_runtime': 279.4317, 'train_samples_per_second': 36.438, 'train_steps_per_second': 2.287, 'total_flos': 287048741860728.0, 'train_loss': 0.17325312654737016, 'epoch': 3.0})

In [30]:
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
results



{'eval_loss': 0.5096039772033691,
 'eval_precision': 0.5683673469387756,
 'eval_recall': 0.21514098107377366,
 'eval_f1': 0.31213224992995237,
 'eval_accuracy': 0.9145766485495384,
 'eval_runtime': 7.0325,
 'eval_samples_per_second': 183.007,
 'eval_steps_per_second': 11.518,
 'epoch': 3.0}

In [31]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    true_labels = [[index2label.get(l) for l in label if l != -100] for label in labels]
    true_predictions = [
        [index2label.get(p) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [32]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from accelerate import Accelerator
from tqdm.auto import tqdm
import torch
from tqdm.auto import tqdm


def Training(lr, batch_size):
    
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=batch_size,
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["dev"], collate_fn=data_collator, batch_size=batch_size
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        id2label=index2label,
        label2id=label2index,
    )
    optimizer = AdamW(model.parameters(), lr=lr)
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    num_train_epochs = 3
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    
    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Evaluation
        model.eval()
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(**batch)

            predictions = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]

            predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
            labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

            predictions_gathered = accelerator.gather(predictions)
            labels_gathered = accelerator.gather(labels)

            true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
            metric.add_batch(predictions=true_predictions, references=true_labels)

        results = metric.compute()
        print(
            f"epoch {epoch}:",
            {
                key: results[f"overall_{key}"]
                for key in ["precision", "recall", "f1", "accuracy"]
            },
        )

In [33]:
learning_rates = [1e-4, 5e-5, 1e-5]
batch_sizes = [8, 16]

for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        print(f"learning rate:{learning_rate}")
        print(f"batch_size:{batch_size}")
        Training(lr = learning_rate, batch_size = batch_size)
        print()
        print()

learning rate:0.0001
batch_size:8


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1275 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


epoch 0: {'precision': 0.1068075117370892, 'recall': 0.5112359550561798, 'f1': 0.1766990291262136, 'accuracy': 0.8971986902967621}
epoch 1: {'precision': 0.3163145539906103, 'recall': 0.6814159292035398, 'f1': 0.432064128256513, 'accuracy': 0.916012681253573}
epoch 2: {'precision': 0.3227699530516432, 'recall': 0.5623721881390593, 'f1': 0.4101416853094705, 'accuracy': 0.9170001559170521}


learning rate:0.0001
batch_size:16


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/639 [00:00<?, ?it/s]

epoch 0: {'precision': 0.14847417840375587, 'recall': 0.6200980392156863, 'f1': 0.23958333333333334, 'accuracy': 0.901616340107063}
epoch 1: {'precision': 0.29988262910798125, 'recall': 0.53732912723449, 'f1': 0.3849340866290019, 'accuracy': 0.9152330959929318}
epoch 2: {'precision': 0.33098591549295775, 'recall': 0.6064516129032258, 'f1': 0.428246013667426, 'accuracy': 0.9199106075567798}


learning rate:5e-05
batch_size:8


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1275 [00:00<?, ?it/s]

epoch 0: {'precision': 0.25997652582159625, 'recall': 0.6257062146892656, 'f1': 0.3673300165837479, 'accuracy': 0.913050257263136}
epoch 1: {'precision': 0.3685446009389671, 'recall': 0.6603575184016824, 'f1': 0.4730696798493409, 'accuracy': 0.922405280390832}
epoch 2: {'precision': 0.3926056338028169, 'recall': 0.6482558139534884, 'f1': 0.48903508771929816, 'accuracy': 0.9248999532248844}


learning rate:5e-05
batch_size:16


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/639 [00:00<?, ?it/s]

epoch 0: {'precision': 0.23591549295774647, 'recall': 0.6860068259385665, 'f1': 0.35109170305676857, 'accuracy': 0.9101398056234083}
epoch 1: {'precision': 0.3397887323943662, 'recall': 0.6624713958810069, 'f1': 0.44918541505042664, 'accuracy': 0.9210539992723871}
epoch 2: {'precision': 0.3955399061032864, 'recall': 0.5980479148181012, 'f1': 0.4761568350406217, 'accuracy': 0.924120367964243}


learning rate:1e-05
batch_size:8


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1275 [00:00<?, ?it/s]

epoch 0: {'precision': 0.13791079812206572, 'recall': 0.5745721271393643, 'f1': 0.22243256034074774, 'accuracy': 0.9006808377942934}
epoch 1: {'precision': 0.272887323943662, 'recall': 0.6094364351245085, 'f1': 0.3769760843129307, 'accuracy': 0.9135699807702302}
epoch 2: {'precision': 0.3380281690140845, 'recall': 0.5702970297029702, 'f1': 0.42446573323507736, 'accuracy': 0.9186632711397537}


learning rate:1e-05
batch_size:16


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/639 [00:00<?, ?it/s]

epoch 0: {'precision': 0.004694835680751174, 'recall': 0.42105263157894735, 'f1': 0.009286128845037725, 'accuracy': 0.8876357777662284}
epoch 1: {'precision': 0.2282863849765258, 'recall': 0.5966257668711656, 'f1': 0.33022071307300516, 'accuracy': 0.9084766904007068}
epoch 2: {'precision': 0.2494131455399061, 'recall': 0.5577427821522309, 'f1': 0.3446877534468775, 'accuracy': 0.9099319162205707}




In [34]:
lr = 5e-5
batch_size = 8

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets["dev"], collate_fn=data_collator, batch_size=batch_size
)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=index2label,
    label2id=label2index,
)
optimizer = AdamW(model.parameters(), lr=lr)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1275 [00:00<?, ?it/s]

In [46]:
# Evaluation
true_predictions_list = []
true_labels_list = []

model.eval()

test_dataloader = DataLoader(
    tokenized_datasets["test"], collate_fn=data_collator, batch_size=batch_size
)
device = "cuda:0"
for batch in test_dataloader:
    with torch.no_grad():
        model.to(device)  
        batch = {key: value.to(device) for key, value in batch.items()}  

        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        labels = batch["labels"]

        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)

        true_predictions_list.append(true_predictions)
        true_labels_list.append(true_labels)


In [51]:
true_labels = [subsublist for sublist in true_labels_list for subsublist in sublist]
true_predictions = [subsublist for sublist in true_predictions for subsublist in sublist]

In [54]:
from seqeval.metrics import precision_score, recall_score, f1_score

precision = precision_score(true_labels, true_predictions)

recall = recall_score(true_labels, true_predictions)

f1 = f1_score(true_labels, true_predictions)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Precision: 0.22556971803785245
Recall: 0.5719882468168462
F1-score: 0.3235457063711911


In [36]:
true_predictions_list = [item for sublist in true_predictions_list for subsublist in sublist for item in subsublist]
true_labels_list = [item for sublist in true_labels_list for subsublist in sublist for item in subsublist]

In [37]:
from sklearn.metrics import classification_report

report = classification_report(true_labels_list, true_predictions_list)

print(report)

                 precision    recall  f1-score   support

  B-corporation       0.15      0.21      0.18       123
B-creative-work       0.15      0.62      0.24       120
        B-group       0.19      0.51      0.28        63
     B-location       0.39      0.73      0.51       214
       B-person       0.28      0.81      0.42       382
      B-product       0.14      0.45      0.22        40
  I-corporation       0.09      0.36      0.14        14
I-creative-work       0.13      0.72      0.22        40
        I-group       0.16      0.52      0.24        21
     I-location       0.26      0.63      0.36        38
       I-person       0.45      0.87      0.59        68
      I-product       0.17      0.45      0.25        88
              O       1.00      0.93      0.96     29331

       accuracy                           0.91     30542
      macro avg       0.27      0.60      0.36     30542
   weighted avg       0.97      0.91      0.94     30542



In [38]:
from sklearn.metrics import f1_score

micro_f1 = f1_score(true_labels_list, true_predictions_list, average='micro')
macro_f1 = f1_score(true_labels_list, true_predictions_list, average='macro')
print(f"micro_f1:{micro_f1}")
print(f"macro_f1:{macro_f1}")

micro_f1:0.9149040665313339
macro_f1:0.3552110131759258


In [39]:
entity_true_labels = [label[2:] if len(label) >1 else label for label in true_labels_list] 
entity_prediction_labels = [label[2:] if len(label) >1 else label for label in true_predictions_list] 

report = classification_report(entity_true_labels, entity_prediction_labels)
print(report)

               precision    recall  f1-score   support

            O       1.00      0.93      0.96     29331
  corporation       0.16      0.26      0.20       137
creative-work       0.16      0.70      0.26       160
        group       0.19      0.52      0.28        84
     location       0.39      0.77      0.52       252
       person       0.31      0.84      0.45       450
      product       0.17      0.48      0.25       128

     accuracy                           0.92     30542
    macro avg       0.34      0.64      0.42     30542
 weighted avg       0.97      0.92      0.94     30542



In [40]:
micro_f1 = f1_score(entity_true_labels, entity_prediction_labels, average='micro')
macro_f1 = f1_score(entity_true_labels, entity_prediction_labels, average='macro')
print(f"micro_f1:{micro_f1}")
print(f"macro_f1:{macro_f1}")

micro_f1:0.9162137384585162
macro_f1:0.41537515841288536
