# Assignment 2 - Data Mining

# Task 1
The task one is just to download the W-NUT_data.zip
- [X] Done

# Task 2
Task 2 is to convert IOB data to the correct data structure for token class.
- [X] Done

Needed libraries

In [None]:
# ! pip install transformers
# ! pip install datasets
# ! pip install seqeval
# ! pip install evaluate
# ! pip install torch
# ! pip install accelerate
# ! pip install transformers[torch]

# ! pip install optuna
# # or $ conda install -c conda-forge optuna



In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
import torch
import accelerate
import evaluate
from transformers import DataCollatorForTokenClassification
from datasets import DatasetDict
from transformers import AutoModelForTokenClassification
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.model_selection import ParameterGrid
import seqeval
from sklearn.metrics import classification_report
from tabulate import tabulate
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelBinarizer
from seqeval.metrics import classification_report as seqeval_classification_report, recall_score as seqeval_recall_score, precision_score as seqeval_precision_score, f1_score as seqeval_f1_score


# Importing training set

In [None]:
# Define the data format
data = {
    "tokens": [],  # List of tokens
    "labels": [],  # List of integer labels
}

label_to_int = {}  # Dictionary to map original labels to integers
label_names = []  # List to map integers to original labels

train_path = "W-NUT_data/wnut17train.conll"


# Read the .conll file and populate the data dictionary
with open(train_path, "r") as file:
    lines = file.readlines()
    tokens, labels = [], []
    for line in lines:
        parts = line.strip().split()
        if parts:
            token, label = parts
            tokens.append(token)

            # Check if the label is already in the mapping dictionary
            if label not in label_to_int:
                label_to_int[label] = len(label_to_int)
                label_names.append(label)

            labels.append(label_to_int[label])
        else:
            data["tokens"].append(tokens)
            data["labels"].append(labels)
            tokens, labels = [], []  # Reset for the next sentence

# Create a custom dataset
train = Dataset.from_dict(data)

In [None]:
print(train)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 3394
})


# Importing validation set

In [None]:
# Define the data format for the validation set
validation_data = {
    "tokens": [],  # List of tokens
    "labels": [],  # List of integer labels
}

validation_path = "W-NUT_data/emerging.dev.conll"  # Replace with the actual path to your validation set .conll file

# Read the .conll file for the validation set and populate the data dictionary
with open(validation_path, "r") as file:
    lines = file.readlines()
    tokens, labels = [], []
    for line in lines:
        parts = line.strip().split()
        if len(parts) == 2:  # Check if there are both token and label
            token, label = parts
            tokens.append(token)

            if label not in label_to_int:
                label_to_int[label] = len(label_names)
                label_names.append(label)

            labels.append(label_to_int[label])
        else:
            if parts:  # Handle lines with only one value
                # Decide how to handle lines with one value (e.g., set a default label)
                token = parts[0]
                label = "O"  # You can replace this with an appropriate default label
                tokens.append(token)

                if label not in label_to_int:
                    label_to_int[label] = len(label_names)
                    label_names.append(label)

                labels.append(label_to_int[label])
            else:
                validation_data["tokens"].append(tokens)
                validation_data["labels"].append(labels)
                tokens, labels = [], []  # Reset for the next sentence

# If there's data left to process
if tokens:
    validation_data["tokens"].append(tokens)
    validation_data["labels"].append(labels)

# Create a custom dataset for the validation set
validation = Dataset.from_dict(validation_data)


In [None]:
print(validation)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 1009
})


# Importing test set

In [None]:
# Define the data format for the test set
test_data = {
   "tokens": [],  # List of tokens
   "labels": [],  # List of integer labels
}

test_path = "W-NUT_data/emerging.test.annotated"  # Replace with the actual path to your test set .conll file

# Read the .conll file for the test set and populate the data dictionary
with open(test_path, "r") as file:
   lines = file.readlines()
   tokens, labels = [], []
   for line in lines:
       parts = line.strip().split()
       if len(parts) == 2:  # Check if there are both token and label
           token, label = parts
           tokens.append(token)

           if label not in label_to_int:
               label_to_int[label] = len(label_names)
               label_names.append(label)

           labels.append(label_to_int[label])
       else:
           if parts:  # Handle lines with only one value
               # Decide how to handle lines with one value (e.g., set a default label)
               token = parts[0]
               label = "O"  # You can replace this with an appropriate default label
               tokens.append(token)

               if label not in label_to_int:
                   label_to_int[label] = len(label_names)
                   label_names.append(label)

               labels.append(label_to_int[label])
           else:
               test_data["tokens"].append(tokens)
               test_data["labels"].append(labels)
               tokens, labels = [], []  # Reset for the next sentence

# If there's data left to process
if tokens:
   test_data["tokens"].append(tokens)
   test_data["labels"].append(labels)

# Create a custom dataset for the test set
test = Dataset.from_dict(test_data)


In [None]:
print(test)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 1287
})


In [None]:
print(label_to_int)

{'O': 0, 'B-location': 1, 'I-location': 2, 'B-group': 3, 'B-corporation': 4, 'B-person': 5, 'B-creative-work': 6, 'B-product': 7, 'I-person': 8, 'I-creative-work': 9, 'I-corporation': 10, 'I-group': 11, 'I-product': 12}


In [None]:
print(label_names)

['O', 'B-location', 'I-location', 'B-group', 'B-corporation', 'B-person', 'B-creative-work', 'B-product', 'I-person', 'I-creative-work', 'I-corporation', 'I-group', 'I-product']


# Realizing a dataset dictionary

In [None]:
raw_datasets = DatasetDict({
    'train': train,
    'validation': validation,
    'test': test
})

In [None]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 1287
    })
})


# Verify the initial alignment between tokens and labels.

In [65]:
words = raw_datasets["test"][0]["tokens"]
labels = raw_datasets["test"][0]["labels"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

& gt ; * The soldier was killed when another avalanche hit an army barracks in the northern area of Sonmarg    , said a military spokesman . 
O O  O O O   O       O   O      O    O       O         O   O  O    O        O  O   O        O    O  B-location O O    O O        O         O 


# Importing the needed tokenizer

In [None]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 '@',
 'p',
 '##aul',
 '##walk',
 'It',
 "'",
 's',
 'the',
 'view',
 'from',
 'where',
 'I',
 "'",
 'm',
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'Empire',
 'State',
 'Building',
 '=',
 'E',
 '##SB',
 '.',
 'Pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.',
 '[SEP]']

# Defining the function to align labels and tokens properly

In [62]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [63]:
labels = raw_datasets["train"][0]["labels"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, -100]


Defining the function to apply the previous function on all the dataset

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [None]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1287
    })
})


Data collation

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    1,    2,    2,    0,
            1,    2,    0,    0,    0,    0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    3,    4,    4,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100]])

In [None]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


# Task 3
Evaluation and metrics

In [None]:
metric = evaluate.load("seqeval")

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Task 4

In [None]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}


In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.num_labels

13

In [None]:
notebook_login() #reading token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
notebook_login() # writting token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.474795,0.513619,0.341969,0.410575,0.931809
2,0.029100,0.577936,0.585732,0.303109,0.399488,0.931784
3,0.013500,0.570911,0.558621,0.314767,0.402651,0.931985


TrainOutput(global_step=1275, training_loss=0.0191681235444312, metrics={'train_runtime': 214.2748, 'train_samples_per_second': 47.518, 'train_steps_per_second': 5.95, 'total_flos': 289506808910040.0, 'train_loss': 0.0191681235444312, 'epoch': 3.0})

In [None]:
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

# Create Trainer with the current model and arguments
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # Use the test set for evaluation during training
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train the model with the current hyperparameters
trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print and/or store the results for this hyperparameter combination
print(f"Test Results: {test_results}")


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.880355,0.52381,0.214345,0.304207,0.928543
2,0.001600,0.986134,0.557962,0.181592,0.274007,0.928141
3,0.001500,0.972839,0.564838,0.187811,0.281892,0.928191


Test Results: {'eval_loss': 0.9728385210037231, 'eval_precision': 0.5648379052369077, 'eval_recall': 0.18781094527363185, 'eval_f1': 0.28189172370877413, 'eval_accuracy': 0.9281909547738694, 'eval_runtime': 6.2486, 'eval_samples_per_second': 205.966, 'eval_steps_per_second': 25.766, 'epoch': 3.0}


# Task 5 Hyperparameters tuning
Tuning learning rate and batch size with gradient accumulation because of GPU memory problems.

In [None]:
args_combination_1 = TrainingArguments(
    "bert-finetuned-ner-combination-1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args_combination_1,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0204,0.889384,0.680328,0.31519,0.430796,0.915868
2,0.0213,0.805429,0.668213,0.364557,0.471744,0.917046
3,0.009,0.720928,0.697917,0.381646,0.493453,0.921327


Test Results:
{'eval_loss': 0.7523127198219299, 'eval_precision': 0.5814249363867684, 'eval_recall': 0.189469320066335, 'eval_f1': 0.2858036272670419, 'eval_accuracy': 0.929321608040201, 'eval_runtime': 6.2163, 'eval_samples_per_second': 207.037, 'eval_steps_per_second': 25.9, 'epoch': 3.0}


In [None]:
args_combination_2 = TrainingArguments(
    "bert-finetuned-ner-combination-2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.847894,0.670058,0.291772,0.406526,0.913085
2,0.013500,0.622215,0.675138,0.386709,0.491751,0.920739
3,0.013500,0.652038,0.658026,0.417722,0.511034,0.923307


Test Results:
{'eval_loss': 0.7375848889350891, 'eval_precision': 0.5557986870897156, 'eval_recall': 0.21061359867330018, 'eval_f1': 0.30547203848466625, 'eval_accuracy': 0.9295477386934673, 'eval_runtime': 6.8643, 'eval_samples_per_second': 187.492, 'eval_steps_per_second': 23.455, 'epoch': 3.0}


In [None]:
args_combination_3 = TrainingArguments(
    "bert-finetuned-ner-combination-3",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_3,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0079,0.912543,0.693935,0.311392,0.429882,0.91635
2,0.0057,0.823483,0.71123,0.336709,0.457045,0.917688
3,0.0059,0.730988,0.681406,0.38038,0.488221,0.920953


Test Results:
{'eval_loss': 0.8010109066963196, 'eval_precision': 0.5705596107055961, 'eval_recall': 0.19444444444444445, 'eval_f1': 0.29004329004329005, 'eval_accuracy': 0.9292713567839196, 'eval_runtime': 6.8971, 'eval_samples_per_second': 186.599, 'eval_steps_per_second': 23.343, 'epoch': 3.0}


In [None]:
args_combination_4 = TrainingArguments(
    "bert-finetuned-ner-combination-4",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.859605,0.712379,0.371519,0.488353,0.920257
2,0.003900,0.706297,0.664888,0.394304,0.495034,0.921916
3,0.003900,0.662699,0.643917,0.412025,0.502509,0.924699


Test Results:
{'eval_loss': 0.7843152284622192, 'eval_precision': 0.5411764705882353, 'eval_recall': 0.20978441127694858, 'eval_f1': 0.3023603226770242, 'eval_accuracy': 0.9294221105527638, 'eval_runtime': 6.4846, 'eval_samples_per_second': 198.469, 'eval_steps_per_second': 24.828, 'epoch': 3.0}


In [None]:
args_combination_5 = TrainingArguments(
    "bert-finetuned-ner-combination-5",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_5,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0024,0.857778,0.661905,0.351899,0.459504,0.919026
2,0.0016,0.860201,0.651672,0.357595,0.46179,0.919347
3,0.0016,0.845222,0.657778,0.374684,0.477419,0.920685


Test Results:
{'eval_loss': 0.8803117275238037, 'eval_precision': 0.5288888888888889, 'eval_recall': 0.19734660033167495, 'eval_f1': 0.28743961352657005, 'eval_accuracy': 0.9284673366834171, 'eval_runtime': 6.1025, 'eval_samples_per_second': 210.896, 'eval_steps_per_second': 26.382, 'epoch': 3.0}


In [None]:
args_combination_6 = TrainingArguments(
    "bert-finetuned-ner-combination-6",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.862558,0.685979,0.374684,0.48465,0.921274
2,0.001500,0.970584,0.675776,0.344304,0.456184,0.91817
3,0.001500,0.945845,0.691624,0.344937,0.460304,0.918384


Test Results:
{'eval_loss': 0.9369379281997681, 'eval_precision': 0.5826558265582655, 'eval_recall': 0.17827529021558872, 'eval_f1': 0.273015873015873, 'eval_accuracy': 0.9279396984924623, 'eval_runtime': 6.1735, 'eval_samples_per_second': 208.471, 'eval_steps_per_second': 26.079, 'epoch': 3.0}


In [None]:
args_combination_7 = TrainingArguments(
    "bert-finetuned-ner-combination-7",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.934225,0.675386,0.359494,0.469228,0.920471
2,0.001100,0.971665,0.698473,0.347468,0.464074,0.919294
3,0.001100,1.008281,0.698241,0.326582,0.445019,0.917367


Test Results:
{'eval_loss': 0.96390700340271, 'eval_precision': 0.5740479548660085, 'eval_recall': 0.16873963515754561, 'eval_f1': 0.26081384171739824, 'eval_accuracy': 0.9278643216080402, 'eval_runtime': 6.9263, 'eval_samples_per_second': 185.815, 'eval_steps_per_second': 23.245, 'epoch': 3.0}


In [None]:
args_combination_8 = TrainingArguments(
    "bert-finetuned-ner-combination-8",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.187177,0.721154,0.332278,0.454939,0.91742
2,0.000300,1.025149,0.676259,0.356962,0.467274,0.919829
3,0.000300,1.020509,0.698765,0.358228,0.47364,0.919615


Test Results:
{'eval_loss': 1.0025056600570679, 'eval_precision': 0.5787401574803149, 'eval_recall': 0.1828358208955224, 'eval_f1': 0.277882797731569, 'eval_accuracy': 0.9282412060301507, 'eval_runtime': 6.4262, 'eval_samples_per_second': 200.275, 'eval_steps_per_second': 25.054, 'epoch': 3.0}


# Task 6

In [117]:
def compute_metrics_extended(eval_preds, label_names):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]

    # Flatten the lists to make them suitable for classification_report
    true_labels_flat = [label for labels in true_labels for label in labels]
    true_predictions_flat = [label for labels in true_predictions for label in labels]

    # Create a LabelBinarizer for one-hot encoding labels
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(label_names)

    # One-hot encode the labels
    true_labels_bin = label_binarizer.transform(true_labels_flat)
    true_predictions_bin = label_binarizer.transform(true_predictions_flat)

    # Calculate classification report for Precision, Recall, F1-scores
    report = classification_report(true_labels_bin, true_predictions_bin, target_names=label_names)

    # Calculate macro and micro average F1-scores if there are samples for each class
    macro_f1 = f1_score(true_labels_bin, true_predictions_bin, average='macro', zero_division=0)
    micro_f1 = f1_score(true_labels_bin, true_predictions_bin, average='micro', zero_division=0)

    # Compute metrics using classification_report from seqeval
    seqeval_report = seqeval_classification_report(true_labels, true_predictions, scheme='IOB2', output_dict=True)

    # Micro and macro metrics for seqeval
    seqeval_micro_f1 = seqeval_f1_score(true_labels, true_predictions)
    seqeval_macro_precision = seqeval_precision_score(true_labels, true_predictions)
    seqeval_macro_recall = seqeval_recall_score(true_labels, true_predictions)

    return {
        "classification_report": report,
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "seqeval_classification_report": seqeval_report,
        "seqeval_micro_f1": seqeval_micro_f1,
        "seqeval_macro_precision": seqeval_macro_precision,
        "seqeval_macro_recall": seqeval_macro_recall
    }


In [122]:
# Define label names
label_names = [
    "O",
    "B-location", "I-location",
    "B-group", "I-group",
    "B-corporation", "I-corporation",
    "B-person", "I-person",
    "B-creative-work", "I-creative-work",
    "B-product", "I-product"
]

# Define TrainingArguments
args_combination_1 = TrainingArguments(
    "bert-finetuned-ner-combination-1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=args_combination_1,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics_extended(eval_preds, label_names),
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)


Epoch,Training Loss,Validation Loss,Classification Report,Macro F1,Micro F1,Seqeval Classification Report,Seqeval Micro F1,Seqeval Macro Precision,Seqeval Macro Recall
1,0.014,0.658469,precision recall f1-score support  O 0.78 0.29 0.42 429  B-location 0.62 0.14 0.23 218  I-location 0.34 0.13 0.19 165  B-group 0.66 0.38 0.48 150  I-group 0.25 0.03 0.06 127  B-corporation 0.33 0.13 0.19 70  I-corporation 0.75 0.06 0.11 959  B-person 0.19 0.09 0.12 87  I-person 0.19 0.12 0.14 330 B-creative-work 0.50 0.24 0.32 237 I-creative-work 0.72 0.16 0.27 452  B-product 0.24 0.08 0.12 186  I-product 0.93 1.00 0.96 36390  micro avg 0.92 0.92 0.92 39800  macro avg 0.50 0.22 0.28 39800  weighted avg 0.90 0.92 0.90 39800  samples avg 0.92 0.92 0.92 39800,0.27841,0.922337,"{'corporation': {'precision': 0.7202072538860104, 'recall': 0.2434325744308231, 'f1-score': 0.3638743455497382, 'support': 571}, 'creative-work': {'precision': 0.44871794871794873, 'recall': 0.15086206896551724, 'f1-score': 0.2258064516129032, 'support': 232}, 'group': {'precision': 0.22666666666666666, 'recall': 0.1471861471861472, 'f1-score': 0.1784776902887139, 'support': 231}, 'location': {'precision': 0.43333333333333335, 'recall': 0.3466666666666667, 'f1-score': 0.3851851851851852, 'support': 150}, 'person': {'precision': 0.5735294117647058, 'recall': 0.16738197424892703, 'f1-score': 0.2591362126245847, 'support': 233}, 'product': {'precision': 0.20408163265306123, 'recall': 0.07874015748031496, 'f1-score': 0.11363636363636365, 'support': 127}, 'micro avg': {'precision': 0.4696048632218845, 'recall': 0.20012953367875647, 'f1-score': 0.28065395095367845, 'support': 1544}, 'macro avg': {'precision': 0.4344227078369543, 'recall': 0.18904493149639937, 'f1-score': 0.2543527081495815, 'support': 1544}, 'weighted avg': {'precision': 0.5131163383157975, 'recall': 0.20012953367875647, 'f1-score': 0.2810725570213155, 'support': 1544}}",0.280654,0.469605,0.20013
2,0.0256,0.756469,precision recall f1-score support  O 0.69 0.38 0.48 429  B-location 0.55 0.10 0.16 218  I-location 0.37 0.10 0.16 165  B-group 0.64 0.33 0.43 150  I-group 0.22 0.05 0.08 127  B-corporation 0.41 0.13 0.20 70  I-corporation 0.51 0.10 0.17 959  B-person 0.36 0.06 0.10 87  I-person 0.18 0.06 0.09 330 B-creative-work 0.56 0.14 0.23 237 I-creative-work 0.59 0.22 0.32 452  B-product 0.28 0.04 0.07 186  I-product 0.93 1.00 0.96 36390  micro avg 0.92 0.92 0.92 39800  macro avg 0.48 0.21 0.27 39800  weighted avg 0.90 0.92 0.90 39800  samples avg 0.92 0.92 0.92 39800,0.266464,0.923166,"{'corporation': {'precision': 0.597864768683274, 'recall': 0.29422066549912435, 'f1-score': 0.3943661971830986, 'support': 571}, 'creative-work': {'precision': 0.4897959183673469, 'recall': 0.10344827586206896, 'f1-score': 0.1708185053380783, 'support': 232}, 'group': {'precision': 0.25252525252525254, 'recall': 0.10822510822510822, 'f1-score': 0.15151515151515152, 'support': 231}, 'location': {'precision': 0.5, 'recall': 0.29333333333333333, 'f1-score': 0.3697478991596639, 'support': 150}, 'person': {'precision': 0.43636363636363634, 'recall': 0.20600858369098712, 'f1-score': 0.27988338192419826, 'support': 233}, 'product': {'precision': 0.29411764705882354, 'recall': 0.07874015748031496, 'f1-score': 0.12422360248447203, 'support': 127}, 'micro avg': {'precision': 0.4826021180030257, 'recall': 0.20660621761658032, 'f1-score': 0.2893424036281179, 'support': 1544}, 'macro avg': {'precision': 0.4284445371663889, 'recall': 0.18066268734848953, 'f1-score': 0.24842578960077713, 'support': 1544}, 'weighted avg': {'precision': 0.4710961384468298, 'recall': 0.20660621761658032, 'f1-score': 0.282554664642357, 'support': 1544}}",0.289342,0.482602,0.206606
3,0.0141,0.750982,precision recall f1-score support  O 0.69 0.38 0.49 429  B-location 0.47 0.11 0.18 218  I-location 0.33 0.10 0.16 165  B-group 0.59 0.39 0.47 150  I-group 0.23 0.07 0.11 127  B-corporation 0.23 0.13 0.17 70  I-corporation 0.50 0.12 0.19 959  B-person 0.14 0.06 0.08 87  I-person 0.18 0.08 0.11 330 B-creative-work 0.45 0.19 0.27 237 I-creative-work 0.50 0.26 0.34 452  B-product 0.23 0.09 0.13 186  I-product 0.94 0.99 0.97 36390  micro avg 0.92 0.92 0.92 39800  macro avg 0.42 0.23 0.28 39800  weighted avg 0.90 0.92 0.90 39800  samples avg 0.92 0.92 0.92 39800,0.281129,0.922789,"{'corporation': {'precision': 0.5544554455445545, 'recall': 0.29422066549912435, 'f1-score': 0.3844393592677346, 'support': 571}, 'creative-work': {'precision': 0.37662337662337664, 'recall': 0.125, 'f1-score': 0.18770226537216828, 'support': 232}, 'group': {'precision': 0.20930232558139536, 'recall': 0.11688311688311688, 'f1-score': 0.15, 'support': 231}, 'location': {'precision': 0.4017094017094017, 'recall': 0.31333333333333335, 'f1-score': 0.35205992509363293, 'support': 150}, 'person': {'precision': 0.36129032258064514, 'recall': 0.24034334763948498, 'f1-score': 0.28865979381443296, 'support': 233}, 'product': {'precision': 0.1935483870967742, 'recall': 0.09448818897637795, 'f1-score': 0.12698412698412698, 'support': 127}, 'micro avg': {'precision': 0.40213523131672596, 'recall': 0.21955958549222798, 'f1-score': 0.28403854210305823, 'support': 1544}, 'macro avg': {'precision': 0.3494882098560246, 'recall': 0.19737810872190628, 'f1-score': 0.24830757842201598, 'support': 1544}, 'weighted avg': {'precision': 0.40242047964433764, 'recall': 0.21955958549222798, 'f1-score': 0.2810268811904219, 'support': 1544}}",0.284039,0.402135,0.21956


Trainer is attempting to log a value of "                 precision    recall  f1-score   support

              O       0.78      0.29      0.42       429
     B-location       0.62      0.14      0.23       218
     I-location       0.34      0.13      0.19       165
        B-group       0.66      0.38      0.48       150
        I-group       0.25      0.03      0.06       127
  B-corporation       0.33      0.13      0.19        70
  I-corporation       0.75      0.06      0.11       959
       B-person       0.19      0.09      0.12        87
       I-person       0.19      0.12      0.14       330
B-creative-work       0.50      0.24      0.32       237
I-creative-work       0.72      0.16      0.27       452
      B-product       0.24      0.08      0.12       186
      I-product       0.93      1.00      0.96     36390

      micro avg       0.92      0.92      0.92     39800
      macro avg       0.50      0.22      0.28     39800
   weighted avg       0.90      0.92      0.9

Trainer is attempting to log a value of "                 precision    recall  f1-score   support

              O       0.69      0.38      0.49       429
     B-location       0.47      0.11      0.18       218
     I-location       0.33      0.10      0.16       165
        B-group       0.59      0.39      0.47       150
        I-group       0.23      0.07      0.11       127
  B-corporation       0.23      0.13      0.17        70
  I-corporation       0.50      0.12      0.19       959
       B-person       0.14      0.06      0.08        87
       I-person       0.18      0.08      0.11       330
B-creative-work       0.45      0.19      0.27       237
I-creative-work       0.50      0.26      0.34       452
      B-product       0.23      0.09      0.13       186
      I-product       0.94      0.99      0.97     36390

      micro avg       0.92      0.92      0.92     39800
      macro avg       0.42      0.23      0.28     39800
   weighted avg       0.90      0.92      0.9

Test Results:
{'eval_loss': 0.7509824633598328, 'eval_classification_report': '                 precision    recall  f1-score   support\n\n              O       0.69      0.38      0.49       429\n     B-location       0.47      0.11      0.18       218\n     I-location       0.33      0.10      0.16       165\n        B-group       0.59      0.39      0.47       150\n        I-group       0.23      0.07      0.11       127\n  B-corporation       0.23      0.13      0.17        70\n  I-corporation       0.50      0.12      0.19       959\n       B-person       0.14      0.06      0.08        87\n       I-person       0.18      0.08      0.11       330\nB-creative-work       0.45      0.19      0.27       237\nI-creative-work       0.50      0.26      0.34       452\n      B-product       0.23      0.09      0.13       186\n      I-product       0.94      0.99      0.97     36390\n\n      micro avg       0.92      0.92      0.92     39800\n      macro avg       0.42      0.23      0.28 

In [123]:
# Define label names
label_names = [
    "O",
    "B-location", "I-location",
    "B-group", "I-group",
    "B-corporation", "I-corporation",
    "B-person", "I-person",
    "B-creative-work", "I-creative-work",
    "B-product", "I-product"
]

# Define TrainingArguments
args_combination_2 = TrainingArguments(
    "bert-finetuned-ner-combination-2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=args_combination_2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics_extended(eval_preds, label_names),
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)


Epoch,Training Loss,Validation Loss,Classification Report,Macro F1,Micro F1,Seqeval Classification Report,Seqeval Micro F1,Seqeval Macro Precision,Seqeval Macro Recall
1,No log,0.742025,precision recall f1-score support  O 0.67 0.35 0.46 429  B-location 0.57 0.09 0.16 218  I-location 0.48 0.13 0.20 165  B-group 0.52 0.43 0.47 150  I-group 0.17 0.06 0.09 127  B-corporation 0.20 0.29 0.24 70  I-corporation 0.46 0.08 0.14 959  B-person 0.19 0.03 0.06 87  I-person 0.18 0.06 0.09 330 B-creative-work 0.42 0.23 0.30 237 I-creative-work 0.49 0.25 0.33 452  B-product 0.23 0.28 0.25 186  I-product 0.94 0.99 0.96 36390  micro avg 0.92 0.92 0.92 39800  macro avg 0.42 0.25 0.29 39800  weighted avg 0.90 0.92 0.90 39800  samples avg 0.92 0.92 0.92 39800,0.288192,0.920477,"{'corporation': {'precision': 0.5555555555555556, 'recall': 0.2714535901926445, 'f1-score': 0.36470588235294116, 'support': 571}, 'creative-work': {'precision': 0.4897959183673469, 'recall': 0.10344827586206896, 'f1-score': 0.1708185053380783, 'support': 232}, 'group': {'precision': 0.3177570093457944, 'recall': 0.1471861471861472, 'f1-score': 0.20118343195266272, 'support': 231}, 'location': {'precision': 0.381294964028777, 'recall': 0.35333333333333333, 'f1-score': 0.36678200692041524, 'support': 150}, 'person': {'precision': 0.2981366459627329, 'recall': 0.20600858369098712, 'f1-score': 0.24365482233502536, 'support': 233}, 'product': {'precision': 0.13736263736263737, 'recall': 0.1968503937007874, 'f1-score': 0.16181229773462782, 'support': 127}, 'micro avg': {'precision': 0.3696837513631407, 'recall': 0.21955958549222798, 'f1-score': 0.27549776513612356, 'support': 1544}, 'macro avg': {'precision': 0.36331712177047404, 'recall': 0.21304672066099475, 'f1-score': 0.25149282443895843, 'support': 1544}, 'weighted avg': {'precision': 0.4199234990291538, 'recall': 0.21955958549222798, 'f1-score': 0.27635321327555024, 'support': 1544}}",0.275498,0.369684,0.21956
2,0.013300,0.867954,precision recall f1-score support  O 0.80 0.31 0.45 429  B-location 0.57 0.11 0.19 218  I-location 0.36 0.10 0.16 165  B-group 0.53 0.43 0.47 150  I-group 0.15 0.06 0.08 127  B-corporation 0.37 0.10 0.16 70  I-corporation 0.66 0.06 0.12 959  B-person 0.31 0.06 0.10 87  I-person 0.21 0.09 0.13 330 B-creative-work 0.42 0.20 0.27 237 I-creative-work 0.55 0.22 0.31 452  B-product 0.24 0.08 0.11 186  I-product 0.93 0.99 0.96 36390  micro avg 0.92 0.92 0.92 39800  macro avg 0.47 0.22 0.27 39800  weighted avg 0.90 0.92 0.90 39800  samples avg 0.92 0.92 0.92 39800,0.270412,0.922362,"{'corporation': {'precision': 0.7101449275362319, 'recall': 0.2574430823117338, 'f1-score': 0.37789203084832906, 'support': 571}, 'creative-work': {'precision': 0.5192307692307693, 'recall': 0.11637931034482758, 'f1-score': 0.1901408450704225, 'support': 232}, 'group': {'precision': 0.2545454545454545, 'recall': 0.12121212121212122, 'f1-score': 0.16422287390029325, 'support': 231}, 'location': {'precision': 0.3953488372093023, 'recall': 0.34, 'f1-score': 0.3655913978494624, 'support': 150}, 'person': {'precision': 0.3813559322033898, 'recall': 0.19313304721030042, 'f1-score': 0.2564102564102564, 'support': 233}, 'product': {'precision': 0.20930232558139536, 'recall': 0.07086614173228346, 'f1-score': 0.10588235294117646, 'support': 127}, 'micro avg': {'precision': 0.4658573596358118, 'recall': 0.19883419689119172, 'f1-score': 0.27871084884248754, 'support': 1544}, 'macro avg': {'precision': 0.4116547077177572, 'recall': 0.18317228380187775, 'f1-score': 0.24335662616999001, 'support': 1544}, 'weighted avg': {'precision': 0.4919002235870138, 'recall': 0.19883419689119172, 'f1-score': 0.27581209053512956, 'support': 1544}}",0.278711,0.465857,0.198834
3,0.013300,0.808096,precision recall f1-score support  O 0.76 0.34 0.48 429  B-location 0.57 0.11 0.18 218  I-location 0.32 0.15 0.21 165  B-group 0.53 0.41 0.46 150  I-group 0.23 0.05 0.08 127  B-corporation 0.24 0.11 0.16 70  I-corporation 0.57 0.09 0.16 959  B-person 0.31 0.06 0.10 87  I-person 0.20 0.10 0.14 330 B-creative-work 0.44 0.22 0.30 237 I-creative-work 0.64 0.22 0.33 452  B-product 0.28 0.07 0.11 186  I-product 0.94 0.99 0.96 36390  micro avg 0.92 0.92 0.92 39800  macro avg 0.47 0.23 0.28 39800  weighted avg 0.90 0.92 0.90 39800  samples avg 0.92 0.92 0.92 39800,0.281525,0.922965,"{'corporation': {'precision': 0.6456692913385826, 'recall': 0.287215411558669, 'f1-score': 0.3975757575757576, 'support': 571}, 'creative-work': {'precision': 0.5294117647058824, 'recall': 0.11637931034482758, 'f1-score': 0.1908127208480565, 'support': 232}, 'group': {'precision': 0.25874125874125875, 'recall': 0.16017316017316016, 'f1-score': 0.1978609625668449, 'support': 231}, 'location': {'precision': 0.38636363636363635, 'recall': 0.34, 'f1-score': 0.36170212765957455, 'support': 150}, 'person': {'precision': 0.46938775510204084, 'recall': 0.19742489270386265, 'f1-score': 0.27794561933534745, 'support': 233}, 'product': {'precision': 0.2553191489361702, 'recall': 0.09448818897637795, 'f1-score': 0.13793103448275862, 'support': 127}, 'micro avg': {'precision': 0.46482758620689657, 'recall': 0.2182642487046632, 'f1-score': 0.2970471573380344, 'support': 1544}, 'macro avg': {'precision': 0.42414880919792847, 'recall': 0.1992801606261496, 'f1-score': 0.2606380370780566, 'support': 1544}, 'weighted avg': {'precision': 0.48641020067586843, 'recall': 0.2182642487046632, 'f1-score': 0.293733213082144, 'support': 1544}}",0.297047,0.464828,0.218264


Trainer is attempting to log a value of "                 precision    recall  f1-score   support

              O       0.67      0.35      0.46       429
     B-location       0.57      0.09      0.16       218
     I-location       0.48      0.13      0.20       165
        B-group       0.52      0.43      0.47       150
        I-group       0.17      0.06      0.09       127
  B-corporation       0.20      0.29      0.24        70
  I-corporation       0.46      0.08      0.14       959
       B-person       0.19      0.03      0.06        87
       I-person       0.18      0.06      0.09       330
B-creative-work       0.42      0.23      0.30       237
I-creative-work       0.49      0.25      0.33       452
      B-product       0.23      0.28      0.25       186
      I-product       0.94      0.99      0.96     36390

      micro avg       0.92      0.92      0.92     39800
      macro avg       0.42      0.25      0.29     39800
   weighted avg       0.90      0.92      0.9

Trainer is attempting to log a value of "                 precision    recall  f1-score   support

              O       0.76      0.34      0.48       429
     B-location       0.57      0.11      0.18       218
     I-location       0.32      0.15      0.21       165
        B-group       0.53      0.41      0.46       150
        I-group       0.23      0.05      0.08       127
  B-corporation       0.24      0.11      0.16        70
  I-corporation       0.57      0.09      0.16       959
       B-person       0.31      0.06      0.10        87
       I-person       0.20      0.10      0.14       330
B-creative-work       0.44      0.22      0.30       237
I-creative-work       0.64      0.22      0.33       452
      B-product       0.28      0.07      0.11       186
      I-product       0.94      0.99      0.96     36390

      micro avg       0.92      0.92      0.92     39800
      macro avg       0.47      0.23      0.28     39800
   weighted avg       0.90      0.92      0.9

Test Results:
{'eval_loss': 0.8080958127975464, 'eval_classification_report': '                 precision    recall  f1-score   support\n\n              O       0.76      0.34      0.48       429\n     B-location       0.57      0.11      0.18       218\n     I-location       0.32      0.15      0.21       165\n        B-group       0.53      0.41      0.46       150\n        I-group       0.23      0.05      0.08       127\n  B-corporation       0.24      0.11      0.16        70\n  I-corporation       0.57      0.09      0.16       959\n       B-person       0.31      0.06      0.10        87\n       I-person       0.20      0.10      0.14       330\nB-creative-work       0.44      0.22      0.30       237\nI-creative-work       0.64      0.22      0.33       452\n      B-product       0.28      0.07      0.11       186\n      I-product       0.94      0.99      0.96     36390\n\n      micro avg       0.92      0.92      0.92     39800\n      macro avg       0.47      0.23      0.28 