# Assignment 2 - Data Mining

# Task 1
The task one is just to download the W-NUT_data.zip
- [X] Done

# Task 2
Task 2 is to convert IOB data to the correct data structure for token class.
- [X] Done

Needed libraries

In [36]:
# ! pip install transformers
# ! pip install datasets
# ! pip install seqeval
# ! pip install evaluate
# ! pip install torch
# ! pip install accelerate
# ! pip install transformers[torch]

# ! pip install optuna
# # or $ conda install -c conda-forge optuna



In [37]:
from datasets import Dataset
from transformers import AutoTokenizer
import torch
import accelerate
import evaluate
from transformers import DataCollatorForTokenClassification
from datasets import DatasetDict
from transformers import AutoModelForTokenClassification
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.model_selection import ParameterGrid
import seqeval
from sklearn.metrics import classification_report



# Importing training set

In [38]:
# Define the data format
data = {
    "tokens": [],  # List of tokens
    "labels": [],  # List of integer labels
}

label_to_int = {}  # Dictionary to map original labels to integers
label_names = []  # List to map integers to original labels

train_path = "W-NUT_data/wnut17train.conll"


# Read the .conll file and populate the data dictionary
with open(train_path, "r") as file:
    lines = file.readlines()
    tokens, labels = [], []
    for line in lines:
        parts = line.strip().split()
        if parts:
            token, label = parts
            tokens.append(token)

            # Check if the label is already in the mapping dictionary
            if label not in label_to_int:
                label_to_int[label] = len(label_to_int)
                label_names.append(label)

            labels.append(label_to_int[label])
        else:
            data["tokens"].append(tokens)
            data["labels"].append(labels)
            tokens, labels = [], []  # Reset for the next sentence

# Create a custom dataset
train = Dataset.from_dict(data)

In [39]:
print(train)

# Importing validation set

In [40]:
# Define the data format for the validation set
validation_data = {
    "tokens": [],  # List of tokens
    "labels": [],  # List of integer labels
}

validation_path = "W-NUT_data/emerging.dev.conll"  # Replace with the actual path to your validation set .conll file

# Read the .conll file for the validation set and populate the data dictionary
with open(validation_path, "r") as file:
    lines = file.readlines()
    tokens, labels = [], []
    for line in lines:
        parts = line.strip().split()
        if len(parts) == 2:  # Check if there are both token and label
            token, label = parts
            tokens.append(token)

            if label not in label_to_int:
                label_to_int[label] = len(label_names)
                label_names.append(label)

            labels.append(label_to_int[label])
        else:
            if parts:  # Handle lines with only one value
                # Decide how to handle lines with one value (e.g., set a default label)
                token = parts[0]
                label = "O"  # You can replace this with an appropriate default label
                tokens.append(token)

                if label not in label_to_int:
                    label_to_int[label] = len(label_names)
                    label_names.append(label)

                labels.append(label_to_int[label])
            else:
                validation_data["tokens"].append(tokens)
                validation_data["labels"].append(labels)
                tokens, labels = [], []  # Reset for the next sentence

# If there's data left to process
if tokens:
    validation_data["tokens"].append(tokens)
    validation_data["labels"].append(labels)

# Create a custom dataset for the validation set
validation = Dataset.from_dict(validation_data)


In [41]:
print(validation)

# Importing test set

In [42]:
# Define the data format for the test set
test_data = {
   "tokens": [],  # List of tokens
   "labels": [],  # List of integer labels
}

test_path = "W-NUT_data/emerging.test.annotated"  # Replace with the actual path to your test set .conll file

# Read the .conll file for the test set and populate the data dictionary
with open(test_path, "r") as file:
   lines = file.readlines()
   tokens, labels = [], []
   for line in lines:
       parts = line.strip().split()
       if len(parts) == 2:  # Check if there are both token and label
           token, label = parts
           tokens.append(token)

           if label not in label_to_int:
               label_to_int[label] = len(label_names)
               label_names.append(label)

           labels.append(label_to_int[label])
       else:
           if parts:  # Handle lines with only one value
               # Decide how to handle lines with one value (e.g., set a default label)
               token = parts[0]
               label = "O"  # You can replace this with an appropriate default label
               tokens.append(token)

               if label not in label_to_int:
                   label_to_int[label] = len(label_names)
                   label_names.append(label)

               labels.append(label_to_int[label])
           else:
               test_data["tokens"].append(tokens)
               test_data["labels"].append(labels)
               tokens, labels = [], []  # Reset for the next sentence

# If there's data left to process
if tokens:
   test_data["tokens"].append(tokens)
   test_data["labels"].append(labels)

# Create a custom dataset for the test set
test = Dataset.from_dict(test_data)


In [43]:
print(test)

In [44]:
print(label_to_int)

In [45]:
print(label_names)

# Realizing a dataset dictionary

In [46]:
raw_datasets = DatasetDict({
    'train': train,
    'validation': validation,
    'test': test
})

In [47]:
print(raw_datasets)

# Verify the initial alignment between tokens and labels.

In [48]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["labels"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

# Importing the needed tokenizer

In [49]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [50]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 '@',
 'p',
 '##aul',
 '##walk',
 'It',
 "'",
 's',
 'the',
 'view',
 'from',
 'where',
 'I',
 "'",
 'm',
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'Empire',
 'State',
 'Building',
 '=',
 'E',
 '##SB',
 '.',
 'Pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.',
 '[SEP]']

# Defining the function to align labels and tokens properly

In [51]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [52]:
labels = raw_datasets["train"][0]["labels"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

Defining the function to apply the previous function on all the dataset

In [53]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [54]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [55]:
print(tokenized_datasets)

Data collation

In [56]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [57]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    1,    2,    2,    0,
            1,    2,    0,    0,    0,    0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    3,    4,    4,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100]])

In [58]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

# Task 3
Evaluation and metrics

In [59]:
metric = evaluate.load("seqeval")

In [60]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Task 4

In [61]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}


In [62]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
model.config.num_labels

13

In [64]:
notebook_login() #reading token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [65]:
notebook_login() # writting token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [88]:
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [89]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()


  0%|          | 0/1275 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 3.29 GB, other allocations: 14.77 GB, max allowed: 18.13 GB). Tried to allocate 84.95 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

# Create Trainer with the current model and arguments
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # Use the test set for evaluation during training
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train the model with the current hyperparameters
trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print and/or store the results for this hyperparameter combination
print(f"Test Results: {test_results}")


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.880355,0.52381,0.214345,0.304207,0.928543
2,0.001600,0.986134,0.557962,0.181592,0.274007,0.928141
3,0.001500,0.972839,0.564838,0.187811,0.281892,0.928191


Test Results: {'eval_loss': 0.9728385210037231, 'eval_precision': 0.5648379052369077, 'eval_recall': 0.18781094527363185, 'eval_f1': 0.28189172370877413, 'eval_accuracy': 0.9281909547738694, 'eval_runtime': 6.2486, 'eval_samples_per_second': 205.966, 'eval_steps_per_second': 25.766, 'epoch': 3.0}


# Task 5 Hyperparameters tuning
Tuning learning rate and batch size with gradient accumulation because of GPU memory problems.

In [46]:
args_combination_1 = TrainingArguments(
    "bert-finetuned-ner-combination-1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args_combination_1,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0204,0.889384,0.680328,0.31519,0.430796,0.915868
2,0.0213,0.805429,0.668213,0.364557,0.471744,0.917046
3,0.009,0.720928,0.697917,0.381646,0.493453,0.921327


Test Results:
{'eval_loss': 0.7523127198219299, 'eval_precision': 0.5814249363867684, 'eval_recall': 0.189469320066335, 'eval_f1': 0.2858036272670419, 'eval_accuracy': 0.929321608040201, 'eval_runtime': 6.2163, 'eval_samples_per_second': 207.037, 'eval_steps_per_second': 25.9, 'epoch': 3.0}


In [47]:
args_combination_2 = TrainingArguments(
    "bert-finetuned-ner-combination-2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.847894,0.670058,0.291772,0.406526,0.913085
2,0.013500,0.622215,0.675138,0.386709,0.491751,0.920739
3,0.013500,0.652038,0.658026,0.417722,0.511034,0.923307


Test Results:
{'eval_loss': 0.7375848889350891, 'eval_precision': 0.5557986870897156, 'eval_recall': 0.21061359867330018, 'eval_f1': 0.30547203848466625, 'eval_accuracy': 0.9295477386934673, 'eval_runtime': 6.8643, 'eval_samples_per_second': 187.492, 'eval_steps_per_second': 23.455, 'epoch': 3.0}


In [48]:
args_combination_3 = TrainingArguments(
    "bert-finetuned-ner-combination-3",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_3,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0079,0.912543,0.693935,0.311392,0.429882,0.91635
2,0.0057,0.823483,0.71123,0.336709,0.457045,0.917688
3,0.0059,0.730988,0.681406,0.38038,0.488221,0.920953


Test Results:
{'eval_loss': 0.8010109066963196, 'eval_precision': 0.5705596107055961, 'eval_recall': 0.19444444444444445, 'eval_f1': 0.29004329004329005, 'eval_accuracy': 0.9292713567839196, 'eval_runtime': 6.8971, 'eval_samples_per_second': 186.599, 'eval_steps_per_second': 23.343, 'epoch': 3.0}


In [49]:
args_combination_4 = TrainingArguments(
    "bert-finetuned-ner-combination-4",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.859605,0.712379,0.371519,0.488353,0.920257
2,0.003900,0.706297,0.664888,0.394304,0.495034,0.921916
3,0.003900,0.662699,0.643917,0.412025,0.502509,0.924699


Test Results:
{'eval_loss': 0.7843152284622192, 'eval_precision': 0.5411764705882353, 'eval_recall': 0.20978441127694858, 'eval_f1': 0.3023603226770242, 'eval_accuracy': 0.9294221105527638, 'eval_runtime': 6.4846, 'eval_samples_per_second': 198.469, 'eval_steps_per_second': 24.828, 'epoch': 3.0}


In [50]:
args_combination_5 = TrainingArguments(
    "bert-finetuned-ner-combination-5",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_5,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0024,0.857778,0.661905,0.351899,0.459504,0.919026
2,0.0016,0.860201,0.651672,0.357595,0.46179,0.919347
3,0.0016,0.845222,0.657778,0.374684,0.477419,0.920685


Test Results:
{'eval_loss': 0.8803117275238037, 'eval_precision': 0.5288888888888889, 'eval_recall': 0.19734660033167495, 'eval_f1': 0.28743961352657005, 'eval_accuracy': 0.9284673366834171, 'eval_runtime': 6.1025, 'eval_samples_per_second': 210.896, 'eval_steps_per_second': 26.382, 'epoch': 3.0}


In [51]:
args_combination_6 = TrainingArguments(
    "bert-finetuned-ner-combination-6",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.862558,0.685979,0.374684,0.48465,0.921274
2,0.001500,0.970584,0.675776,0.344304,0.456184,0.91817
3,0.001500,0.945845,0.691624,0.344937,0.460304,0.918384


Test Results:
{'eval_loss': 0.9369379281997681, 'eval_precision': 0.5826558265582655, 'eval_recall': 0.17827529021558872, 'eval_f1': 0.273015873015873, 'eval_accuracy': 0.9279396984924623, 'eval_runtime': 6.1735, 'eval_samples_per_second': 208.471, 'eval_steps_per_second': 26.079, 'epoch': 3.0}


In [52]:
args_combination_7 = TrainingArguments(
    "bert-finetuned-ner-combination-7",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.934225,0.675386,0.359494,0.469228,0.920471
2,0.001100,0.971665,0.698473,0.347468,0.464074,0.919294
3,0.001100,1.008281,0.698241,0.326582,0.445019,0.917367


Test Results:
{'eval_loss': 0.96390700340271, 'eval_precision': 0.5740479548660085, 'eval_recall': 0.16873963515754561, 'eval_f1': 0.26081384171739824, 'eval_accuracy': 0.9278643216080402, 'eval_runtime': 6.9263, 'eval_samples_per_second': 185.815, 'eval_steps_per_second': 23.245, 'epoch': 3.0}


In [53]:
args_combination_8 = TrainingArguments(
    "bert-finetuned-ner-combination-8",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use the "validation" set for hyperparameter optimization
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.187177,0.721154,0.332278,0.454939,0.91742
2,0.000300,1.025149,0.676259,0.356962,0.467274,0.919829
3,0.000300,1.020509,0.698765,0.358228,0.47364,0.919615


Test Results:
{'eval_loss': 1.0025056600570679, 'eval_precision': 0.5787401574803149, 'eval_recall': 0.1828358208955224, 'eval_f1': 0.277882797731569, 'eval_accuracy': 0.9282412060301507, 'eval_runtime': 6.4262, 'eval_samples_per_second': 200.275, 'eval_steps_per_second': 25.054, 'epoch': 3.0}


# Task 6

In [66]:
import numpy as np
from seqeval.metrics import classification_report

# Define the list of entity types (label names)
label_names = [
    "O", 
    "B-location", "I-location", 
    "B-group", "I-group", 
    "B-corporation", "I-corporation", 
    "B-person", "I-person", 
    "B-creative-work", "I-creative-work", 
    "B-product", "I-product"
]


# Define the IOB2 scheme (Inside-Outside-Beginning)
IOB2 = True

def compute_metrics_extended(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Convert label and prediction indices to label names
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    pred_labels = [[label_names[p] for p, l in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    # Compute metrics using classification_report from seqeval
    report = classification_report(true_labels, pred_labels, scheme=IOB2, output_dict=True)

    # Initialize variables for micro metrics
    tp, fp, fn = 0, 0, 0

    # Initialize lists for macro metrics
    macro_precision_list = []
    macro_recall_list = []
    macro_f1_list = []

    # Create a dictionary to hold metrics for each entity type
    entity_metrics = {}

    for entity_type in label_names:
        if entity_type != 'O':
            entity_type_without_prefix = entity_type[2:]  # Remove the 'B-' or 'I-' prefix

            if entity_type_without_prefix not in entity_metrics:
                entity_metrics[entity_type_without_prefix] = {}

            # Extract metrics for B-label, I-label, and full entities
            for label_suffix in ["", "I"]:
                label = entity_type[:2] + label_suffix + entity_type[2:]
                if label in report:
                    entity_metrics[entity_type_without_prefix]["precision_" + label_suffix] = report[label]["precision"]
                    entity_metrics[entity_type_without_prefix]["recall_" + label_suffix] = report[label]["recall"]
                    entity_metrics[entity_type_without_prefix]["f1-score_" + label_suffix] = report[label]["f1-score"]
                else:
                    entity_metrics[entity_type_without_prefix]["precision_" + label_suffix] = 0
                    entity_metrics[entity_type_without_prefix]["recall_" + label_suffix] = 0
                    entity_metrics[entity_type_without_prefix]["f1-score_" + label_suffix] = 0

                # Update tp, fp, fn based on the evaluation results for this entity type
                if label in report:
                    tp += report[label]["support"]
                    fp += report[label]["false positives"]
                    fn += report[label]["false negatives"]

    # Calculate micro-average metrics if there are any true positives
    if tp > 0:
        micro_precision = tp / (tp + fp)
        micro_recall = tp / (tp + fn)
        micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
    else:
        micro_precision = 0
        micro_recall = 0
        micro_f1 = 0

    # Calculate macro-average metrics
    for entity_type in entity_metrics:
        precision_key = "precision" if "precision" in entity_metrics[entity_type] else "precision_"
        recall_key = "recall" if "recall" in entity_metrics[entity_type] else "recall_"
        f1_key = "f1-score" if "f1-score" in entity_metrics[entity_type] else "f1-score_"

        macro_precision_list.append(entity_metrics[entity_type][precision_key])
        macro_recall_list.append(entity_metrics[entity_type][recall_key])
        macro_f1_list.append(entity_metrics[entity_type][f1_key])

    macro_precision = np.mean(macro_precision_list)
    macro_recall = np.mean(macro_recall_list)
    macro_f1 = np.mean(macro_f1_list)

    return {
        "classification_report": report,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
        "entity_metrics": entity_metrics,
    }


In [69]:
args_combination_1 = TrainingArguments(
    "bert-finetuned-ner-combination-1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args_combination_1,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

  0%|          | 0/2547 [00:00<?, ?it/s]

  0%|          | 0/161 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'corporation': {'precision': 0.7346153846153847, 'recall': 0.3345008756567426, 'f1-score': 0.45968712394705186, 'support': 571}, 'creative-work': {'precision': 0.28160919540229884, 'recall': 0.21120689655172414, 'f1-score': 0.24137931034482757, 'support': 232}, 'group': {'precision': 0.24647887323943662, 'recall': 0.15151515151515152, 'f1-score': 0.18766756032171583, 'support': 231}, 'location': {'precision': 0.6875, 'recall': 0.29333333333333333, 'f1-score': 0.411214953271028, 'support': 150}, 'person': {'precision': 0.5773195876288659, 'recall': 0.24034334763948498, 'f1-score': 0.33939393939393936, 'support': 233}, 'product': {'precision': 0.2222222222222222, 'recall': 0.12598425196850394, 'f1-score': 0.16080402010050251, 'support': 127}, 'micro avg': {'precision': 0.48331273176761436, 'recall': 0.25323834196891193, 'f1-score': 0.3323416914577136, 'support': 1544}, 'macro avg': {'precision': 0.45829087718470135, 'recall': 0.22614730944415673,

In [None]:
args_combination_2 = TrainingArguments(
    "bert-finetuned-ner-combination-2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

  0%|          | 0/849 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.20652173913043478, 'recall': 0.25675675675675674, 'f1-score': 0.2289156626506024, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.20652173913043478, 'recall': 0.016281062553556127, 'f1-score': 0.030182684670373314, 'support': 1167}, 'macro avg': {'precision': 0.034420289855072464, 'recall': 0.04279279279279279, 'f1-score': 0.038152610441767064, 'support': 1167}, 'weighted avg': {'precision': 0.013095637271338624, 'recall': 0.016281062553556127, 'f1-score': 0.014515646132086182, 'support': 1167}}" of type <class 'dict'> for key 

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.2361111111111111, 'recall': 0.22972972972972974, 'f1-score': 0.2328767123287671, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.2361111111111111, 'recall': 0.01456726649528706, 'f1-score': 0.02744148506860371, 'support': 1167}, 'macro avg': {'precision': 0.03935185185185185, 'recall': 0.03828828828828829, 'f1-score': 0.03881278538812785, 'support': 1167}, 'weighted avg': {'precision': 0.014971912786822812, 'recall': 0.01456726649528706, 'f1-score': 0.014766818091112912, 's

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.21176470588235294, 'recall': 0.24324324324324326, 'f1-score': 0.22641509433962265, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.21176470588235294, 'recall': 0.015424164524421594, 'f1-score': 0.02875399361022364, 'support': 1167}, 'macro avg': {'precision': 0.03529411764705882, 'recall': 0.04054054054054054, 'f1-score': 0.03773584905660377, 'support': 1167}, 'weighted avg': {'precision': 0.013428096174202328, 'recall': 0.015424164524421594, 'f1-score': 0.01435708395983896

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 571}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 232}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 231}, 'location': {'precision': 0.21568627450980393, 'recall': 0.22, 'f1-score': 0.21782178217821785, 'support': 150}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 233}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 127}, 'micro avg': {'precision': 0.21568627450980393, 'recall': 0.021373056994818652, 'f1-score': 0.03889216263995286, 'support': 1544}, 'macro avg': {'precision': 0.03594771241830066, 'recall': 0.03666666666666667, 'f1-score': 0.036303630363036306, 'support': 1544}, 'weighted avg': {'precision': 0.020953977445900638, 'recall': 0.021373056994818652, 'f1-score': 0.021161442569127383, 'support

In [None]:
args_combination_3 = TrainingArguments(
    "bert-finetuned-ner-combination-3",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_3,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

  0%|          | 0/2547 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.17475728155339806, 'recall': 0.24324324324324326, 'f1-score': 0.20338983050847462, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.17475728155339806, 'recall': 0.015424164524421594, 'f1-score': 0.028346456692913382, 'support': 1167}, 'macro avg': {'precision': 0.02912621359223301, 'recall': 0.04054054054054054, 'f1-score': 0.03389830508474577, 'support': 1167}, 'weighted avg': {'precision': 0.011081438590361144, 'recall': 0.015424164524421594, 'f1-score': 0.012897041523245176, 'support': 1167}}" of type <class 'dict'> for key "

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.14516129032258066, 'recall': 0.24324324324324326, 'f1-score': 0.18181818181818182, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.14516129032258066, 'recall': 0.015424164524421594, 'f1-score': 0.0278853601859024, 'support': 1167}, 'macro avg': {'precision': 0.024193548387096777, 'recall': 0.04054054054054054, 'f1-score': 0.030303030303030304, 'support': 1167}, 'weighted avg': {'precision': 0.009204743345219339, 'recall': 0.015424164524421594, 'f1-score': 0.0115291734829009

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.20930232558139536, 'recall': 0.24324324324324326, 'f1-score': 0.225, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.20930232558139536, 'recall': 0.015424164524421594, 'f1-score': 0.028731045490822026, 'support': 1167}, 'macro avg': {'precision': 0.03488372093023256, 'recall': 0.04054054054054054, 'f1-score': 0.0375, 'support': 1167}, 'weighted avg': {'precision': 0.01327195552101393, 'recall': 0.015424164524421594, 'f1-score': 0.014267352185089977, 'support': 1167}}" of ty

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 571}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 232}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 231}, 'location': {'precision': 0.21019108280254778, 'recall': 0.22, 'f1-score': 0.21498371335504884, 'support': 150}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 233}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 127}, 'micro avg': {'precision': 0.21019108280254778, 'recall': 0.021373056994818652, 'f1-score': 0.03880070546737214, 'support': 1544}, 'macro avg': {'precision': 0.03503184713375796, 'recall': 0.03666666666666667, 'f1-score': 0.03583061889250814, 'support': 1544}, 'weighted avg': {'precision': 0.02042011814791591, 'recall': 0.021373056994818652, 'f1-score': 0.020885723447705524, 'support':

In [None]:
args_combination_4 = TrainingArguments(
    "bert-finetuned-ner-combination-4",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

  0%|          | 0/849 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.19387755102040816, 'recall': 0.25675675675675674, 'f1-score': 0.2209302325581395, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.19387755102040816, 'recall': 0.016281062553556127, 'f1-score': 0.0300395256916996, 'support': 1167}, 'macro avg': {'precision': 0.03231292517006803, 'recall': 0.04279279279279279, 'f1-score': 0.036821705426356585, 'support': 1167}, 'weighted avg': {'precision': 0.012293863560848503, 'recall': 0.016281062553556127, 'f1-score': 0.01400928638329248, 'support': 1167}}" of type <class 'dict'> for key "eva

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.20689655172413793, 'recall': 0.24324324324324326, 'f1-score': 0.2236024844720497, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.20689655172413793, 'recall': 0.015424164524421594, 'f1-score': 0.028708133971291863, 'support': 1167}, 'macro avg': {'precision': 0.034482758620689655, 'recall': 0.04054054054054054, 'f1-score': 0.037267080745341616, 'support': 1167}, 'weighted avg': {'precision': 0.013119404308128711, 'recall': 0.015424164524421594, 'f1-score': 0.014178735090772

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.23684210526315788, 'recall': 0.24324324324324326, 'f1-score': 0.23999999999999996, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.23684210526315788, 'recall': 0.015424164524421594, 'f1-score': 0.028962188254223652, 'support': 1167}, 'macro avg': {'precision': 0.039473684210526314, 'recall': 0.04054054054054054, 'f1-score': 0.039999999999999994, 'support': 1167}, 'weighted avg': {'precision': 0.015018265457989448, 'recall': 0.015424164524421594, 'f1-score': 0.01521850899742

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 571}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 232}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 231}, 'location': {'precision': 0.2158273381294964, 'recall': 0.2, 'f1-score': 0.20761245674740486, 'support': 150}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 233}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 127}, 'micro avg': {'precision': 0.2158273381294964, 'recall': 0.019430051813471502, 'f1-score': 0.035650623885918005, 'support': 1544}, 'macro avg': {'precision': 0.03597122302158273, 'recall': 0.03333333333333333, 'f1-score': 0.03460207612456748, 'support': 1544}, 'weighted avg': {'precision': 0.020967681813098743, 'recall': 0.019430051813471502, 'f1-score': 0.020169603958620937, 'support': 

In [None]:
args_combination_5 = TrainingArguments(
    "bert-finetuned-ner-combination-5",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_5,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

  0%|          | 0/2547 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.2903225806451613, 'recall': 0.24324324324324326, 'f1-score': 0.2647058823529412, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.2903225806451613, 'recall': 0.015424164524421594, 'f1-score': 0.02929210740439382, 'support': 1167}, 'macro avg': {'precision': 0.048387096774193554, 'recall': 0.04054054054054054, 'f1-score': 0.04411764705882353, 'support': 1167}, 'weighted avg': {'precision': 0.018409486690438678, 'recall': 0.015424164524421594, 'f1-score': 0.016785120217752914, 'support': 1167}}" of type <class 'dict'> for key "eva

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.17307692307692307, 'recall': 0.24324324324324326, 'f1-score': 0.20224719101123595, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.17307692307692307, 'recall': 0.015424164524421594, 'f1-score': 0.028324154209284032, 'support': 1167}, 'macro avg': {'precision': 0.028846153846153844, 'recall': 0.04054054054054054, 'f1-score': 0.033707865168539325, 'support': 1167}, 'weighted avg': {'precision': 0.010974886296223056, 'recall': 0.015424164524421594, 'f1-score': 0.01282458623378

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.15384615384615385, 'recall': 0.24324324324324326, 'f1-score': 0.18848167539267016, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.15384615384615385, 'recall': 0.015424164524421594, 'f1-score': 0.028037383177570097, 'support': 1167}, 'macro avg': {'precision': 0.025641025641025644, 'recall': 0.04054054054054054, 'f1-score': 0.031413612565445025, 'support': 1167}, 'weighted avg': {'precision': 0.009755454485531606, 'recall': 0.015424164524421594, 'f1-score': 0.01195170863672

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 571}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 232}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 231}, 'location': {'precision': 0.20540540540540542, 'recall': 0.25333333333333335, 'f1-score': 0.22686567164179106, 'support': 150}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 233}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 127}, 'micro avg': {'precision': 0.20540540540540542, 'recall': 0.02461139896373057, 'f1-score': 0.04395604395604396, 'support': 1544}, 'macro avg': {'precision': 0.03423423423423424, 'recall': 0.042222222222222223, 'f1-score': 0.037810945273631845, 'support': 1544}, 'weighted avg': {'precision': 0.019955188348970733, 'recall': 0.02461139896373057, 'f1-score': 0.02204005877349

In [None]:
args_combination_6 = TrainingArguments(
    "bert-finetuned-ner-combination-6",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

  0%|          | 0/849 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.15789473684210525, 'recall': 0.24324324324324326, 'f1-score': 0.19148936170212766, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.15789473684210525, 'recall': 0.015424164524421594, 'f1-score': 0.028103044496487123, 'support': 1167}, 'macro avg': {'precision': 0.02631578947368421, 'recall': 0.04054054054054054, 'f1-score': 0.031914893617021274, 'support': 1167}, 'weighted avg': {'precision': 0.010012176971992965, 'recall': 0.015424164524421594, 'f1-score': 0.012142427391565935, 'support': 1167}}" of type <class 'dict'> for key 

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.12337662337662338, 'recall': 0.25675675675675674, 'f1-score': 0.16666666666666666, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.12337662337662338, 'recall': 0.016281062553556127, 'f1-score': 0.028766086298258896, 'support': 1167}, 'macro avg': {'precision': 0.020562770562770564, 'recall': 0.04279279279279279, 'f1-score': 0.027777777777777776, 'support': 1167}, 'weighted avg': {'precision': 0.007823367720539957, 'recall': 0.016281062553556127, 'f1-score': 0.01056840902599

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.13043478260869565, 'recall': 0.24324324324324326, 'f1-score': 0.16981132075471697, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.13043478260869565, 'recall': 0.015424164524421594, 'f1-score': 0.027586206896551717, 'support': 1167}, 'macro avg': {'precision': 0.021739130434782608, 'recall': 0.04054054054054054, 'f1-score': 0.028301886792452827, 'support': 1167}, 'weighted avg': {'precision': 0.00827092880295071, 'recall': 0.015424164524421594, 'f1-score': 0.010767812969879

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 571}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 232}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 231}, 'location': {'precision': 0.17757009345794392, 'recall': 0.25333333333333335, 'f1-score': 0.2087912087912088, 'support': 150}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 233}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 127}, 'micro avg': {'precision': 0.17757009345794392, 'recall': 0.02461139896373057, 'f1-score': 0.04323094425483504, 'support': 1544}, 'macro avg': {'precision': 0.029595015576323987, 'recall': 0.042222222222222223, 'f1-score': 0.0347985347985348, 'support': 1544}, 'weighted avg': {'precision': 0.017250980582054137, 'recall': 0.02461139896373057, 'f1-score': 0.0202841200250526

In [None]:
args_combination_7 = TrainingArguments(
    "bert-finetuned-ner-combination-7",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)

  0%|          | 0/849 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.20454545454545456, 'recall': 0.24324324324324326, 'f1-score': 0.22222222222222227, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.20454545454545456, 'recall': 0.015424164524421594, 'f1-score': 0.028685258964143423, 'support': 1167}, 'macro avg': {'precision': 0.034090909090909095, 'recall': 0.04054054054054054, 'f1-score': 0.03703703703703704, 'support': 1167}, 'weighted avg': {'precision': 0.012970320168263614, 'recall': 0.015424164524421594, 'f1-score': 0.014091212034656766, 'support': 1167}}" of type <class 'dict'> for key 

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.17307692307692307, 'recall': 0.24324324324324326, 'f1-score': 0.20224719101123595, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.17307692307692307, 'recall': 0.015424164524421594, 'f1-score': 0.028324154209284032, 'support': 1167}, 'macro avg': {'precision': 0.028846153846153844, 'recall': 0.04054054054054054, 'f1-score': 0.033707865168539325, 'support': 1167}, 'weighted avg': {'precision': 0.010974886296223056, 'recall': 0.015424164524421594, 'f1-score': 0.01282458623378

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.1782178217821782, 'recall': 0.24324324324324326, 'f1-score': 0.2057142857142857, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.1782178217821782, 'recall': 0.015424164524421594, 'f1-score': 0.028391167192429023, 'support': 1167}, 'macro avg': {'precision': 0.0297029702970297, 'recall': 0.04054054054054054, 'f1-score': 0.03428571428571429, 'support': 1167}, 'weighted avg': {'precision': 0.011300873017893048, 'recall': 0.015424164524421594, 'f1-score': 0.013044436283510834, 

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 571}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 232}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 231}, 'location': {'precision': 0.20454545454545456, 'recall': 0.24, 'f1-score': 0.22085889570552147, 'support': 150}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 233}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 127}, 'micro avg': {'precision': 0.20454545454545456, 'recall': 0.023316062176165803, 'f1-score': 0.04186046511627907, 'support': 1544}, 'macro avg': {'precision': 0.034090909090909095, 'recall': 0.04, 'f1-score': 0.03680981595092025, 'support': 1544}, 'weighted avg': {'precision': 0.01987164390014131, 'recall': 0.023316062176165803, 'f1-score': 0.021456498935121904, 'support': 1544}}" of ty

In [None]:
args_combination_8 = TrainingArguments(
    "bert-finetuned-ner-combination-8",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=12,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args_combination_8,  
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  
    data_collator=data_collator,
    compute_metrics=compute_metrics_extended,
    tokenizer=tokenizer,
)

trainer.train()

# After training, evaluate the model on the test set
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print the test results
print("Test Results:")
print(test_results)


  0%|          | 0/849 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.09895833333333333, 'recall': 0.25675675675675674, 'f1-score': 0.14285714285714285, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.09895833333333333, 'recall': 0.016281062553556127, 'f1-score': 0.027961736571008092, 'support': 1167}, 'macro avg': {'precision': 0.016493055555555556, 'recall': 0.04279279279279279, 'f1-score': 0.023809523809523808, 'support': 1167}, 'weighted avg': {'precision': 0.00627499285918309, 'recall': 0.016281062553556127, 'f1-score': 0.009058636307993

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.11920529801324503, 'recall': 0.24324324324324326, 'f1-score': 0.16, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.11920529801324503, 'recall': 0.015424164524421594, 'f1-score': 0.027314112291350535, 'support': 1167}, 'macro avg': {'precision': 0.019867549668874173, 'recall': 0.04054054054054054, 'f1-score': 0.02666666666666667, 'support': 1167}, 'weighted avg': {'precision': 0.00755886208481588, 'recall': 0.015424164524421594, 'f1-score': 0.01014567266495287, 'support': 1

  0%|          | 0/127 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 575}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 141}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 72}, 'location': {'precision': 0.10465116279069768, 'recall': 0.24324324324324326, 'f1-score': 0.14634146341463414, 'support': 74}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 213}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 92}, 'micro avg': {'precision': 0.10465116279069768, 'recall': 0.015424164524421594, 'f1-score': 0.0268857356235997, 'support': 1167}, 'macro avg': {'precision': 0.01744186046511628, 'recall': 0.04054054054054054, 'f1-score': 0.024390243902439022, 'support': 1167}, 'weighted avg': {'precision': 0.006635977760506965, 'recall': 0.015424164524421594, 'f1-score': 0.00927957865696908

  0%|          | 0/161 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 571}, 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 232}, 'group': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 231}, 'location': {'precision': 0.15639810426540285, 'recall': 0.22, 'f1-score': 0.18282548476454294, 'support': 150}, 'person': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 233}, 'product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 127}, 'micro avg': {'precision': 0.15639810426540285, 'recall': 0.021373056994818652, 'f1-score': 0.037606837606837605, 'support': 1544}, 'macro avg': {'precision': 0.026066350710900476, 'recall': 0.03666666666666667, 'f1-score': 0.030470914127423823, 'support': 1544}, 'weighted avg': {'precision': 0.015194116347027479, 'recall': 0.021373056994818652, 'f1-score': 0.017761543208990573, 'suppo