## Huggingface Pre-trained 'bert-base-cased' Model for NER problem on CoNLL-2003 dataset

In [None]:
!pip install transformers datasets seqeval torch # installing libraries to manage datasets, sequence evaluation metrices, transformers and Pytorch
!pip install sklearn # For metrics adn utilities


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-20

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_metric

# Load the CoNLL-2003 dataset from the Hugging Face Hub.
# The dataset repository ships a loading script, so `trust_remote_code=True` is passed explicitly;
# without it the call stops at an interactive "[y/N]" prompt and the notebook cannot Run All unattended.
dataset = load_dataset("conll2003", trust_remote_code=True)

# Load the pre-trained BERT tokenizer (converts text into the tokens the model can process)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Maximum sequence length every example is padded/truncated to
max_length = 128

# Function for input Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the NER label IDs to the tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of per-word label IDs per sentence).

    Returns:
        The tokenizer output with an added "labels" entry holding one label-ID list
        per sentence, the same length as that sentence's token sequence.

    Only the first sub-word piece of each word carries the word's label. Positions with no
    source word (``word_ids`` gives ``None``) and continuation pieces get -100, the value
    that `compute_metrics` filters out. Copying the label onto every piece would repeat
    ``B-*`` tags, which entity-level scoring counts as several separate entities.
    """
    # Tokenize the input sentences with truncation and padding to the module-level `max_length`
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        is_split_into_words=True  # tells the tokenizer the input is already split into words
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # source word index per token, None where there is none
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # No source word, or a continuation piece of the word just labelled: ignore.
                label_ids.append(-100)
            else:
                # First piece of a new word: it carries the word's label.
                label_ids.append(label[word_id])
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels  # Add the aligned labels to the tokenized inputs
    return tokenized_inputs

# Apply the tokenization and label alignment to every split
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Load the pre-trained BERT model for token classification
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",   # "bert-base-cased" is a BERT model variant that is case-sensitive
    num_labels=len(dataset['train'].features['ner_tags'].feature.names)  # one output per NER tag name in the dataset
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the model checkpoints and results
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    learning_rate=2e-5,  # Learning rate for the optimizer
    per_device_train_batch_size=32,  # Batch size for training
    per_device_eval_batch_size=32,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs to train
    weight_decay=0.01,  # Weight decay for regularization
)

# Define the label list globally (index = label ID, value = tag name)
label_list = dataset['train'].features['ner_tags'].feature.names

# Load the seqeval metric used by `compute_metrics` and by the final test scoring.
# It must be defined in this cell: relying on a `metric` left over from another cell
# makes the notebook fail with a NameError on a fresh kernel.
# `trust_remote_code=True` skips the interactive "[y/N]" prompt for the metric's loading script.
metric = load_metric("seqeval", trust_remote_code=True)

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` must support
            ``argmax(axis=-1)`` and ``labels`` holds label IDs, with -100 marking
            positions to ignore.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        ``overall_*`` entries returned by the module-level `metric`.
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=-1)  # most likely label ID at every position

    # Reference tag names, skipping ignored (-100) positions.
    reference_tags = []
    for gold_row in gold_ids:
        reference_tags.append([label_list[gold] for gold in gold_row if gold != -100])

    # Predicted tag names at exactly the positions that have a real reference label.
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_tags.append(
            [label_list[guess] for guess, gold in zip(predicted_row, gold_row) if gold != -100]
        )

    summary = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {name: summary["overall_" + name] for name in ("precision", "recall", "f1", "accuracy")}

# Initialize the Trainer with the model, training arguments, datasets, and metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],       # training dataset
    eval_dataset=tokenized_datasets["validation"],   # validation dataset
    compute_metrics=compute_metrics,  # Function to compute metrics during evaluation
)

# Model training
trainer.train()
# Evaluate the model on the validation set and print results
results = trainer.evaluate()
print(results)

# Evaluate on the test dataset (passed explicitly instead of the Trainer's eval_dataset)
test_dataset = tokenized_datasets["test"]
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")

# Predict on the test dataset; the third returned element is discarded
predictions, labels, _ = trainer.predict(test_dataset)
predicted_labels = predictions.argmax(axis=-1)  # highest-scoring label ID per token

# Map label IDs to label names for the true labels, skipping ignored (-100) positions
true_labels = [
    [label_list[l] for l in label if l != -100]
    for label in labels
]
# Predicted label names, kept only at positions whose true label is not -100
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predicted_labels, labels)
]

# Examples from the test evaluation
for i in range(3):  # print the first three examples to compare true vs predicted labels
    print(f"Example {i + 1}:")
    print(f"Tokens: {dataset['test'][i]['tokens']}")
    print(f"True Labels: {true_labels[i]}")
    print(f"Predicted Labels: {true_predictions[i]}")
    print("")

# Compute and print the final metrics with the module-level seqeval `metric`
final_metrics = metric.compute(predictions=true_predictions, references=true_labels)
print(f"Final Test Metrics: {final_metrics}")



Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.074138,0.90488,0.914062,0.909448,0.978525
2,0.194800,0.064942,0.929193,0.929777,0.929485,0.982457
3,0.050800,0.059687,0.929336,0.935345,0.932331,0.983164


{'eval_loss': 0.05968658998608589, 'eval_precision': 0.9293361884368309, 'eval_recall': 0.9353448275862069, 'eval_f1': 0.9323308270676692, 'eval_accuracy': 0.9831644376362458, 'eval_runtime': 22.4029, 'eval_samples_per_second': 145.071, 'eval_steps_per_second': 4.553, 'epoch': 3.0}
Test Results: {'eval_loss': 0.1578899770975113, 'eval_precision': 0.8875079959791647, 'eval_recall': 0.9014293669946166, 'eval_f1': 0.8944145139752266, 'eval_accuracy': 0.9701913679498093, 'eval_runtime': 23.8219, 'eval_samples_per_second': 144.951, 'eval_steps_per_second': 4.534, 'epoch': 3.0}
Example 1:
Tokens: ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'B-PER', 'O', 'O', 'B-PER', 'B-PER', 'B-PER', 'B-PER

## Modifications to the above model for efficiency


In [None]:
import os  # Import the os module for interacting with the operating system
from datasets import Dataset, DatasetDict  # Import Dataset and DatasetDict for dataset manipulation
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer  # Import necessary components from transformers library
from datasets import load_metric  # Import load_metric to access evaluation metrics

# Function to load CoNLL-2003 formatted data from a file
def load_conll_data(file_path):
    """Read a CoNLL-2003 formatted file into parallel lists of sentences and NER tags.

    Every data line holds whitespace-separated columns; the first is the word and the
    last is the NER tag (CoNLL-2003 uses: word, POS tag, chunk tag, NER tag). Blank
    lines separate sentences.

    Args:
        file_path: Path of the file to read.

    Returns:
        Tuple ``(sentences, ner_tags)``: ``sentences[i]`` is the list of words of
        sentence ``i`` and ``ner_tags[i]`` the matching list of tag strings.

    Raises:
        ValueError: If a data line has fewer than two columns.
    """
    sentences = []  # Completed sentences
    ner_tags = []  # NER tags of the completed sentences
    sentence = []  # Words of the sentence currently being read
    tags = []  # NER tags of the sentence currently being read

    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            # A blank line ends the current sentence. "-DOCSTART-" lines are document
            # boundary markers, not text: treat them as separators too, so they do not
            # become one-token "sentences" that inflate the example count and the scores.
            if not line or line.startswith("-DOCSTART-"):
                if sentence:
                    sentences.append(sentence)
                    ner_tags.append(tags)
                    sentence = []
                    tags = []
                continue

            columns = line.split()
            if len(columns) < 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected a word and an NER tag, got {line!r}"
                )
            sentence.append(columns[0])  # the word is the first column
            tags.append(columns[-1])  # the NER tag is the last column

    # Flush the final sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        ner_tags.append(tags)

    return sentences, ner_tags

# Load the data from the provided files
train_sentences, train_labels = load_conll_data('/content/train.txt')  # Load training data
val_sentences, val_labels = load_conll_data('/content/valid.txt')  # Load validation data
test_sentences, test_labels = load_conll_data('/content/test.txt')  # Load test data

# Create Dataset objects for Hugging Face
train_dataset = Dataset.from_dict({"tokens": train_sentences, "ner_tags": train_labels})  # Create training dataset
val_dataset = Dataset.from_dict({"tokens": val_sentences, "ner_tags": val_labels})  # Create validation dataset
test_dataset = Dataset.from_dict({"tokens": test_sentences, "ner_tags": test_labels})  # Create test dataset

# Combine individual datasets into a DatasetDict
dataset = DatasetDict({
    "train": train_dataset,  # Add training dataset
    "validation": val_dataset,  # Add validation dataset
    "test": test_dataset,  # Add test dataset
})

# Define label mapping from labels to IDs and vice versa
unique_labels = set(label for labels in train_labels + val_labels + test_labels for label in labels)  # Collect all unique labels
# Enumerate the labels in sorted order: iteration order of a set of strings can differ
# between interpreter runs, which would give every run (and every reload of the saved
# model) a different label <-> ID assignment.
label_map = {label: i for i, label in enumerate(sorted(unique_labels))}  # Map each unique label to a unique ID
inverse_label_map = {i: label for label, i in label_map.items()}  # Reverse the mapping from IDs to labels

# Load the tokenizer and model
model_name = "bert-base-cased"  # Define the model name
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load pre-trained tokenizer
# Pass the label names to the model config so the saved model carries its own
# ID <-> label mapping instead of anonymous LABEL_n names.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label=inverse_label_map,
    label2id=label_map,
)

# Function to tokenize the inputs and align the labels with tokens
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-aligned label IDs.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and "ner_tags"
            (list of per-word tag-string lists).

    Returns:
        The tokenizer output with a "labels" entry added. For each sentence, the first
        token of every word gets ``label_map[tag]``; tokens without a source word and
        the remaining pieces of a word get -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    def align(word_tags, word_ids):
        # Walk the tokens once, remembering which word the previous token came from.
        aligned = []
        last_seen = None
        for current in word_ids:
            starts_new_word = current is not None and current != last_seen
            aligned.append(label_map[word_tags[current]] if starts_new_word else -100)
            last_seen = current
        return aligned

    encoded["labels"] = [
        align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply the tokenization and label alignment function to the entire dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=["tokens", "ner_tags"])  # Tokenize and align labels, removing original columns

# Define training arguments
training_args = TrainingArguments(
    output_dir="./ner_output",  # Directory to save the model checkpoints and results
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    learning_rate=3e-5,  # Learning rate for the optimizer
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs to train
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=50,  # Log every 50 steps
    save_total_limit=2,  # Limit the number of saved checkpoints to save disk space
    load_best_model_at_end=True,  # Load the best model at the end of training based on evaluation metric
)

# Load the `seqeval` metric for evaluating sequence labeling tasks.
# `trust_remote_code=True` skips the interactive "[y/N]" prompt the metric's loading script
# otherwise raises on a fresh session, which would stall an unattended Run All.
metric = load_metric("seqeval", trust_remote_code=True)

# Function to compute evaluation metrics
def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` must support
            ``argmax(axis=-1)`` and ``labels`` holds label IDs, with -100 marking
            positions that are excluded from scoring.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", read from the
        ``overall_*`` entries returned by the module-level `metric`.
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=-1)  # highest-scoring class per token

    # Gold tag names, dropping ignored (-100) positions.
    reference_tags = []
    for gold_row in gold_ids:
        reference_tags.append([inverse_label_map[gold] for gold in gold_row if gold != -100])

    # Predicted tag names at the positions that carry a real gold label.
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_tags.append(
            [inverse_label_map[guess] for guess, gold in zip(predicted_row, gold_row) if gold != -100]
        )

    summary = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {name: summary["overall_" + name] for name in ("precision", "recall", "f1", "accuracy")}

# Set up the Trainer with the model, arguments, datasets, tokenizer, and metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["validation"],  # Validation dataset, scored per the training arguments
    tokenizer=tokenizer,  # Tokenizer handed to the Trainer alongside the model
    compute_metrics=compute_metrics,  # Function to compute metrics during evaluation
)

# Train the model
trainer.train()  # Runs training as configured in `training_args`

# Save the trained model and tokenizer side by side in one directory
model.save_pretrained("./ner_model")  # Model weights and config
tokenizer.save_pretrained("./ner_model")  # Tokenizer files

# Evaluate the model on the test set (passed explicitly instead of the validation split)
results = trainer.evaluate(tokenized_dataset["test"])  # Returns a dict of metrics

# Print basic metrics from the test evaluation.
# The values produced by `compute_metrics` are read back under an "eval_" prefix.
print("Test Results:")  # Header for the test results
print(f"Accuracy: {results['eval_accuracy']:.4f}")  # seqeval overall accuracy
print(f"Precision: {results['eval_precision']:.4f}")  # seqeval overall precision
print(f"Recall: {results['eval_recall']:.4f}")  # seqeval overall recall
print(f"F1-Score: {results['eval_f1']:.4f}")  # seqeval overall F1


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0464,0.040949,0.924232,0.943228,0.933633,0.989134
2,0.0264,0.038542,0.942427,0.948619,0.945513,0.990551
3,0.0063,0.038804,0.946752,0.952493,0.949614,0.99123


Test Results:
Accuracy: 0.9823
Precision: 0.9021
Recall: 0.9171
F1-Score: 0.9095


## Using the downloaded data files

In [None]:
import os  # Import the os module for interacting with the operating system
from datasets import Dataset, DatasetDict  # Import Dataset and DatasetDict for dataset creation and manipulation
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer  # Import necessary components for token classification
from datasets import load_metric  # Import load_metric to load evaluation metrics

# Function to load CoNLL-2003 formatted data from a file
def load_conll_data(file_path):
    """Read a CoNLL-2003 formatted file into parallel lists of sentences and NER tags.

    Every data line holds whitespace-separated columns; the first is the word and the
    last is the NER tag (CoNLL-2003 uses: word, POS tag, chunk tag, NER tag). Blank
    lines separate sentences.

    Args:
        file_path: Path of the file to read.

    Returns:
        Tuple ``(sentences, ner_tags)``: ``sentences[i]`` is the list of words of
        sentence ``i`` and ``ner_tags[i]`` the matching list of tag strings.

    Raises:
        ValueError: If a data line has fewer than two columns.
    """
    sentences = []  # Completed sentences
    ner_tags = []  # NER tags of the completed sentences
    sentence = []  # Words of the sentence currently being read
    tags = []  # NER tags of the sentence currently being read

    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            # A blank line ends the current sentence. "-DOCSTART-" lines are document
            # boundary markers, not text: treat them as separators too, so they do not
            # become one-token "sentences" that inflate the example count and the scores.
            if not line or line.startswith("-DOCSTART-"):
                if sentence:
                    sentences.append(sentence)
                    ner_tags.append(tags)
                    sentence = []
                    tags = []
                continue

            columns = line.split()
            if len(columns) < 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected a word and an NER tag, got {line!r}"
                )
            sentence.append(columns[0])  # the word is the first column
            tags.append(columns[-1])  # the NER tag is the last column

    # Flush the final sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        ner_tags.append(tags)

    return sentences, ner_tags

# Load the data from the provided files
train_sentences, train_labels = load_conll_data('/content/train.txt')  # Load training data
val_sentences, val_labels = load_conll_data('/content/valid.txt')  # Load validation data
test_sentences, test_labels = load_conll_data('/content/test.txt')  # Load test data

# Create Dataset objects for Hugging Face
train_dataset = Dataset.from_dict({"tokens": train_sentences, "ner_tags": train_labels})  # Create training dataset from sentences and tags
val_dataset = Dataset.from_dict({"tokens": val_sentences, "ner_tags": val_labels})  # Create validation dataset
test_dataset = Dataset.from_dict({"tokens": test_sentences, "ner_tags": test_labels})  # Create test dataset

# Combine individual datasets into a DatasetDict for easier management
dataset = DatasetDict({
    "train": train_dataset,  # Add training dataset
    "validation": val_dataset,  # Add validation dataset
    "test": test_dataset,  # Add test dataset
})

# Define label mapping from labels to IDs and vice versa
unique_labels = set(label for labels in train_labels + val_labels + test_labels for label in labels)  # Collect all unique labels from the datasets
# Enumerate the labels in sorted order: iteration order of a set of strings can differ
# between interpreter runs, which would give every run (and every reload of the saved
# model) a different label <-> ID assignment.
label_map = {label: i for i, label in enumerate(sorted(unique_labels))}  # Map each unique label to a unique ID
inverse_label_map = {i: label for label, i in label_map.items()}  # Reverse the mapping from IDs to labels

# Load the pre-trained tokenizer and model
model_name = "bert-base-cased"  # Specify the model name
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load the tokenizer for the specified model
# Pass the label names to the model config so the saved model carries its own
# ID <-> label mapping instead of anonymous LABEL_n names.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label=inverse_label_map,
    label2id=label_map,
)

# Function to tokenize the inputs and align the labels with tokens
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and build per-token label IDs for training.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and "ner_tags"
            (list of per-word tag-string lists).

    Returns:
        The tokenizer output extended with "labels": for each sentence, a list where
        the first token of every word holds ``label_map[tag]`` and every other
        position (no source word, or a later piece of the same word) holds -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,  # inputs are already split into words
    )

    aligned_rows = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=row)
        # Pair each token's word index with the index of the token before it.
        previous_ids = [None] + list(word_ids[:-1])
        aligned_rows.append([
            -100 if (current is None or current == previous) else label_map[word_tags[current]]
            for current, previous in zip(word_ids, previous_ids)
        ])

    encoded["labels"] = aligned_rows
    return encoded

# Apply the tokenization and label alignment function to the entire dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=["tokens", "ner_tags"])  # Tokenize and align labels, removing original columns

# Define training arguments for the Trainer
training_args = TrainingArguments(
    output_dir="./ner_output",  # Directory to save the model checkpoints and results
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    learning_rate=2e-5,  # Learning rate for the optimizer
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,  # Number of epochs to train
)

# Load the `seqeval` metric for evaluating sequence labeling tasks.
# `trust_remote_code=True` skips the interactive "[y/N]" prompt recorded in this cell's
# output, which would otherwise stall an unattended Run All.
metric = load_metric("seqeval", trust_remote_code=True)

# Function to compute evaluation metrics
def compute_metrics(p):
    """Convert an evaluation pass into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` must support
            ``argmax(axis=-1)`` and ``labels`` holds label IDs, with -100 marking
            positions that are left out of the scoring.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", read from the
        ``overall_*`` entries returned by the module-level `metric`.
    """
    raw_scores, gold = p
    guesses = raw_scores.argmax(axis=-1)  # predicted class ID per token

    def to_names(ids, gold_row):
        # Translate IDs to tag names, keeping only positions whose gold label is real.
        return [inverse_label_map[i] for i, g in zip(ids, gold_row) if g != -100]

    references = [to_names(gold_row, gold_row) for gold_row in gold]
    hypotheses = [to_names(guess_row, gold_row) for guess_row, gold_row in zip(guesses, gold)]

    report = metric.compute(predictions=hypotheses, references=references)
    return {
        "precision": report["overall_precision"],
        "recall": report["overall_recall"],
        "f1": report["overall_f1"],
        "accuracy": report["overall_accuracy"],
    }

# Set up the Trainer with the model, arguments, datasets, tokenizer, and metrics function
trainer = Trainer(
    model=model,  # Model to train
    args=training_args,  # Training arguments
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["validation"],  # Validation dataset, used when evaluate() gets no dataset
    tokenizer=tokenizer,  # Tokenizer handed to the Trainer alongside the model
    compute_metrics=compute_metrics,  # Function to compute metrics during evaluation
)

# Train the model
trainer.train()  # Runs training as configured in `training_args`

# Save the trained model and tokenizer side by side in one directory
model.save_pretrained("./ner_model")  # Model weights and config
tokenizer.save_pretrained("./ner_model")  # Tokenizer files

# Evaluate the model on the validation set
results = trainer.evaluate()  # No dataset passed, so the Trainer's eval_dataset is used

# Print the evaluation results
print(results)  # Dict of metrics: loss, precision, recall, F1, accuracy, plus runtime statistics


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1584,0.042731,0.925079,0.940195,0.932576,0.989193
2,0.028,0.036675,0.943535,0.951482,0.947492,0.991094
3,0.0139,0.038044,0.947095,0.952999,0.950038,0.991424


{'eval_loss': 0.03804400935769081, 'eval_precision': 0.9470952620123891, 'eval_recall': 0.9529986522911051, 'eval_f1': 0.950037786547989, 'eval_accuracy': 0.9914239701973263, 'eval_runtime': 24.4319, 'eval_samples_per_second': 141.864, 'eval_steps_per_second': 8.882, 'epoch': 3.0}


In [None]:
# Evaluate the model on the test set.
# Relies on `trainer` and `tokenized_dataset` created in the previous cell; the test
# split is passed explicitly so it replaces the Trainer's validation split for this call.
test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test set evaluation results:", test_results)


Test set evaluation results: {'eval_loss': 0.115883968770504, 'eval_precision': 0.9049117287187555, 'eval_recall': 0.9172572643515238, 'eval_f1': 0.9110426748790145, 'eval_accuracy': 0.9826573486537472, 'eval_runtime': 26.2137, 'eval_samples_per_second': 140.537, 'eval_steps_per_second': 8.812, 'epoch': 3.0}
