<a href="https://colab.research.google.com/github/technologyhamed/Natural_Language_Processing/blob/main/Named_Entity_Recognition_(NER)_model__BERT_0_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets



In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric


In [3]:
# Sample annotated data (replace with your actual data)
data = [
    {"tokens": ["John", "Doe", "is", "from", "New", "York"], "labels": [0, 0, 1, 1, 0, 0]},
    {"tokens": ["Jane", "Smith", "lives", "in", "London"], "labels": [0, 0, 1, 1, 0]}
]

# Convert to DataFrame and then to Dataset
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# Load tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)  # Adjust num_labels

# Tokenize data
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [4]:
tokenized_dataset

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})

In [6]:
# Split dataset into train and validation sets
train_dataset, validation_dataset = tokenized_dataset.train_test_split(test_size=0.2).values()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use the split training set
    eval_dataset=validation_dataset,  # Use the split validation set
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.960298
2,No log,0.84215
3,No log,0.795064


TrainOutput(global_step=3, training_loss=0.8712170918782552, metrics={'train_runtime': 28.2736, 'train_samples_per_second': 0.106, 'train_steps_per_second': 0.106, 'total_flos': 12248396208.0, 'train_loss': 0.8712170918782552, 'epoch': 3.0})

In [8]:
!pip install seqeval  # Install the missing library

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=94a104be5c28a67d1fcea67f1341d2c0e78d3d844113cdf1af3b4b7206f2e31d
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [9]:
# Load the metric
metric = load_metric("seqeval")

# Compute metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

trainer.evaluate()


The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


{'eval_loss': 0.7950640320777893,
 'eval_runtime': 0.1209,
 'eval_samples_per_second': 8.273,
 'eval_steps_per_second': 8.273,
 'epoch': 3.0}