<a href="https://colab.research.google.com/github/technologyhamed/Natural_Language_Processing/blob/main/Named_Entity_Recognition_(NER)_model__BERT_0_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)


In [None]:
# Toy annotated sentences: each record pairs a list of words with one integer
# label per word. Swap these records for the real corpus.
data = [
    dict(tokens=["John", "Doe", "is", "from", "New", "York"], labels=[0, 0, 1, 1, 0, 0]),
    dict(tokens=["Jane", "Smith", "lives", "in", "London"], labels=[0, 0, 1, 1, 0]),
]

# Records -> pandas DataFrame -> Hugging Face Dataset.
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# The tokenizer and the model are loaded from the same pretrained checkpoint.
checkpoint = "bert-base-multilingual-cased"
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
# num_labels sets how many classes the token-classification head predicts;
# change it to match the size of your tag set.
model = BertForTokenClassification.from_pretrained(checkpoint, num_labels=3)

# Tokenize data
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and expand word-level labels to token level.

    Args:
        examples: Batch mapping with "tokens" (a list of word lists) and
            "labels" (a list of per-word label lists, parallel to "tokens").

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding one label list per example, aligned to the produced tokens.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        token_labels = []
        for word_idx in encoded.word_ids(batch_index=row):
            if word_idx is None:
                # No source word behind this position (e.g. a special token):
                # mark it with the ignore value -100.
                token_labels.append(-100)
            else:
                # Every token produced from a word inherits that word's label.
                token_labels.append(word_labels[word_idx])
        aligned.append(token_labels)

    encoded["labels"] = aligned
    return encoded

# Apply the alignment function to the dataset in batches; the values it
# returns become columns of the resulting dataset.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)


In [4]:
# Display the dataset summary (feature names and row count) to confirm the
# tokenizer columns were added next to the original ones.
tokenized_dataset

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})

In [6]:
# Split dataset into train and validation sets. A fixed seed keeps the split
# identical across kernel restarts, and the splits are picked by key instead
# of relying on the order of the returned mapping.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits["train"]
validation_dataset = dataset_splits["test"]

# Pad inputs and labels per batch. The tokenized examples are not padded, so
# without a padding collator a batch of sequences with different lengths
# cannot be stacked into tensors once there is more than one example per batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use the split training set
    eval_dataset=validation_dataset,  # Use the split validation set
    data_collator=data_collator,
)

# Train the model; the returned TrainOutput is shown as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.960298
2,No log,0.84215
3,No log,0.795064


TrainOutput(global_step=3, training_loss=0.8712170918782552, metrics={'train_runtime': 28.2736, 'train_samples_per_second': 0.106, 'train_steps_per_second': 0.106, 'total_flos': 12248396208.0, 'train_loss': 0.8712170918782552, 'epoch': 3.0})

In [None]:
!pip install seqeval  # Install the missing library

In [None]:
# Load the metric
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases;
# confirm the installed version still provides it.
metric = load_metric("seqeval")

# Tag name for every label id, indexed by id. Reading it from the model
# config keeps the list in step with the classification head.
# NOTE(review): if no id2label mapping was configured these are presumably
# generic placeholder names that seqeval cannot read as IOB tags; set real
# tag names on the model config for meaningful entity-level scores.
label_list = [model.config.id2label[i] for i in range(model.config.num_labels)]


# Compute metrics
def compute_metrics(p):
    """Score token-classification predictions with the seqeval metric.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with argmax over axis 2; ``labels`` holds
            the reference label ids, with -100 marking positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Remove ignored index (special tokens) and map the remaining ids to tag
    # names. The reference label decides which positions are kept, so the
    # prediction and reference sequences stay the same length.
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# The trainer was created without a metrics function, so attach it before
# evaluating; otherwise evaluate() reports the loss but none of these scores.
trainer.compute_metrics = compute_metrics
trainer.evaluate()
