<a href="https://colab.research.google.com/github/surya1604/Hybrid-NER/blob/main/Model/Bert_Hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sqlalchemy<2.0
!pip show sqlalchemy


/bin/bash: line 1: 2.0: No such file or directory
Name: SQLAlchemy
Version: 2.0.36
Summary: Database Abstraction Library
Home-page: https://www.sqlalchemy.org
Author: Mike Bayer
Author-email: mike_mp@zzzcomputing.com
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: greenlet, typing-extensions
Required-by: alembic, bigframes, dataset, ipython-sql, langchain


In [None]:
# Install the notebook's dependencies; only fsspec is pinned to an exact version here.
!pip install fsspec==2024.10.0
# -q reduces pip's output for this command.
!pip install transformers tokenizers seqeval -q
!pip install datasets
!pip install evaluate


In [None]:
import datasets
import re
import numpy as np
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, TrainerCallback
import evaluate
import matplotlib.pyplot as plt

In [None]:
# Load the dataset registered under the name "conll2003".
conll2003 = datasets.load_dataset("conll2003")
# Fast tokenizer for the "bert-base-uncased" checkpoint (the same checkpoint the model is loaded from later).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# seqeval metric object; compute_metrics calls `metric.compute(...)` on it.
metric = evaluate.load("seqeval")

In [None]:
# Display the object returned by load_dataset.
conll2003

In [None]:
# Display the shape reported for the loaded dataset.
conll2003.shape

In [None]:
# Display the first training example (before any rule-based relabeling is applied).
conll2003["train"][0]

In [None]:
# Display the `ner_tags` feature definition of the train split; its `.feature.names`
# list is what later cells use to size the model head and to name labels.
conll2003["train"].features["ner_tags"]

In [None]:
# Display the description metadata attached to the train split.
conll2003['train'].description

In [None]:
# Compiled once at definition time instead of being re-parsed for every token.
_ALL_UPPERCASE_RE = re.compile(r"^[A-Z]+$")
_CONTAINS_DIGIT_RE = re.compile(r".*\d+.*")


def rule_based_labeling(tokens, upper_label=3, digit_label=4):
    """Assign a heuristic label id to every token using two regex rules.

    Rules are tried in order and the first match wins:

    1. the token consists only of ASCII uppercase letters -> ``upper_label``
    2. the token contains at least one digit              -> ``digit_label``
    3. otherwise                                          -> ``0``

    ``0`` is kept fixed because ``apply_rules_and_merge`` treats a rule tag of
    0 as "no rule fired" and falls back to the gold tag.

    NOTE(review): the original comments described ids 3 and 4 as ``B-MISC``
    and ``B-NUM``. In the standard Hugging Face ``conll2003`` label list,
    index 3 is ``B-ORG`` and index 4 is ``I-ORG``, and there is no NUM tag.
    Verify against ``conll2003["train"].features["ner_tags"]`` and pass the
    intended ids explicitly if the defaults are not what is wanted.

    Args:
        tokens: Iterable of token strings.
        upper_label: Label id for all-uppercase tokens (default 3, as before).
        digit_label: Label id for tokens containing a digit (default 4, as before).

    Returns:
        A list of label ids, one per token, in the same order as ``tokens``.
    """
    labels = []
    for token in tokens:
        if _ALL_UPPERCASE_RE.match(token):
            labels.append(upper_label)
        elif _CONTAINS_DIGIT_RE.match(token):
            labels.append(digit_label)
        else:
            labels.append(0)  # No rule fired
    return labels

In [None]:
def apply_rules_and_merge(example):
    """Overlay rule-based tags on one example's existing NER tags.

    For each token, the tag produced by ``rule_based_labeling`` replaces the
    existing tag whenever it is non-zero; a rule tag of 0 leaves the existing
    tag in place. The example's ``"ner_tags"`` entry is overwritten with the
    merged list and the same example object is returned.
    """
    words, gold_tags = example["tokens"], example["ner_tags"]
    rule_tags = rule_based_labeling(words)

    merged = []
    for rule_tag, gold_tag in zip(rule_tags, gold_tags):
        # 0 means no rule fired for this token, so the existing tag is kept.
        merged.append(gold_tag if rule_tag == 0 else rule_tag)

    example["ner_tags"] = merged
    return example

In [None]:
# Apply the rule/gold merge to every example and rebind `conll2003` to the result,
# so the original tags are no longer reachable through this name.
# NOTE(review): if `.map` runs over all splits, validation/test labels are rewritten by
# the rules as well, and metrics are then measured against rule-modified labels — confirm
# that this is intended.
conll2003 = conll2003.map(apply_rules_and_merge)

In [None]:
# Re-display the first training example, now with the merged rule-based tags.
conll2003['train'][0]

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    The module-level ``tokenizer`` is called with ``is_split_into_words=True``,
    and ``word_ids`` is used to map every produced token back to the index of
    the word it came from.

    Args:
        examples: Batch with a ``"tokens"`` column (lists of words) and a
            ``"ner_tags"`` column (one label id per word).
        label_all_tokens: If True (default, the previous behaviour), every
            sub-token of a word receives that word's label unchanged — so a
            ``B-`` tag is repeated on continuation pieces. If False, only the
            first sub-token of each word keeps the label and continuation
            pieces are set to -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per example. Tokens that map to no word (``word_ids``
        yields ``None``) are labelled -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens
            elif label_all_tokens or word_idx != previous_word_idx:
                # First sub-token of a word, or any sub-token when labelling all of them
                label_ids.append(word_labels[word_idx])
            else:
                label_ids.append(-100)  # Continuation sub-token, masked out
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Run tokenize_and_align_labels over the dataset in batches.
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

# Load the pretrained checkpoint for token classification; the number of output
# labels is taken from the dataset's tag-name list.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(conll2003["train"].features["ner_tags"].feature.names),
    hidden_dropout_prob=0.2,
)

In [None]:


args = TrainingArguments(
    "test-ner",
    # --- optimisation ---
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Upper bound on epochs; the EarlyStoppingCallback passed to the Trainer can stop sooner.
    num_train_epochs=10,
    # --- evaluation and checkpointing, both once per epoch ---
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # --- best-checkpoint selection (needed by EarlyStoppingCallback) ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- logging ---
    logging_dir="./logs",
)


In [None]:
# Define Data Collator (built from the same tokenizer used to tokenize the dataset)
data_collator = DataCollatorForTokenClassification(tokenizer)

# Tag names from the dataset's `ner_tags` feature; compute_metrics indexes this
# list to turn label ids into tag strings.
label_list = conll2003["train"].features["ner_tags"].feature.names


In [None]:
def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into overall seqeval scores.

    ``eval_preds`` is unpacked into ``(logits, label_ids)``. The logits are
    reduced with ``argmax`` over axis 2, positions whose label id is -100 are
    dropped, and the remaining ids are mapped to tag strings via the
    module-level ``label_list`` before being passed to ``metric.compute``.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the metric's ``overall_*`` entries.
    """
    logits, label_ids = eval_preds
    predicted_ids = np.argmax(logits, axis=2)

    predictions, true_labels = [], []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        pred_tags, gold_tags = [], []
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id == -100:
                # Ignored position: skipped on both the prediction and reference side.
                continue
            pred_tags.append(label_list[pred_id])
            gold_tags.append(label_list[label_id])
        predictions.append(pred_tags)
        true_labels.append(gold_tags)

    scores = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

In [None]:
# Callback that collects the loss values passed to `on_log`
class LossLoggerCallback(TrainerCallback):
    """Accumulate logged training and evaluation losses in two lists.

    ``train_losses`` receives every ``"loss"`` value and ``eval_losses`` every
    ``"eval_loss"`` value seen in the ``logs`` dict passed to ``on_log``. The
    two lists grow independently, so they are not guaranteed to have the same
    length.
    """

    def __init__(self):
        self.train_losses, self.eval_losses = [], []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append any loss values present in ``logs`` to the matching list."""
        if logs is None:
            return
        for key, bucket in (("loss", self.train_losses), ("eval_loss", self.eval_losses)):
            if key in logs:
                bucket.append(logs[key])

# Single instance shared with the Trainer and read back after training
loss_logger = LossLoggerCallback()

In [None]:
from transformers import EarlyStoppingCallback

# Assemble the Trainer from the objects built in the previous cells.
# NOTE(review): recent transformers releases deprecate `tokenizer=` here in favour of
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[
        # Records logged losses for the plot cell.
        loss_logger,
        # Early stopping with a patience of 3 evaluations.
        EarlyStoppingCallback(early_stopping_patience=3),
    ],
)

# Run training; as the cell's last expression, the returned value is displayed.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2619,0.083841,0.918002,0.927841,0.922895,0.978172
2,0.069,0.07311,0.939538,0.938231,0.938884,0.981937
3,0.0403,0.070357,0.941382,0.947312,0.944338,0.983605
4,0.0252,0.072256,0.940738,0.949358,0.945028,0.983605
5,0.0188,0.074625,0.944859,0.953285,0.949053,0.984876
6,0.0126,0.076707,0.947142,0.952876,0.95,0.985051


TrainOutput(global_step=5268, training_loss=0.0605846011195114, metrics={'train_runtime': 936.9954, 'train_samples_per_second': 149.851, 'train_steps_per_second': 9.37, 'total_flos': 2045112348578508.0, 'train_loss': 0.0605846011195114, 'epoch': 6.0})

In [None]:
# Plot the training and validation losses against the epoch at which each was logged.
#
# The two series are recorded on different schedules: validation loss once per epoch
# (evaluation_strategy="epoch"), training loss on the Trainer's logging schedule, which
# `args` does not set and therefore follows the library default. Plotting both by list
# index would put them on incompatible x-axes, so each value is paired with the "epoch"
# field stored alongside it in `trainer.state.log_history`.
history = trainer.state.log_history
train_curve = [(entry["epoch"], entry["loss"]) for entry in history if "loss" in entry]
eval_curve = [(entry["epoch"], entry["eval_loss"]) for entry in history if "eval_loss" in entry]

plt.figure(figsize=(10, 6))
for curve, label, color in (
    (train_curve, "Training Loss", "blue"),
    (eval_curve, "Validation Loss", "orange"),
):
    if curve:  # skip a series that has no logged points
        epochs, losses = zip(*curve)
        plt.plot(epochs, losses, label=label, color=color, marker="o")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Epochs")
plt.legend()
plt.grid(True)
plt.show()
