<a href="https://colab.research.google.com/github/yusufshihata/InternIntelligence_NER/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate

## Load the Dataset

In [None]:
# Load the CoNLL-2003 dataset by name with `datasets.load_dataset`; the cells
# below read it through the global name `dataset`.
dataset = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Display the loaded object: a DatasetDict with train / validation / test splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Inspect one training example: a list of words plus parallel lists of integer
# tag ids (`pos_tags`, `chunk_tags`, `ner_tags`), one entry per word.
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# Human-readable names of the NER classes, read from the dataset's feature
# metadata; the integer ids in `ner_tags` index into this list.
labels = dataset['train'].features['ner_tags'].feature.names

labels

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Module-level accumulator: every call to `map_word_label` appends to this list.
training_data = []
def map_word_label(data):
    """Append one ``{"tokens": ..., "labels": ...}`` record per example to ``training_data``.

    Args:
        data: Sized, integer-indexable collection whose items provide the keys
            ``'tokens'`` and ``'ner_tags'``.

    Returns:
        None. The records are appended to the module-level ``training_data``
        list, so calling this twice with the same data stores it twice.
    """
    examples = (data[index] for index in range(len(data)))
    training_data.extend(
        {"tokens": example['tokens'], "labels": example['ner_tags']}
        for example in examples
    )

In [None]:
# Fill `training_data` from the train split. `map_word_label` only ever appends,
# so re-running this cell stores the same records a second time.
map_word_label(dataset['train'])

In [None]:
# Spot-check the first converted record (words plus their integer NER tag ids).
training_data[0]

{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'labels': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [1]:
!pip install datasets seqeval evaluate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━

In [2]:
import numpy as np
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

In [3]:
class DatasetPreprocessor:
    """Load a token-classification dataset and tokenize it with word-aligned labels.

    Args:
        tokenizer: Callable tokenizer applied to pre-split words. Its result
            must expose ``word_ids(batch_index=...)`` and accept item
            assignment (``result["labels"] = ...``).
        dataset_name: Name handed to ``load_dataset``.
        max_length: Fixed length, in sub-word tokens, that every example is
            padded or truncated to. Defaults to 25, the value that used to be
            hard-coded; words that do not fit are dropped together with their
            labels, so raise it for corpora with longer sentences.

    Attributes:
        dataset: The object returned by ``load_dataset(dataset_name)``.
        label_list: Tag names read from the train split's ``ner_tags`` feature.
        num_labels: ``len(label_list)``.
    """

    def __init__(self, tokenizer, dataset_name="conll2003", max_length=25):
        self.dataset = load_dataset(dataset_name)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_list = self.dataset["train"].features["ner_tags"].feature.names
        self.num_labels = len(self.label_list)

    @staticmethod
    def _align_labels(word_ids, word_labels):
        """Give the first sub-token of each word that word's label and -100 to every other token."""
        aligned = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation piece of the same word.
                aligned.append(-100)
            else:
                aligned.append(word_labels[word_idx])
            previous_word_idx = word_idx
        return aligned

    def tokenize_and_align_labels(self, examples):
        """Tokenize a batch of pre-split sentences and attach per-token labels.

        Args:
            examples: Batch mapping with ``"tokens"`` (list of word lists) and
                ``"ner_tags"`` (list of per-word label-id lists).

        Returns:
            The tokenizer output with an added ``"labels"`` entry: one list per
            example holding the word's label id on the first sub-token of each
            word and -100 everywhere else (-100 positions are the ones
            ``MetricsComputer.compute`` skips).
        """
        tokenized_inputs = self.tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True,
            padding="max_length", max_length=self.max_length
        )

        tokenized_inputs["labels"] = [
            self._align_labels(tokenized_inputs.word_ids(batch_index=i), labels)
            for i, labels in enumerate(examples["ner_tags"])
        ]
        return tokenized_inputs

    def get_tokenized_dataset(self):
        """Return ``self.dataset`` mapped through ``tokenize_and_align_labels`` in batches."""
        return self.dataset.map(self.tokenize_and_align_labels, batched=True)

In [4]:
class MetricsComputer:
    """Convert raw token-classification outputs into overall seqeval scores.

    Args:
        label_list: Sequence mapping a label id (used as an index) to its tag name.
    """

    def __init__(self, label_list):
        self.label_list = label_list
        self.metric = evaluate.load("seqeval")

    def compute(self, p):
        """Score one evaluation pass.

        Args:
            p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
                scores and is arg-maxed over axis 2; ``labels`` holds the
                reference label ids, with -100 marking positions to skip.

        Returns:
            Dict with the keys ``precision``, ``recall``, ``f1`` and
            ``accuracy``, taken from the metric's ``overall_*`` entries.
        """
        scores, gold_ids = p
        predicted_ids = np.argmax(scores, axis=2)

        true_predictions, true_labels = [], []
        for predicted_row, gold_row in zip(predicted_ids, gold_ids):
            # Keep only the positions that carry a real label.
            scored = [
                (predicted_id, gold_id)
                for predicted_id, gold_id in zip(predicted_row, gold_row)
                if gold_id != -100
            ]
            true_predictions.append([self.label_list[predicted_id] for predicted_id, _ in scored])
            true_labels.append([self.label_list[gold_id] for _, gold_id in scored])

        summary = self.metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": summary["overall_precision"],
            "recall": summary["overall_recall"],
            "f1": summary["overall_f1"],
            "accuracy": summary["overall_accuracy"],
        }

In [9]:
class NERTrainer:
    """Bundle a token-classification model with its training and inference steps.

    Args:
        model: Model to fine-tune; called as ``model(**inputs)`` and expected to
            return an object with a ``logits`` attribute.
        tokenizer: Tokenizer matching ``model``; its output must expose
            ``word_ids(batch_index=...)``.
        train_dataset: Tokenized training split.
        eval_dataset: Tokenized split evaluated at the end of every epoch.
        metric_computer: Object providing ``compute`` (passed to the Trainer as
            ``compute_metrics``) and ``label_list`` (id -> tag name, used by
            ``inference``).
    """

    def __init__(self, model, tokenizer, train_dataset, eval_dataset, metric_computer):
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.metric_computer = metric_computer
        # NOTE(review): this collator presumably pads only the tokenizer's own
        # fields, not "labels"; that works here because DatasetPreprocessor already
        # pads every example to a fixed length -- confirm before switching to
        # dynamic padding.
        self.data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        self.training_args = TrainingArguments(
            output_dir="./results", evaluation_strategy="epoch", learning_rate=2e-5,
            per_device_train_batch_size=16, per_device_eval_batch_size=16,
            num_train_epochs=3, weight_decay=0.01, logging_dir="./logs",
        )

    def train(self):
        """Build a Hugging Face ``Trainer`` from the stored pieces and run it."""
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self.metric_computer.compute,
        )
        trainer.train()

    def inference(self, sentence):
        """Predict one NER tag per whitespace-separated word of ``sentence``.

        Uses only this instance's ``model``, ``tokenizer`` and
        ``metric_computer.label_list``; it does not depend on notebook globals.

        Args:
            sentence: Raw text; it is split on whitespace.

        Returns:
            List of ``(word, tag)`` pairs. Empty for a blank sentence. Words
            that fall beyond the 25-token limit receive no prediction and are
            left out of the result.
        """
        # Naively split the sentence into words
        words = sentence.split()
        if not words:
            return []

        # Tokenize without return_tensors so the word alignment stays available.
        # Same padding/length settings as DatasetPreprocessor uses for training.
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=25,
        )

        # Word index of every sub-token (None for special and padding tokens)
        token_word_ids = encoding.word_ids(batch_index=0)

        # Add a batch dimension and move every field to the model's device
        inputs = {
            k: torch.tensor(v).unsqueeze(0).to(self.model.device)
            for k, v in encoding.items()
        }

        # Evaluation mode, so that any dropout layers do not randomize predictions
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

        # Align predictions: only take the first token prediction for each word
        label_list = self.metric_computer.label_list
        predicted_labels = []
        current_word_idx = None
        for idx, word_idx in enumerate(token_word_ids):
            if word_idx is None:
                continue
            if word_idx != current_word_idx:
                predicted_labels.append(label_list[predictions[idx]])
                current_word_idx = word_idx

        return list(zip(words, predicted_labels))

In [10]:
# One name for the pretrained checkpoint so tokenizer and model cannot drift apart.
MODEL_CHECKPOINT = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
dataset_preprocessor = DatasetPreprocessor(tokenizer)
tokenized_dataset = dataset_preprocessor.get_tokenized_dataset()

# Store the id <-> tag-name mapping in the model config so a saved checkpoint
# reports real tag names instead of generic LABEL_n placeholders.
id2label = dict(enumerate(dataset_preprocessor.label_list))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=dataset_preprocessor.num_labels,
    id2label=id2label,
    label2id=label2id,
)
metric_computer = MetricsComputer(dataset_preprocessor.label_list)
trainer = NERTrainer(
    model, tokenizer, tokenized_dataset["train"], tokenized_dataset["validation"], metric_computer
)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tune the model. `trainer` is the NERTrainer wrapper built in the previous
# cell; its `train()` constructs a Hugging Face `Trainer` and runs it.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1785,0.046993,0.924513,0.933607,0.929038,0.987326
2,0.0326,0.044488,0.934066,0.940574,0.937309,0.988891
3,0.015,0.046103,0.935733,0.942828,0.939267,0.988891


In [14]:
# Interactive demo: `input()` blocks until a sentence is typed, so this cell
# cannot complete unattended (e.g. under "Run All" without a user present).
example_sentence = input("Enter your sentence: ")
# `inference` returns a list of (word, predicted tag) pairs.
result = trainer.inference(example_sentence)
print("NER predictions:")
for word, label in result:
    print(f"{word}: {label}")

Enter your sentence: Intern Intelligence is an Egyptian Company
NER predictions:
Intern: B-ORG
Intelligence: I-ORG
is: O
an: O
Egyptian: B-MISC
Company: O
