## **Custom NER with Transformers (Hugging Face)**

In [9]:
!pip install transformers torch datasets seqeval accelerate



In [10]:
import os
import warnings

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    pipeline
)
from datasets import Dataset
import numpy as np

# Sample training data
TRAIN_DATA = [
    {
        "tokens": ["Apple", "Inc.", "is", "headquartered", "in", "Cupertino", "California"],
        "labels": ["B-ORG", "I-ORG", "O", "O", "O", "B-LOC", "B-LOC"]
    },
    {
        "tokens": ["Tim", "Cook", "is", "the", "CEO", "of", "Apple"],
        "labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-ORG"]
    },
    {
        "tokens": ["Microsoft", "was", "founded", "by", "Bill", "Gates"],
        "labels": ["B-ORG", "O", "O", "O", "B-PER", "I-PER"]
    },
    {
        "tokens": ["Google", "headquarters", "are", "in", "Mountain", "View"],
        "labels": ["B-ORG", "O", "O", "O", "B-LOC", "I-LOC"]
    },
    {
        "tokens": ["Elon", "Musk", "founded", "SpaceX", "in", "California"],
        "labels": ["B-PER", "I-PER", "O", "B-ORG", "O", "B-LOC"]
    },
    {
        "tokens": ["Amazon", "was", "started", "by", "Jeff", "Bezos"],
        "labels": ["B-ORG", "O", "O", "O", "B-PER", "I-PER"]
    }
]

class SimpleNERTrainer:
    def __init__(self):
        self.label2id = None
        self.id2label = None
        self.tokenizer = None
        self.model = None

    def setup_labels(self):
        """Create label mappings"""
        all_labels = set()
        for example in TRAIN_DATA:
            all_labels.update(example["labels"])

        label_list = sorted(list(all_labels))
        self.label2id = {label: i for i, label in enumerate(label_list)}
        self.id2label = {i: label for label, i in self.label2id.items()}

        print(f"Labels: {label_list}")
        return self.label2id, self.id2label

    def prepare_dataset(self):
        """Prepare dataset for training"""

        def tokenize_and_align_labels(examples):
            tokenized_inputs = self.tokenizer(
                examples["tokens"],
                truncation=True,
                is_split_into_words=True,
                padding=True
            )

            labels = []
            for i, label_list in enumerate(examples["labels"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []

                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        label_ids.append(self.label2id[label_list[word_idx]])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx

                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        # Prepare data
        tokens_list = [example["tokens"] for example in TRAIN_DATA]
        labels_list = [example["labels"] for example in TRAIN_DATA]

        dataset_dict = {
            "tokens": tokens_list,
            "labels": labels_list
        }

        dataset = Dataset.from_dict(dataset_dict)
        tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

        return tokenized_dataset

    def compute_metrics(self, eval_pred):
        """Simple accuracy computation"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=2)

        correct = 0
        total = 0

        for pred_seq, label_seq in zip(predictions, labels):
            for pred, label in zip(pred_seq, label_seq):
                if label != -100:
                    total += 1
                    if pred == label:
                        correct += 1

        accuracy = correct / total if total > 0 else 0
        return {"accuracy": accuracy}

    def train(self):
        """Train the NER model"""
        print("Setting up training...")

        # Setup labels
        self.setup_labels()

        # Load model and tokenizer
        model_name = "distilbert-base-uncased"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(self.label2id),
            id2label=self.id2label,
            label2id=self.label2id
        )

        # Prepare dataset
        dataset = self.prepare_dataset()

        # Data collator
        data_collator = DataCollatorForTokenClassification(self.tokenizer)

        # Training arguments - explicitly disable all tracking
        training_args = TrainingArguments(
            output_dir="./ner-model-clean",
            per_device_train_batch_size=8,
            num_train_epochs=3,
            logging_steps=10,
            save_steps=500,
            learning_rate=5e-5,
            report_to=[],  # Empty list to disable all reporting
            logging_dir=None,  # Disable logging directory
            save_total_limit=1,
        )

        # Create trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
        )

        print("Training started...")
        trainer.train()

        print("Saving model...")
        trainer.save_model("./ner-model-final")
        self.tokenizer.save_pretrained("./ner-model-final")

        return self.model, self.tokenizer

    def test_model(self):
        """Test the trained model"""
        print("\nLoading and testing model...")

        try:
            # Load model
            tokenizer = AutoTokenizer.from_pretrained("./ner-model-final")
            model = AutoModelForTokenClassification.from_pretrained("./ner-model-final")

            # Create pipeline
            ner_pipeline = pipeline(
                "ner",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="simple"
            )

            # Test sentences
            test_sentences = [
                "Apple Inc. is based in California.",
                "Tim Cook is the CEO of Apple.",
                "Microsoft was founded by Bill Gates.",
                "Google headquarters are in Mountain View.",
                "Jeff Bezos started Amazon.",
                "Tesla is located in Austin Texas."
            ]

            print("\n" + "="*50)
            print("TEST RESULTS")
            print("="*50)

            for i, sentence in enumerate(test_sentences, 1):
                print(f"\n{i}. Text: {sentence}")

                try:
                    entities = ner_pipeline(sentence)
                    if entities:
                        print("   Entities found:")
                        for entity in entities:
                            print(f"      • {entity['word']} → {entity['entity_group']} "
                                  f"(confidence: {entity['score']:.3f})")
                    else:
                        print("   No entities detected")

                except Exception as e:
                    print(f"   Error: {e}")

            return ner_pipeline

        except Exception as e:
            print(f"Error loading model: {e}")
            return None

    def quick_test(self, text="Apple Inc. is in California"):
        """Quick manual test"""
        print(f"\nQuick test with: '{text}'")

        try:
            tokenizer = AutoTokenizer.from_pretrained("./ner-model-final")
            model = AutoModelForTokenClassification.from_pretrained("./ner-model-final")

            inputs = tokenizer(text, return_tensors="pt")

            with torch.no_grad():
                outputs = model(**inputs)

            predictions = torch.argmax(outputs.logits, dim=2)
            tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

            print("Token predictions:")
            for token, pred_id in zip(tokens, predictions[0]):
                if token not in ['[CLS]', '[SEP]', '[PAD]']:
                    label = model.config.id2label[pred_id.item()]
                    print(f"  {token} → {label}")

        except Exception as e:
            print(f"Quick test error: {e}")

def main():
    """Main function"""
    print("Custom NER Training - Clean Version")
    print("="*50)

    trainer = SimpleNERTrainer()

    try:
        # Train
        model, tokenizer = trainer.train()

        # Test
        trainer.test_model()

        # Quick test
        trainer.quick_test()

        print("\n" + "="*50)
        print("✓ Training completed successfully!")
        print("✓ Model saved to: ./ner-model-final")

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Custom NER Training - Clean Version
Setting up training...
Labels: ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Training started...


Step,Training Loss


Saving model...


Device set to use cuda:0



Loading and testing model...

TEST RESULTS

1. Text: Apple Inc. is based in California.
   No entities detected

2. Text: Tim Cook is the CEO of Apple.
   Entities found:
      • tim → PER (confidence: 0.212)

3. Text: Microsoft was founded by Bill Gates.
   No entities detected

4. Text: Google headquarters are in Mountain View.
   No entities detected

5. Text: Jeff Bezos started Amazon.
   No entities detected

6. Text: Tesla is located in Austin Texas.
   No entities detected

Quick test with: 'Apple Inc. is in California'
Token predictions:
  apple → O
  inc → O
  . → O
  is → O
  in → O
  california → O

✓ Training completed successfully!
✓ Model saved to: ./ner-model-final
