# Entity Extraction (NER) Model Training

This notebook trains a Named Entity Recognition (NER) model to extract entities from business documents:
- Invoice numbers
- Amounts
- Dates
- Email addresses
- Phone numbers
- Tax IDs
- Company names

## Approach
1. **Method A:** Regex + Rule-based (fast, no training needed)
2. **Method B:** spaCy NER model (pre-trained + fine-tuning)
3. **Method C:** Transformers (BERT-based NER)


In [None]:
# Install required dependencies (run this first)
!pip install pandas numpy spacy transformers torch datasets
!python -m spacy download en_core_web_sm


In [None]:
import json
import pickle
import re
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd

# Directory layout. BASE_DIR is a relative path, so everything below is
# resolved against the current working directory of the kernel.
BASE_DIR = Path('../')
DATA_DIR = BASE_DIR.joinpath('data')
MODELS_DIR = BASE_DIR.joinpath('models', 'entity_extractor')

# Create the output directory for saved extractors (no error if it already exists).
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print(
    f"Base directory: {BASE_DIR}",
    f"Data directory: {DATA_DIR}",
    f"Models directory: {MODELS_DIR}",
    sep="\n",
)


## 1. Load and Prepare Data


In [None]:
# Hand-written sample documents. Each record pairs the raw document text with
# the entity values annotated for it, keyed by entity type.
sample_documents = [
    {'text': text, 'entities': entities}
    for text, entities in (
        (
            'Invoice #INV-001\nDate: 2024-01-15\nAmount: $1,500.00\nEmail: contact@company.com\nPhone: 555-1234',
            {
                'invoice_number': 'INV-001',
                'date': '2024-01-15',
                'amount': '$1,500.00',
                'email': 'contact@company.com',
                'phone': '555-1234',
            },
        ),
        (
            'Purchase Order PO-2024-001\nSupplier: ABC Corp\nOrder Date: 01/20/2024\nTotal: $5,000.00',
            {
                'po_number': 'PO-2024-001',
                'company_name': 'ABC Corp',
                'date': '01/20/2024',
                'amount': '$5,000.00',
            },
        ),
        (
            'Receipt #RCP-789\nPayment Date: 2024-02-01\nAmount Paid: $750.25\nTax ID: 12-3456789',
            {
                'receipt_number': 'RCP-789',
                'date': '2024-02-01',
                'amount': '$750.25',
                'tax_id': '12-3456789',
            },
        ),
    )
]

# One row per document, with columns 'text' and 'entities'.
df = pd.DataFrame(sample_documents)

# Show the record count and the first document alongside its annotations.
print(f"✓ Created {len(df)} sample documents with entity annotations")
print("\nSample document:", df['text'].iloc[0], sep="\n")
print("\nEntities:", df['entities'].iloc[0], sep="\n")


## 2. Method A: Rule-based Entity Extraction (Fast, No Training)


In [None]:
def _keyword_id_pattern(keywords: str) -> "re.Pattern[str]":
    """Build a case-insensitive regex capturing the identifier that follows one of `keywords`.

    `keywords` is a regex alternation such as ``'invoice|inv'``.
    """
    return re.compile(
        # The keyword must start a word, so "po" no longer fires inside "Report".
        rf'\b(?:{keywords})'
        # Optional label between keyword and value: "Invoice Number: 1", "PO No. 7".
        r'(?:\s*(?:number|num|no)\b\.?)?'
        r'[\s#:]*'
        # The identifier must contain at least one digit, so "Invoice Date:" does
        # not yield "Date" as an invoice number.
        r'((?=[A-Z0-9\-]*\d)[A-Z0-9\-]+)',
        re.IGNORECASE,
    )


# Compiled once here rather than on every call.
_INVOICE_RE = _keyword_id_pattern(r'invoice|inv')
_PO_RE = _keyword_id_pattern(r'po|purchase\s*order')
_TAX_ID_RE = _keyword_id_pattern(r'tax\s*id|ein|vat')
# A number must start and end with a digit, so trailing "," or "." from the
# surrounding sentence is not swallowed ("$1,500," -> "$1,500").
_AMOUNT_RE = re.compile(r'\$\d(?:[\d,]*\d)?(?:\.\d+)?')
_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}')
# TLD class is letters only; the previous `[A-Z|a-z]` also accepted a literal "|".
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# 10-digit numbers (separators optional) or 7-digit local numbers such as
# "555-1234". The separator is mandatory in the 7-digit form so that bare digit
# runs (e.g. inside a tax ID) are not picked up as phone numbers.
_PHONE_RE = re.compile(r'\b(?:\d{3}[-.]?\d{3}[-.]?\d{4}|\d{3}[-.]\d{4})\b')


def extract_entities_rule_based(text: str) -> Dict[str, List[str]]:
    """Extract entities from `text` using regex patterns.

    Args:
        text: Raw document text. An empty (or falsy) value yields an empty result.

    Returns:
        A dict mapping entity type ('invoice_numbers', 'po_numbers', 'amounts',
        'dates', 'emails', 'phones', 'tax_ids') to the list of matched strings,
        in order of appearance. Entity types with no match are omitted.
    """
    if not text:
        return {}

    entities = {
        'invoice_numbers': _INVOICE_RE.findall(text),
        'po_numbers': _PO_RE.findall(text),
        'amounts': _AMOUNT_RE.findall(text),
        'dates': _DATE_RE.findall(text),
        'emails': _EMAIL_RE.findall(text),
        'phones': _PHONE_RE.findall(text),
        'tax_ids': _TAX_ID_RE.findall(text),
    }
    # Drop entity types that produced no matches.
    return {k: v for k, v in entities.items() if v}

# Test: run the rule-based extractor on the first sample document.
test_text = df.iloc[0]['text']
extracted = extract_entities_rule_based(test_text)
print(f"Text: {test_text}")
print("\nExtracted entities:")
for entity_type, values in extracted.items():
    print(f"  {entity_type}: {values}")

# Save rule-based extractor (`pickle` comes from the notebook's import cell).
# NOTE: pickle stores a function by reference (module + qualified name), not
# its code. This file can only be unpickled in a process where a function with
# the same name is importable from the same module, so it is not a
# self-contained model artifact.
with open(MODELS_DIR / 'rule_based_extractor.pkl', 'wb') as f:
    pickle.dump(extract_entities_rule_based, f)
print(f"\n✓ Rule-based extractor saved to {MODELS_DIR}")


## 3. Method B: spaCy NER (Pre-trained Model)


In [None]:
try:
    # spaCy is optional. Only the import and the model load are guarded, so a
    # failure in the extraction, demo or save code below surfaces as its own
    # error instead of being reported as "spaCy not available".
    import spacy
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    print(f"spaCy not available: {e}")
    print("Install with: pip install spacy && python -m spacy download en_core_web_sm")
else:
    def extract_entities_spacy(text: str) -> Dict[str, List[str]]:
        """Extract entities using spaCy NER, plus regex for emails and phones.

        Args:
            text: Raw document text, processed with the module-level `nlp` pipeline.

        Returns:
            A dict mapping entity type ('persons', 'organizations', 'dates',
            'money', 'emails', 'phones') to the list of matched strings.
            Entity types with no match are omitted.
        """
        # spaCy entity label -> key in the result dict; other labels are ignored.
        label_to_key = {
            'PERSON': 'persons',
            'ORG': 'organizations',
            'DATE': 'dates',
            'MONEY': 'money',
        }
        entities = {
            'persons': [],
            'organizations': [],
            'dates': [],
            'money': [],
            'emails': [],
            'phones': [],
        }

        for ent in nlp(text).ents:
            key = label_to_key.get(ent.label_)
            if key is not None:
                entities[key].append(ent.text)

        # Emails and phones are extracted with regex rather than from spaCy entities.
        # TLD class is letters only; the previous `[A-Z|a-z]` also accepted a literal "|".
        entities['emails'] = re.findall(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text
        )
        # 10-digit numbers (separators optional) or 7-digit local numbers such
        # as "555-1234" (separator required, so bare digit runs are not matched).
        entities['phones'] = re.findall(
            r'\b(?:\d{3}[-.]?\d{3}[-.]?\d{4}|\d{3}[-.]\d{4})\b', text
        )

        # Drop entity types that produced no matches.
        return {k: v for k, v in entities.items() if v}

    # Test: run the spaCy extractor on the first sample document.
    test_text = df.iloc[0]['text']
    extracted = extract_entities_spacy(test_text)
    print(f"Text: {test_text}")
    print("\nExtracted entities (spaCy):")
    for entity_type, values in extracted.items():
        print(f"  {entity_type}: {values}")

    # Save.
    # NOTE: pickle stores a function by reference (module + qualified name), not
    # its code or the `nlp` pipeline it uses. The file can only be unpickled
    # where a same-named function is importable from the same module.
    with open(MODELS_DIR / 'spacy_extractor.pkl', 'wb') as f:
        pickle.dump(extract_entities_spacy, f)
    print(f"\n✓ spaCy extractor saved to {MODELS_DIR}")
