# Invoice Data Extraction Model Training

This notebook trains a specialized model to extract structured data from invoices:
- Invoice number
- Invoice date
- Due date
- Vendor name
- Line items
- Subtotal, tax, total amounts
- Payment terms

## Approach
1. **Option A:** LayoutLM (Document AI model) - Best for structured documents
2. **Option B:** Custom NER model fine-tuned on invoices
3. **Option C:** Template matching + OCR (for known invoice formats)


In [None]:
# Install required dependencies (run this first)
!pip install pandas numpy transformers torch datasets pillow pytesseract pdf2image


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import json
from typing import Dict, List

# Project layout, resolved relative to the notebook's working directory
BASE_DIR = Path('../')
DATA_DIR = BASE_DIR.joinpath('data')
MODELS_DIR = BASE_DIR.joinpath('models', 'invoice_extractor')

# The extractor is written here later, so make sure the folder exists up front
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Report the three locations, one per line
print("\n".join(
    f"{label} directory: {location}"
    for label, location in (
        ('Base', BASE_DIR),
        ('Data', DATA_DIR),
        ('Models', MODELS_DIR),
    )
))


## 1. Load and Prepare Invoice Data


In [None]:
# Demo dataset: each record pairs the raw invoice text with the structured
# fields expected to be extracted from it (the ground truth).
sample_invoices = [
    dict(
        text='''INVOICE
Invoice Number: INV-2024-001
Date: January 15, 2024
Due Date: February 14, 2024

Bill To:
ABC Company
123 Main Street
City, State 12345

Item Description          Quantity    Unit Price    Total
Widget A                  10          $50.00        $500.00
Widget B                  5           $30.00        $150.00

Subtotal: $650.00
Tax (10%): $65.00
Total: $715.00

Payment Terms: Net 30
Thank you for your business!''',
        extracted=dict(
            invoice_number='INV-2024-001',
            invoice_date='January 15, 2024',
            due_date='February 14, 2024',
            vendor_name=None,
            customer_name='ABC Company',
            subtotal=650.00,
            tax=65.00,
            total=715.00,
            payment_terms='Net 30',
            line_items=[
                dict(description='Widget A', quantity=10, unit_price=50.00, total=500.00),
                dict(description='Widget B', quantity=5, unit_price=30.00, total=150.00),
            ],
        ),
    ),
]

# One row per invoice, with a 'text' column and an 'extracted' column
df = pd.DataFrame(data=sample_invoices)
print(f"✓ Created {len(df)} sample invoices")

# Preview the first record: a slice of the raw text, then its labelled fields
print("\nSample invoice text (first 200 chars):")
print(df['text'].iloc[0][:200])
print("\nExtracted data:")
print(json.dumps(df['extracted'].iloc[0], indent=2))


## 2. Rule-based Invoice Extraction


In [None]:
# A date written numerically (01/15/2024, 1-15-24) or as "Month DD, YYYY".
_DATE_VALUE = r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\w+\s+\d{1,2},?\s+\d{4})'

# A monetary figure. It must start with a digit so a lone "," can never reach
# float(), and it must not be a percentage (e.g. the "10" in "Tax 10%").
_AMOUNT_VALUE = r'(\d[\d,]*(?:\.\d+)?)(?![\d,.]*\s*%)'

# "Date:", "Invoice Date:" or "Due Date:" followed by a date.
# Group 1 is the optional qualifier, group 2 the date itself.
_LABELLED_DATE_RE = re.compile(
    r'\b(due\s*|invoice\s*)?date[\s:]*' + _DATE_VALUE, re.IGNORECASE
)

_SUBTOTAL_RE = re.compile(
    r'\bsub[\s-]?total\b[\s:]*\$?\s*' + _AMOUNT_VALUE, re.IGNORECASE
)
# The optional group skips a rate annotation such as "(10%)" or "@ 10%".
_TAX_RE = re.compile(
    r'\btax\b\s*(?:\([^)\n]*\)|@?\s*\d+(?:\.\d+)?\s*%)?[\s:]*\$?\s*' + _AMOUNT_VALUE,
    re.IGNORECASE,
)
# \b and the look-behind keep "total" from matching inside "Subtotal",
# "Sub-total" or "Sub total".
_TOTAL_RE = re.compile(
    r'(?<!sub[\s-])\btotal\b(?:\s+(?:due|amount))?[\s:]*\$?\s*' + _AMOUNT_VALUE,
    re.IGNORECASE,
)


def _first_amount(pattern, text: str):
    """Return the first amount *pattern* captures in *text* as a float, or None."""
    match = pattern.search(text)
    if match is None:
        return None
    return float(match.group(1).replace(',', ''))


def extract_invoice_data(text: str) -> Dict:
    """Extract structured fields from plain invoice text using regular expressions.

    Args:
        text: Raw invoice text (e.g. OCR output). Empty or ``None`` input
            yields a result with every field unset.

    Returns:
        A dict with the keys ``invoice_number``, ``invoice_date``, ``due_date``,
        ``vendor_name``, ``customer_name``, ``subtotal``, ``tax``, ``total``,
        ``payment_terms`` and ``line_items``. Dates and payment terms are
        returned as the matched strings, amounts as floats. Fields that cannot
        be found stay ``None``. ``vendor_name``, ``customer_name`` and
        ``line_items`` are not extracted by this function and keep their
        defaults (``None`` / ``[]``).
    """
    result = {
        'invoice_number': None,
        'invoice_date': None,
        'due_date': None,
        'vendor_name': None,
        'customer_name': None,
        'subtotal': None,
        'tax': None,
        'total': None,
        'payment_terms': None,
        'line_items': []
    }
    if not text:
        return result

    # Invoice number: "Invoice Number: X", "Invoice # X", "Invoice No. X"
    inv_match = re.search(
        r'invoice\s*(?:number|#|no\.?)[\s:]*([A-Z0-9\-]+)', text, re.IGNORECASE
    )
    if inv_match:
        result['invoice_number'] = inv_match.group(1)

    # Dates, pass 1: trust explicit labels so a due date printed before the
    # invoice date is not mistaken for it. The first hit per field wins.
    for match in _LABELLED_DATE_RE.finditer(text):
        qualifier = (match.group(1) or '').lower()
        key = 'due_date' if qualifier.startswith('due') else 'invoice_date'
        if result[key] is None:
            result[key] = match.group(2)

    # Dates, pass 2: fill whatever is still missing by order of appearance
    # (first remaining date -> invoice date, next -> due date).
    already_used = {result['invoice_date'], result['due_date']}
    leftover_dates = [
        found for found in re.findall(_DATE_VALUE, text) if found not in already_used
    ]
    for key in ('invoice_date', 'due_date'):
        if result[key] is None and leftover_dates:
            result[key] = leftover_dates.pop(0)

    # Amounts: the currency symbol is optional, so look for them unconditionally.
    result['subtotal'] = _first_amount(_SUBTOTAL_RE, text)
    result['tax'] = _first_amount(_TAX_RE, text)
    result['total'] = _first_amount(_TOTAL_RE, text)

    # Payment terms: the rest of the line after the label
    terms_match = re.search(r'payment\s*terms?[\s:]*([^\n]+)', text, re.IGNORECASE)
    if terms_match:
        # A whitespace-only remainder means no terms were actually given.
        result['payment_terms'] = terms_match.group(1).strip() or None

    return result

# Smoke test: run the extractor on the first sample invoice and print the
# resulting dict as JSON.
test_text = df.iloc[0]['text']
extracted = extract_invoice_data(test_text)
print("Extracted invoice data:")
# default=str makes json fall back to str() for any value it cannot encode natively.
print(json.dumps(extracted, indent=2, default=str))

# Save the extractor function under MODELS_DIR.
# NOTE(review): pickle serializes a plain function by reference (its module and
# qualified name), not its code. The resulting file can only be loaded where
# `extract_invoice_data` is importable under that same name -- confirm that the
# consumers of this .pkl expect that.
import pickle
with open(MODELS_DIR / 'invoice_extractor.pkl', 'wb') as f:
    pickle.dump(extract_invoice_data, f)
print(f"\n✓ Invoice extractor saved to {MODELS_DIR}")
