# Document Classification Model Training

This notebook trains a model to classify business documents (invoices, purchase orders, receipts, quotations, etc.).

## Approach
1. **Option A:** Fine-tuned Transformer (DistilBERT) - Better accuracy, requires GPU
2. **Option B:** Traditional ML (TF-IDF + Random Forest) - Faster, works on CPU

The plan is to implement both and compare; the cells below cover Option B (TF-IDF + Random Forest).


In [None]:
!pip install pandas numpy sklearn matplotlib seaborn

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Directory layout, expressed relative to the notebook's working directory:
# data lives in <parent>/data, trained artifacts in <parent>/models/document_classifier.
BASE_DIR = Path('../')
DATA_DIR = BASE_DIR.joinpath('data')
MODELS_DIR = BASE_DIR.joinpath('models', 'document_classifier')

# Create the artifact folder (and any missing parents) up front; a no-op if it
# already exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a misconfigured working directory is obvious.
print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"Models directory: {MODELS_DIR}")


ModuleNotFoundError: No module named 'pandas'

## 1. Load and Prepare Data


In [None]:
# Load training data
# NOTE(review): the original note says the loader tries a CSV first and otherwise
# generates sample data; that logic lives in ml/utils/data_loader.py — confirm there.
import sys

# Make the directory two levels up importable so the `ml` package resolves.
# Guarded so that re-running this cell does not keep prepending duplicate entries.
project_root = str(Path('../../'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from ml.utils.data_loader import load_document_data

df = load_document_data()

# Fail fast with a clear message: the training cell below reads both columns.
required_columns = {'text', 'label'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}"
    )

print(f"\nDataset shape: {df.shape}")
print("\nLabel distribution:")
print(df['label'].value_counts())
print("\nFirst few samples:")
print(df.head())


## 2. Train Traditional ML Model (TF-IDF + Random Forest)


In [None]:
# Split data: hold out 20% for evaluation, stratified on the label column so
# both splits follow the label distribution of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Vectorize text: the TF-IDF vocabulary (unigrams + bigrams, capped at 5000
# features) is fitted on the training split only and then reused on the test split.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model (fixed random_state for reproducible trees; n_jobs=-1 uses all cores)
print("Training Random Forest classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_vec, y_train)

# Evaluate on the held-out split
y_pred = rf_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix.
# Pin the row/column order to rf_model.classes_: without `labels=`, the matrix
# only covers labels that occur in y_test or y_pred, so a class missing from
# both would leave the matrix smaller than (and misaligned with) the tick labels.
class_names = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix - Random Forest')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
fig.savefig(MODELS_DIR / 'rf_confusion_matrix.png')
plt.show()

# Save model
# NOTE(review): save_model is a project helper; what it writes into MODELS_DIR
# is defined in ml/utils/model_saver.py — confirm there. The import stays in this
# cell because the `ml` package is only importable after the sys.path setup in
# the data-loading cell.
from ml.utils.model_saver import save_model
save_model(rf_model, vectorizer, MODELS_DIR, accuracy)

print(f"\nModel saved to {MODELS_DIR}")


## 3. Test the Model


In [None]:
# Smoke-test the fitted pipeline on a hand-written, invoice-like snippet
test_doc = "Invoice #INV-999\nDate: 2024-03-01\nTotal: $5,000.00\nDue: 2024-03-31"

# Random Forest prediction: vectorize once with the fitted TF-IDF vocabulary,
# then take both the per-class probabilities and the hard label from it.
test_vec = vectorizer.transform([test_doc])
rf_proba = rf_model.predict_proba(test_vec)[0]
rf_pred = rf_model.predict(test_vec)[0]

# Only the first 50 characters of the document are echoed back
print(f"Test Document: {test_doc[:50]}...")
print(f"\nRandom Forest Prediction: {rf_pred}")
# Confidence is the largest class probability
print(f"Confidence: {rf_proba.max():.2%}")

# Per-class breakdown, in the order of rf_model.classes_
print("\nAll probabilities:")
for label, prob in zip(rf_model.classes_, rf_proba):
    print(f"  {label}: {prob:.2%}")
