In [None]:
# Install required dependencies (run this first)
!pip install pandas numpy scikit-learn matplotlib seaborn transformers torch datasets kaggle gdown sqlalchemy pydantic-settings

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load training data with proper error handling
import sys
from pathlib import Path
sys.path.insert(0, str(Path('../../')))

from ml.utils.data_loader import load_document_data
import pandas as pd

# Built-in (text, label) rows used whenever real training data is unavailable.
# The first BASIC_SAMPLE_SIZE rows form the "minimal" sample; the full list is
# used when loading raises an exception.
SAMPLE_DOCUMENTS = [
    ('Invoice #001\nDate: 2024-01-01\nAmount: $100', 'invoice'),
    ('Purchase Order PO-001\nSupplier: ABC Corp', 'purchase_order'),
    ('Receipt #RCP-001\nPayment: $50\nThank you!', 'receipt'),
    ('Quotation QT-001\nDate: 2024-01-15\nEstimate: $500', 'quotation'),
    ('Delivery Order DO-001\nDate: 2024-01-20\nReceived: 10 units', 'delivery_order'),
]
REQUIRED_COLUMNS = ('text', 'label')
BASIC_SAMPLE_SIZE = 3


def make_sample_data(n_samples=None):
    """Return a small built-in dataset with 'text' and 'label' columns.

    Args:
        n_samples: Number of leading rows of SAMPLE_DOCUMENTS to include.
            None (the default) returns every row.

    Returns:
        A new pandas DataFrame with columns ['text', 'label'].
    """
    return pd.DataFrame(SAMPLE_DOCUMENTS[:n_samples], columns=list(REQUIRED_COLUMNS))


def ensure_valid_dataset(candidate):
    """Validate a loaded dataset, substituting the minimal sample when unusable.

    Args:
        candidate: The value returned by the data loader.

    Returns:
        `candidate` itself when it is a DataFrame containing every column in
        REQUIRED_COLUMNS; otherwise the first BASIC_SAMPLE_SIZE sample rows
        (when `candidate` is None or lacks a required column).

    Raises:
        TypeError: If `candidate` is neither None nor a pandas DataFrame.
    """
    if candidate is None:
        print("⚠️ Warning: load_document_data returned None. Creating minimal sample...")
        return make_sample_data(BASIC_SAMPLE_SIZE)

    # Fail with an explicit message instead of an incidental AttributeError on
    # `.columns`; the caller's except-branch turns this into the sample fallback.
    if not isinstance(candidate, pd.DataFrame):
        raise TypeError(
            f"load_document_data returned {type(candidate).__name__}, "
            "expected a pandas DataFrame"
        )

    if any(column not in candidate.columns for column in REQUIRED_COLUMNS):
        print("⚠️ Warning: DataFrame missing required columns. Creating sample...")
        return make_sample_data(BASIC_SAMPLE_SIZE)

    return candidate


def summarize_dataset(data):
    """Print shape, label counts, head and text-length statistics for `data`.

    Reporting is best-effort: a failure while summarizing (for example a
    non-string 'text' column) is printed as a warning and never modifies or
    replaces `data`.
    """
    try:
        print("\n✓ Dataset loaded successfully!")
        print(f"  Shape: {data.shape}")
        print("\n  Label distribution:")
        print(data['label'].value_counts())
        print("\n  First few samples:")
        print(data.head())
        print(f"\n  Sample text length: {data['text'].str.len().describe()}")
    except Exception as e:
        print(f"⚠️ Warning: could not fully summarize dataset: {e}")


print("=" * 60)
print("Loading training data...")
print("=" * 60)

try:
    # Only loading and validation are guarded here, so that a failure while
    # printing the summary can no longer discard a successfully loaded dataset.
    df = ensure_valid_dataset(load_document_data())
except Exception as e:
    print(f"❌ Error loading data: {e}")
    print("Creating minimal sample data...")
    df = make_sample_data()
    print(f"Created sample dataset with {len(df)} samples")
    print(f"Shape: {df.shape}")
else:
    summarize_dataset(df)


Loading training data...
Could not load from database: No module named 'psycopg2'
No local data found. Attempting to download sample data from public datasets...
Trying Hugging Face datasets...
  Trying ag_news...


README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

  ✓ Loaded 100 samples from ag_news
Downloaded 100 samples. Saved to c:\Users\User\Documents\Programming\tastar\backend\ml\notebooks\..\..\ml\data\processed\document_classification.csv

✓ Dataset loaded successfully!
  Shape: (100, 2)

  Label distribution:
label
general    96
invoice     4
Name: count, dtype: int64

  First few samples:
                                                text    label
0  Wall St. Bears Claw Back Into the Black (Reute...  general
1  Carlyle Looks Toward Commercial Aerospace (Reu...  general
2  Oil and Economy Cloud Stocks' Outlook (Reuters...  general
3  Iraq Halts Oil Exports from Main Southern Pipe...  general
4  Oil prices soar to all-time record, posing new...  general

  Sample text length: count    100.000000
mean     232.350000
std       86.301489
min      100.000000
25%      157.750000
50%      231.500000
75%      274.250000
max      500.000000
Name: text, dtype: float64
