In [None]:
#Cell 0
# Dependencies for the whole notebook: pandas for CSV loading/merging and glob
# for locating result files.
# NOTE(review): Path is imported but not referenced in the cells visible here -
# confirm whether it is still needed.
import pandas as pd
from pathlib import Path
import glob

# Visual confirmation that this cell has been executed.
print("✅ Imports loaded")

In [None]:
#Cell 1
# Configuration: the labelled reference file and the model output to score.
GROUND_TRUTH_PATH = 'evaluation_data/ground_truth.csv'

# Collect every Llama batch result file and order the names lexicographically.
# The "latest" file is simply the name that sorts last (presumably the filename
# suffix is a timestamp - confirm). None means no result file was found.
llama_files = glob.glob('output/csv/llama_batch_results_*.csv')
llama_files.sort()
LLAMA_RESULTS_PATH = max(llama_files, default=None)

print(f"Ground truth: {GROUND_TRUTH_PATH}")
print(f"Llama results: {LLAMA_RESULTS_PATH}")

In [None]:
#Cell 2
# Load data
# LLAMA_RESULTS_PATH is None when no file matched the results glob in the
# configuration cell. Stop here with an actionable message rather than letting
# read_csv fail on a None path.
if LLAMA_RESULTS_PATH is None:
    raise FileNotFoundError(
        "No Llama results file found matching "
        "'output/csv/llama_batch_results_*.csv'; generate a results file first."
    )

# gt: labelled reference data; llama: the model's batch output.
gt = pd.read_csv(GROUND_TRUTH_PATH)
llama = pd.read_csv(LLAMA_RESULTS_PATH)

print(f"Ground truth: {len(gt)} images")
print(f"Llama results: {len(llama)} images")

In [None]:
#Cell 3
# Merge and compare
# The inner join below keeps only images present in both files. Report any
# ground-truth images without a prediction so they are not dropped silently
# (all later statistics are computed on the merged rows only).
unmatched = gt.loc[~gt['image_file'].isin(llama['image_file']), 'image_file']
if not unmatched.empty:
    print(f"⚠️  {len(unmatched)} ground-truth image(s) have no Llama result and are excluded from the comparison")

merged = gt.merge(
    llama[['image_file', 'document_type']],
    on='image_file',
    how='inner'
)

# Normalise both label columns before comparing:
#  - astype('string') keeps the .str accessor usable even when a column is
#    entirely missing (which would otherwise be a float column),
#  - strip/upper make the comparison insensitive to padding and case.
truth_label = merged['DOCUMENT_TYPE'].astype('string').str.strip().str.upper()
predicted_label = merged['document_type'].astype('string').str.strip().str.upper()

# A missing label on either side compares as <NA>; count that as a mismatch
# and store a plain bool column so `~` and `.sum()` behave as expected.
merged['match'] = (truth_label == predicted_label).fillna(False).astype(bool)

# Friendlier column names for the report; the original values are kept as-is.
merged['ground_truth'] = merged['DOCUMENT_TYPE']
merged['detected'] = merged['document_type']

print("\n" + "="*80)
print("DOCUMENT TYPE CLASSIFICATION COMPARISON")
print("="*80)
print(merged[['image_file', 'ground_truth', 'detected', 'match']].to_string(index=False))

In [None]:
#Cell 4
# Summary statistics
total = len(merged)
correct = int(merged['match'].sum())  # number of True values in the bool column
incorrect = total - correct

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total images: {total}")
if total == 0:
    # No rows survived the merge, so the percentages would be 0/0.
    print("No images to evaluate - check that image_file values match between the two files")
else:
    print(f"Correct classifications: {correct} ({correct/total*100:.1f}%)")
    print(f"Incorrect classifications: {incorrect} ({incorrect/total*100:.1f}%)")

In [None]:
#Cell 5
# Show misclassifications (if any)
misclassified = merged[~merged['match']]

if merged.empty:
    # Nothing was compared, so reporting "all correct" would be misleading.
    print("\n⚠️  No documents were compared - nothing to report")
elif not misclassified.empty:
    print("\n" + "="*80)
    print("⚠️  MISCLASSIFIED DOCUMENTS")
    print("="*80)
    # One short entry per wrongly classified image.
    for _, row in misclassified.iterrows():
        print(f"\n❌ {row['image_file']}")
        print(f"   Ground Truth: {row['ground_truth']}")
        print(f"   Detected:     {row['detected']}")
else:
    print("\n✅ All documents correctly classified!")

In [None]:
#Cell 6
# Document type distribution
# Both sides are upper-cased so the two listings use the same label spelling,
# consistent with the case-insensitive comparison used for matching.
# Note these counts cover every row of each file, not only the merged rows.
print("\n" + "="*80)
print("DOCUMENT TYPE DISTRIBUTION")
print("="*80)

print("\nGround Truth:")
print(gt['DOCUMENT_TYPE'].str.upper().value_counts())

print("\nDetected:")
print(llama['document_type'].str.upper().value_counts())

In [None]:
#Cell 7
# Per-type accuracy
print("\n" + "="*80)
print("ACCURACY BY DOCUMENT TYPE")
print("="*80)

# groupby skips rows whose ground-truth label is missing and never yields an
# empty group, so there is no 0/0 division and no attempt to format NaN as a
# string. sort=False keeps the types in order of first appearance.
for doc_type, matches in merged.groupby('ground_truth', sort=False)['match']:
    n_correct = int(matches.sum())
    n_total = len(matches)
    accuracy = n_correct / n_total * 100
    # str() so non-string labels can still be padded with the 's' format code.
    print(f"{str(doc_type):20s}: {n_correct}/{n_total} ({accuracy:.1f}%)")