# Fix Ground Truth - Invoice/Receipt Fields

Manual review to correct invoice/receipt fields in ground truth CSV.

**Process:**
1. **Run Cell 1 (Setup)** - Run once at start
2. **Run Cell 2 (Display Image)** - Shows next image and current ground truth
3. **Edit Cell 3** - Only edit fields that are INCORRECT (leave others empty)
4. **Run Cell 3** - Saves changes to state
5. **Run Cell 4** - Updates only the edited fields, moves image to `processed_invoice_receipt/`
6. **Repeat** - Go back to step 2 for next image

**Key Design:**
- **Only type incorrect fields** - empty fields keep existing ground truth values
- Cell 2 automatically clears old values when new image detected
- Cell 3 stores values in state dictionary (prevents accidental reuse)
- Cell 4 selectively updates only non-empty fields

**Fields (14 total):**
DOCUMENT_TYPE, BUSINESS_ABN, SUPPLIER_NAME, BUSINESS_ADDRESS, PAYER_NAME, 
PAYER_ADDRESS, INVOICE_DATE, LINE_ITEM_DESCRIPTIONS, LINE_ITEM_QUANTITIES, 
LINE_ITEM_PRICES, LINE_ITEM_TOTAL_PRICES, IS_GST_INCLUDED, GST_AMOUNT, TOTAL_AMOUNT

**Format Notes:**
- Line items: Use pipe `|` to separate multiple items (e.g., "Item 1 | Item 2")
- Use "NOT_FOUND" if field doesn't exist on document
- Leave field empty (`""`) to keep existing ground truth value

In [None]:
# Cell 1: Setup
import pandas as pd
from PIL import Image
from pathlib import Path
import shutil

# ============================================================================
# CONFIGURATION - Edit these paths if needed
# ============================================================================
GT_PATH = 'evaluation_data/ground_truth.csv'
IMG_PATH = 'evaluation_data'
BACKUP_PATH = 'evaluation_data/ground_truth_backup_invoice_receipt.csv'
# ============================================================================

# Ground truth table; dtype=str keeps every column as text.
gt_df = pd.read_csv(GT_PATH, dtype=str)

# Images waiting for review live directly in image_dir; reviewed ones are
# moved into the processed_invoice_receipt sub-directory.
image_dir = Path(IMG_PATH)
processed_dir = image_dir / 'processed_invoice_receipt'
processed_dir.mkdir(exist_ok=True)

# One-time safety copy of the CSV: an existing backup is never overwritten.
backup_path = Path(BACKUP_PATH)
if backup_path.exists():
    print(f"‚ÑπÔ∏è  Backup already exists: {backup_path}")
else:
    shutil.copy(GT_PATH, backup_path)
    print(f"‚úÖ Backup created: {backup_path}")

# Per-image review state: the active image name plus one (initially empty)
# correction slot for each of the 14 ground-truth fields.
current_image_state = {'image_name': None}
current_image_state.update(dict.fromkeys(
    (
        'document_type',
        'business_abn',
        'supplier_name',
        'business_address',
        'payer_name',
        'payer_address',
        'invoice_date',
        'line_item_descriptions',
        'line_item_quantities',
        'line_item_prices',
        'line_item_total_prices',
        'is_gst_included',
        'gst_amount',
        'total_amount',
    ),
    "",
))

# Count the images still waiting in image_dir (.jpeg, then .jpg, then .png).
images = [
    candidate
    for pattern in ('*.jpeg', '*.jpg', '*.png')
    for candidate in image_dir.glob(pattern)
    if candidate.is_file()
]
print(f"\nüìä Total images to process: {len(images)}")

In [None]:
# Cell 2: Display Current Image
# Run this to see the next image that needs processing
#
# Picks the first unprocessed image in image_dir, resets the correction state
# when that image differs from the one currently tracked, displays the image
# (scaled down to at most 800px wide) and prints its current ground truth.

# Get next unprocessed image (.jpeg first, then .jpg, then .png)
images = [
    f
    for pattern in ('*.jpeg', '*.jpg', '*.png')
    for f in image_dir.glob(pattern)
    if f.is_file()
]

if not images:
    print("üéâ All images processed!")
    current_image_state['image_name'] = None
else:
    image_path = images[0]
    # Ground truth rows are looked up by file name without extension.
    image_name = image_path.stem

    # Update state tracking - new image detected, clear all fields so values
    # entered for the previous image cannot leak into this one.
    if current_image_state['image_name'] != image_name:
        for key in current_image_state:
            if key != 'image_name':
                current_image_state[key] = ""
        current_image_state['image_name'] = image_name

    # Display image at reasonable size (max width 800px).
    # The context manager closes the underlying file as soon as the image has
    # been shown, so no open handle is left on a file Cell 4 will move.
    max_width = 800
    with Image.open(image_path) as img:
        if img.width > max_width:
            ratio = max_width / img.width
            new_height = int(img.height * ratio)
            shown = img.resize((max_width, new_height), Image.Resampling.LANCZOS)
        else:
            shown = img
        # Must run inside the with-block: a closed image can no longer be rendered.
        display(shown)

    # Show current ground truth
    print(f"\n{'='*80}")
    print(f"Image: {image_name}")
    print(f"{'='*80}")

    row = gt_df[gt_df['image_name'] == image_name]
    if len(row) > 0:
        def show_field(label, field_name):
            """Print one ground-truth field of the first matching row.

            Pipe-separated values are printed as a numbered list; missing
            values and the NOT_FOUND marker are printed as-is.
            """
            val = row[field_name].values[0]
            print(f"\n{label}:")
            if pd.notna(val) and val != "NOT_FOUND" and '|' in str(val):
                for i, item in enumerate(val.split('|'), 1):
                    print(f"  {i}. {item.strip()}")
            else:
                print(f"  {val}")

        # Display all 14 fields; the label is the CSV column name itself.
        for column in (
            "DOCUMENT_TYPE",
            "BUSINESS_ABN",
            "SUPPLIER_NAME",
            "BUSINESS_ADDRESS",
            "PAYER_NAME",
            "PAYER_ADDRESS",
            "INVOICE_DATE",
            "LINE_ITEM_DESCRIPTIONS",
            "LINE_ITEM_QUANTITIES",
            "LINE_ITEM_PRICES",
            "LINE_ITEM_TOTAL_PRICES",
            "IS_GST_INCLUDED",
            "GST_AMOUNT",
            "TOTAL_AMOUNT",
        ):
            show_field(column, column)
    else:
        print(f"\n‚ö†Ô∏è  No ground truth found for {image_name}")

    print(f"\n{'='*80}")
    print("Next: Edit Cell 3 with corrected fields (only edit incorrect fields)")
    print(f"{'='*80}")

In [None]:
# Cell 3: Set Field Values
# EDIT these values, then run this cell to save them

# ============================================================================
# EDIT THESE LINES - Only edit fields that are INCORRECT in ground truth
# Leave empty ("") to keep existing ground truth value for that field
# Line items: Use pipe | to separate multiple items
# Use "NOT_FOUND" if field doesn't exist on document
# ============================================================================

document_type = ""               # EDIT if wrong, leave empty to keep existing
business_abn = ""                # EDIT if wrong, leave empty to keep existing
supplier_name = ""               # EDIT if wrong, leave empty to keep existing
business_address = ""            # EDIT if wrong, leave empty to keep existing
payer_name = ""                  # EDIT if wrong, leave empty to keep existing
payer_address = ""               # EDIT if wrong, leave empty to keep existing
invoice_date = ""                # EDIT if wrong, leave empty to keep existing
line_item_descriptions = ""      # EDIT if wrong, leave empty to keep existing
line_item_quantities = ""        # EDIT if wrong, leave empty to keep existing
line_item_prices = ""            # EDIT if wrong, leave empty to keep existing
line_item_total_prices = ""      # EDIT if wrong, leave empty to keep existing
is_gst_included = ""             # EDIT if wrong, leave empty to keep existing
gst_amount = ""                  # EDIT if wrong, leave empty to keep existing
total_amount = ""                # EDIT if wrong, leave empty to keep existing

# ============================================================================

# Values entered above, keyed by CSV column name (in display order).
edited_values = {
    'DOCUMENT_TYPE': document_type,
    'BUSINESS_ABN': business_abn,
    'SUPPLIER_NAME': supplier_name,
    'BUSINESS_ADDRESS': business_address,
    'PAYER_NAME': payer_name,
    'PAYER_ADDRESS': payer_address,
    'INVOICE_DATE': invoice_date,
    'LINE_ITEM_DESCRIPTIONS': line_item_descriptions,
    'LINE_ITEM_QUANTITIES': line_item_quantities,
    'LINE_ITEM_PRICES': line_item_prices,
    'LINE_ITEM_TOTAL_PRICES': line_item_total_prices,
    'IS_GST_INCLUDED': is_gst_included,
    'GST_AMOUNT': gst_amount,
    'TOTAL_AMOUNT': total_amount,
}

# Copy every value (empty ones included) into the shared state; each state
# key is the lower-case spelling of its CSV column name.
for column, entered in edited_values.items():
    current_image_state[column.lower()] = entered

# Report which fields carry a correction (non-empty value).
print(f"‚úÖ Fields set for: {current_image_state['image_name']}")
fields_to_update = [column for column, entered in edited_values.items() if entered]
for column in fields_to_update:
    print(f"   {column}: {edited_values[column]}")

if len(fields_to_update) == 0:
    print("   (No changes - all fields keeping existing values)")

print("\nNext: Run Cell 4 to save and process")

In [None]:
# Cell 4: Process and Save
# Run this to save the changes and move to next image
#
# Writes every non-empty value held in current_image_state into the matching
# ground-truth row(s), saves and reloads the CSV, then moves the reviewed
# image into processed_dir.
#
# NOTE(review): the state is not cleared after a successful run, so running
# Cell 3 + Cell 4 again without Cell 2 in between still targets the row of
# the image that was just processed - confirm this is the intended workflow.


def _unprocessed_images():
    """Return the image files still waiting in image_dir (.jpeg, .jpg, .png order)."""
    return [
        f
        for pattern in ('*.jpeg', '*.jpg', '*.png')
        for f in image_dir.glob(pattern)
        if f.is_file()
    ]


if current_image_state['image_name'] is None:
    print("‚ö†Ô∏è  No image to process. Run Cell 2 first.")
else:
    image_name = current_image_state['image_name']

    # Get current ground truth row
    row_idx = gt_df[gt_df['image_name'] == image_name].index

    if len(row_idx) == 0:
        # Nothing is saved and the image is left in place, so it keeps being
        # listed as unprocessed until a row for it exists in the CSV.
        print(f"‚ö†Ô∏è  No ground truth found for {image_name}")
    else:
        # Only update fields that have non-empty values
        # Empty fields keep their existing ground truth values
        updated_fields = []

        # State key -> CSV column name
        field_mapping = {
            'document_type': 'DOCUMENT_TYPE',
            'business_abn': 'BUSINESS_ABN',
            'supplier_name': 'SUPPLIER_NAME',
            'business_address': 'BUSINESS_ADDRESS',
            'payer_name': 'PAYER_NAME',
            'payer_address': 'PAYER_ADDRESS',
            'invoice_date': 'INVOICE_DATE',
            'line_item_descriptions': 'LINE_ITEM_DESCRIPTIONS',
            'line_item_quantities': 'LINE_ITEM_QUANTITIES',
            'line_item_prices': 'LINE_ITEM_PRICES',
            'line_item_total_prices': 'LINE_ITEM_TOTAL_PRICES',
            'is_gst_included': 'IS_GST_INCLUDED',
            'gst_amount': 'GST_AMOUNT',
            'total_amount': 'TOTAL_AMOUNT'
        }

        for state_key, csv_col in field_mapping.items():
            value = current_image_state[state_key]
            if value:  # Only update if non-empty
                gt_df.loc[row_idx, csv_col] = value
                updated_fields.append(csv_col)

        # Save to CSV
        gt_df.to_csv(GT_PATH, index=False)

        # Reload DataFrame so the in-memory copy mirrors what is on disk
        gt_df = pd.read_csv(GT_PATH, dtype=str)

        # Find the image file by exact stem among the recognised image types.
        # A '<name>.*' glob would also match non-image siblings (e.g. a
        # '<name>.json'), directories, or files such as '<name>.v2.jpg', and
        # would interpret glob metacharacters contained in the name.
        matches = [f for f in _unprocessed_images() if f.stem == image_name]
        if matches:
            # Same preference order as Cell 2 (.jpeg, .jpg, .png), so the file
            # that was displayed is the one that gets moved.
            image_path = matches[0]
            shutil.move(str(image_path), str(processed_dir / image_path.name))

            # Count remaining
            remaining = _unprocessed_images()

            # Show what was updated
            if updated_fields:
                print(f"‚úÖ Updated {len(updated_fields)} field(s): {', '.join(updated_fields)}")
            else:
                print("‚úÖ No changes needed (all fields correct)")
            print(f"   Image: {image_name}")
            print(f"üìä Remaining: {len(remaining)}")
            print("\nNext: Run Cell 2 to see next image")
        else:
            print(f"‚ö†Ô∏è  Image file not found: {image_name}")