# Fix Ground Truth Addresses

Manual review to correct incorrectly combined addresses in ground truth CSV.

**Process:**
1. Run Cell 1 (Setup) once
2. Run Cell 2 to display the next unprocessed image and its current ground truth addresses
3. Look at the image, edit the `business_address` / `payer_address` variables in Cell 2, then run Cell 2 again to save
4. Image moves to processed/ directory
5. Repeat until all images processed

In [None]:
# Cell 1: Setup
import pandas as pd
from PIL import Image
from pathlib import Path
import shutil

# All evaluation files live under one directory; every other path is derived
# from it so the CSV, its backup and the image folder cannot drift apart.
image_dir = Path('evaluation_data')
ground_truth_path = image_dir / 'ground_truth.csv'

# Load ground truth CSV (all columns as strings, so values are written back
# exactly as they were read instead of being re-typed by pandas)
gt_df = pd.read_csv(ground_truth_path, dtype=str)

# Images are moved here once their addresses have been reviewed
processed_dir = image_dir / 'processed'
processed_dir.mkdir(exist_ok=True)

# Backup CSV before making changes (only if backup doesn't exist, so that
# re-running this cell never overwrites the pristine copy with edited data)
backup_path = image_dir / 'ground_truth_backup.csv'
if not backup_path.exists():
    shutil.copy(ground_truth_path, backup_path)
    print("✅ Backup created: ground_truth_backup.csv")
else:
    print("ℹ️  Backup already exists")

# Count unprocessed images (only files directly inside image_dir; anything
# already moved to processed/ is not matched by these patterns)
images = [
    path
    for pattern in ('*.jpeg', '*.jpg', '*.png')
    for path in image_dir.glob(pattern)
    if path.is_file()
]
print(f"\n📊 Total images to process: {len(images)}")

In [None]:
# Cell 2: Process One Image
# Run this cell repeatedly - once per image.
#
# With the address variables left empty, a run only shows the next image and
# its current ground truth. Fill in the two variables below and run the cell
# again to save the addresses and move the image to processed/.
# Relies on gt_df, image_dir and processed_dir created by the setup cell.

# Get next unprocessed image. The list is sorted because glob order is not
# guaranteed: the "view" run and the following "save" run must pick the same
# file, otherwise the typed addresses would be written to the wrong row.
images = sorted(
    path
    for pattern in ('*.jpeg', '*.jpg', '*.png')
    for path in image_dir.glob(pattern)
    if path.is_file()
)

if not images:
    print("🎉 All images processed!")
else:
    image_path = images[0]
    # Ground truth rows are matched on the file name without its extension
    image_name = image_path.stem

    # Display image. The context manager closes the underlying file once the
    # image has been rendered, so it is no longer open when it is moved below.
    with Image.open(image_path) as img:
        display(img)

    # Show current ground truth
    print(f"\n{'='*80}")
    print(f"Image: {image_name}")
    print(f"{'='*80}")

    row_mask = gt_df['image_name'] == image_name
    row = gt_df[row_mask]
    has_row = len(row) > 0
    if has_row:
        print(f"\nCurrent BUSINESS_ADDRESS:\n{row['BUSINESS_ADDRESS'].values[0]}")
        print(f"\nCurrent PAYER_ADDRESS:\n{row['PAYER_ADDRESS'].values[0]}")
    else:
        print(f"\n⚠️  No ground truth found for {image_name}")

    print(f"\n{'='*80}")
    print("EDIT THE ADDRESSES BELOW (look at image above):")
    print(f"{'='*80}\n")

    # ============================================================================
    # EDIT THESE LINES - Type the correct addresses from the image
    # Use "NOT_FOUND" if address doesn't exist on document
    # ============================================================================

    business_address = ""  # EDIT THIS
    payer_address = ""     # EDIT THIS

    # ============================================================================

    if business_address or payer_address:
        if has_row:
            # Update CSV; an address left empty is recorded as "NOT_FOUND"
            gt_df.loc[row_mask, 'BUSINESS_ADDRESS'] = business_address if business_address else "NOT_FOUND"
            gt_df.loc[row_mask, 'PAYER_ADDRESS'] = payer_address if payer_address else "NOT_FOUND"
            gt_df.to_csv('evaluation_data/ground_truth.csv', index=False)

        # Move to processed (also when there is no matching row, so an image
        # without ground truth does not block the queue)
        shutil.move(str(image_path), str(processed_dir / image_path.name))

        if has_row:
            print("\n✅ Updated and moved to processed/")
        else:
            # Nothing was written: there is no row to hold the typed addresses
            print(f"\n⚠️  No ground truth row for {image_name} - nothing saved, image moved to processed/")
        print(f"📊 Remaining: {len(images)-1}")
    else:
        print("\n⚠️  Please edit the address variables above and run again")