# Fix Ground Truth Addresses

Manual review workflow for correcting incorrectly combined addresses in the ground truth CSV.

**Process:**
1. **Run Cell 1 (Setup)** - Run once at start
2. **Run Cell 2 (Display Image)** - Shows next image to process
3. **Edit Cell 3** - Set business_address and payer_address variables
4. **Run Cell 3** - Saves addresses to state
5. **Run Cell 4** - Processes image, saves to CSV, moves to processed/
6. **Repeat** - Go back to step 2 for next image

**Key Design:**
- Cell 2 automatically clears old addresses when new image is detected
- Cell 3 stores addresses in state dictionary (prevents accidental reuse)
- Cell 4 validates state before processing

In [None]:
# Cell 1: Setup
import pandas as pd
from PIL import Image
from pathlib import Path
import shutil

# ============================================================================
# CONFIGURATION - Edit these paths if needed
# ============================================================================
GT_PATH = 'evaluation_data/ground_truth.csv'
IMG_PATH = 'evaluation_data'
BACKUP_PATH = 'evaluation_data/ground_truth_backup.csv'
# ============================================================================

# Load ground truth CSV (all columns as strings)
gt_df = pd.read_csv(GT_PATH, dtype=str)

# Image directory
image_dir = Path(IMG_PATH)
processed_dir = image_dir / 'processed'
processed_dir.mkdir(exist_ok=True)

# Backup CSV before making changes (only if backup doesn't exist)
backup_path = Path(BACKUP_PATH)
if not backup_path.exists():
    shutil.copy(GT_PATH, backup_path)
    print(f"‚úÖ Backup created: {backup_path}")
else:
    print(f"‚ÑπÔ∏è  Backup already exists: {backup_path}")

# Initialize state tracking dictionary
current_image_state = {
    'image_name': None,
    'business_address': "",
    'payer_address': ""
}

# Count unprocessed images
images = [f for f in image_dir.glob('*.jpeg') if f.is_file()]
images.extend([f for f in image_dir.glob('*.jpg') if f.is_file()])
images.extend([f for f in image_dir.glob('*.png') if f.is_file()])
print(f"\nüìä Total images to process: {len(images)}")

In [None]:
# Cell 2: Display Current Image
# Run this to see the next image that needs processing.
#
# Selects the first unprocessed image in image_dir, resets the stored
# addresses when that image differs from the one recorded in
# current_image_state, shows a preview (at most 800px wide) and prints the
# addresses currently recorded for it in the ground truth CSV.

# Get next unprocessed image. Each extension group is sorted because glob
# order is not guaranteed; sorting keeps the "next" image the same on every
# re-run of this cell.
images = [
    f
    for pattern in ('*.jpeg', '*.jpg', '*.png')
    for f in sorted(image_dir.glob(pattern))
    if f.is_file()
]

if not images:
    print("üéâ All images processed!")
    current_image_state['image_name'] = None
else:
    image_path = images[0]
    image_name = image_path.stem

    # Update state tracking - new image detected, clear old addresses
    if current_image_state['image_name'] != image_name:
        current_image_state['image_name'] = image_name
        current_image_state['business_address'] = ""
        current_image_state['payer_address'] = ""

    # Display image at reasonable size (max width 800px)
    max_width = 800

    # Open through a context manager so the file handle is released as soon
    # as the preview has been rendered; Cell 4 moves this file afterwards.
    with Image.open(image_path) as source_image:
        img = source_image

        # Resize if too large, keeping the aspect ratio
        if source_image.width > max_width:
            ratio = max_width / source_image.width
            new_height = int(source_image.height * ratio)
            img = source_image.resize((max_width, new_height), Image.Resampling.LANCZOS)

        # Render while the file is still open: an image that was not resized
        # has not necessarily been read from disk yet.
        display(img)

    # Show current ground truth
    print(f"\n{'='*80}")
    print(f"Image: {image_name}")
    print(f"{'='*80}")

    row = gt_df[gt_df['image_name'] == image_name]
    if len(row) > 0:
        print(f"\nCurrent BUSINESS_ADDRESS:\n{row['BUSINESS_ADDRESS'].values[0]}")
        print(f"\nCurrent PAYER_ADDRESS:\n{row['PAYER_ADDRESS'].values[0]}")
    else:
        print(f"\n‚ö†Ô∏è  No ground truth found for {image_name}")

    print(f"\n{'='*80}")
    print(f"Next: Edit Cell 3 with addresses, then run Cell 3")
    print(f"{'='*80}")

In [None]:
# Cell 3: Set Addresses
# Fill in the two values below, then run this cell to record them.

# ----------------------------------------------------------------------------
# EDIT THESE LINES - copy the correct addresses from the image shown by Cell 2.
# Use "NOT_FOUND" when the document does not contain that address.
# ----------------------------------------------------------------------------

business_address = ""  # EDIT THIS
payer_address = ""     # EDIT THIS

# ----------------------------------------------------------------------------

# Record both values in the shared state dictionary.
current_image_state.update(
    business_address=business_address,
    payer_address=payer_address,
)

# Echo what was stored; an empty value is shown as NOT_FOUND.
print(f"‚úÖ Addresses set for: {current_image_state['image_name']}")
print(f"   BUSINESS_ADDRESS: {business_address or 'NOT_FOUND'}")
print(f"   PAYER_ADDRESS: {payer_address or 'NOT_FOUND'}")
print("\nNext: Run Cell 4 to save and process")

In [None]:
# Cell 4: Process and Save
# Run this to save the addresses and move to next image.
#
# Writes the addresses held in current_image_state into the matching row of
# the ground truth CSV, moves the reviewed image into processed_dir, and then
# clears the state so the same addresses cannot be applied a second time.

if current_image_state['image_name'] is None:
    print("‚ö†Ô∏è  No image to process. Run Cell 2 first.")
else:
    image_name = current_image_state['image_name']
    business_addr = current_image_state['business_address']
    payer_addr = current_image_state['payer_address']

    if not (business_addr or payer_addr):
        print("‚ö†Ô∏è  No addresses set. Run Cell 3 first.")
    else:
        # Update CSV - compute the row selection once and check that it
        # actually matches something; assigning through an all-False mask
        # would silently change nothing.
        row_mask = gt_df['image_name'] == image_name
        row_updated = bool(row_mask.any())
        if row_updated:
            gt_df.loc[row_mask, 'BUSINESS_ADDRESS'] = business_addr or "NOT_FOUND"
            gt_df.loc[row_mask, 'PAYER_ADDRESS'] = payer_addr or "NOT_FOUND"
            gt_df.to_csv(GT_PATH, index=False)

            # Reload DataFrame so the in-memory copy matches what is on disk
            gt_df = pd.read_csv(GT_PATH, dtype=str)
        else:
            print(f"‚ö†Ô∏è  No ground truth row for {image_name}; CSV left unchanged")

        # Find the image file. Only the image extensions handled by this
        # notebook are considered (in the same order the listings use), so a
        # file such as "<name>.json" is never moved by mistake and special
        # characters in the name are not interpreted as glob patterns.
        candidates = [
            image_dir / f"{image_name}{suffix}"
            for suffix in ('.jpeg', '.jpg', '.png')
        ]
        image_path = next((p for p in candidates if p.is_file()), None)

        if image_path is not None:
            shutil.move(str(image_path), str(processed_dir / image_path.name))

            # Count remaining
            remaining = [
                f
                for pattern in ('*.jpeg', '*.jpg', '*.png')
                for f in image_dir.glob(pattern)
                if f.is_file()
            ]

            if row_updated:
                print(f"‚úÖ Saved and moved: {image_name}")
            else:
                print(f"‚ö†Ô∏è  Moved without saving (no ground truth row): {image_name}")
            print(f"üìä Remaining: {len(remaining)}")
            print(f"\nNext: Run Cell 2 to see next image")
        else:
            print(f"‚ö†Ô∏è  Image file not found: {image_name}")

        # Clear the state so that re-running this cell, or Cell 3 followed by
        # this cell, cannot write to the row of an image that was already
        # handled. Cell 2 must be run again to select the next image.
        current_image_state['image_name'] = None
        current_image_state['business_address'] = ""
        current_image_state['payer_address'] = ""