# Notebook 02: Image Preprocessing and OCR

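This notebook handles image preprocessing and text extraction using the project's OCR engines (`HybridOCREngine` by default, or `EasyOCREngine` when `USE_HYBRID` is set to `False`), then matches the extracted text against the allergen dictionary.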

In [None]:
# Ensure local src/ modules are importable from notebook context
import sys
from pathlib import Path

# Project root: step up one level when the notebook is launched from `notebooks/`.
ROOT = Path.cwd()
if ROOT.name == 'notebooks':
    ROOT = ROOT.parent

# Local packages live under <ROOT>/src; register that folder once.
SRC = ROOT / 'src'
if str(SRC) in sys.path:
    print(f"sys.path already contains: {SRC}")
else:
    sys.path.append(str(SRC))
    print(f"Added to sys.path: {SRC}")

## Section A: Setup and Environment Configuration

Configure environment and import necessary libraries for image processing.

## Step 1: Run Environment Setup

In [None]:
# Install the third-party packages for this notebook into the kernel's environment (-q = quiet).
# NOTE(review): versions are unpinned, so a fresh install may pull different releases
# over time — consider pinning (e.g. `pkg==x.y.z`) for reproducible runs.
%pip install -q opencv-python pytesseract easyocr pandas

## Step 2: Project Paths and Data Loading

Define the project directories, create the OCR output folders, and load the product annotations and the allergen dictionary.

In [None]:
import os
import sys
import pandas as pd
from pathlib import Path

# Define directories
# ROOT is the parent folder when the notebook runs from `notebooks/`,
# otherwise the current working directory.
ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA = ROOT / 'data'
RAW_IMAGES = DATA / 'raw'
OCR_OUTPUT = DATA / 'ocr_results'
OCR_RAW_TEXT = OCR_OUTPUT / 'raw_text'
OCR_CLEAN_TEXT = OCR_OUTPUT / 'cleaned_text'

# Create output directories (parents=True also creates OCR_OUTPUT itself)
OCR_RAW_TEXT.mkdir(parents=True, exist_ok=True)
OCR_CLEAN_TEXT.mkdir(parents=True, exist_ok=True)

# Load annotations
annotations_path = DATA / 'annotations.csv'
df = pd.read_csv(annotations_path)

# Load allergen dictionary
import json
allergen_dict_path = DATA / 'allergen_dictionary.json'
with open(allergen_dict_path, 'r', encoding='utf-8') as f:
    allergen_dict = json.load(f)

# Status output. The check-mark / chart glyphs were previously stored as
# mojibake (UTF-8 bytes decoded as cp1252); they are restored here.
print(f"✓ Project root: {ROOT}")
print(f"✓ Raw images: {RAW_IMAGES}")
print(f"✓ OCR output: {OCR_OUTPUT}")
print(f"✓ Allergen dictionary loaded: {len(allergen_dict)} classes")
print(f"\n📊 Dataset loaded:")
print(f"   Total products: {len(df)}")
print(f"   Columns: {', '.join(df.columns.tolist())}")
print(f"\nFirst 3 products:")
print(df[['code', 'product_name', 'allergens']].head(3))

## Section B: Text Extraction with the Hybrid OCR Engine

Initialize the OCR engine (`HybridOCREngine`, or `EasyOCREngine` when `USE_HYBRID` is `False`) and the allergen synonym mapper that are used to extract and interpret text from product images.

In [None]:
import sys
sys.path.insert(0, str(ROOT / 'src'))

from preprocessing.ocr_preprocessor import prepare_for_ocr, generate_preprocessed_variants
from ocr.hybrid_extractor import HybridOCREngine
from ocr.easyocr_extractor import EasyOCREngine
from ocr.ocr_postprocessor import clean_text, normalize_ocr_noise, quality_score
from allergen_detection.synonym_mapper import SynonymMapper

# Configuration
USE_HYBRID = True  # Use hybrid OCR with multi-pass preprocessing
BATCH_SIZE = 10  # Progress update frequency

# Initialize OCR engine
# The status messages below previously contained a mojibake check mark
# (UTF-8 bytes decoded as cp1252); the intended "✓" glyph is restored.
if USE_HYBRID:
    ocr_engine = HybridOCREngine(
        lang_list=['en'], 
        try_easyocr=True, 
        try_tesseract=False  # Set to True if Tesseract is installed
    )
    print("✓ Hybrid OCR engine initialized (multi-pass preprocessing + confidence scoring)")
else:
    ocr_engine = EasyOCREngine(
        lang_list=['en'],
        contrast_ths=0.3,
        adjust_contrast=0.7,
        mag_ratio=1.5
    )
    print("✓ EasyOCR engine initialized")

# Initialize allergen synonym mapper from the allergen dictionary loaded earlier
allergen_mapper = SynonymMapper(allergen_dict)
print("✓ Allergen synonym mapper loaded")

## Step 3: Process All Images and Extract Text

Run OCR on product images and save extracted text with quality metrics.

In [None]:
import cv2
import json
from datetime import datetime

from allergen_detection.product_matcher import ProductNameMatcher

# Initialize product matcher from the annotations DataFrame `df`.
# NOTE(review): 0.70 is the same value HybridAllergenDetector.detect uses as its
# product-match confidence cutoff — keep the two in sync if either changes.
product_matcher = ProductNameMatcher(df, confidence_threshold=0.70)

class HybridAllergenDetector:
    """Detect allergens by combining product-name matching with ingredient-text OCR.

    ``detect`` runs OCR on an image, cleans the text, asks both the product
    matcher and the ingredient mapper for allergens, and then chooses which
    answer to report.

    Args:
        product_matcher: Object exposing ``detect_from_image(image, ocr_engine)``
            returning a mapping with ``'confidence'`` and ``'allergens'`` keys.
        ingredient_mapper: Object exposing ``match(text)`` returning the
            allergens found in the cleaned OCR text.
        ocr_engine: OCR engine exposing ``extract_with_meta(image)`` (preferred
            when present) or ``extract(image)``.
        confidence_threshold: Minimum product-match confidence at which the
            product match takes precedence over ingredient text. Defaults to
            0.70, the value that was previously hard-coded in ``detect``.
    """

    def __init__(self, product_matcher, ingredient_mapper, ocr_engine,
                 confidence_threshold=0.70):
        self.product_matcher = product_matcher
        self.ingredient_mapper = ingredient_mapper
        self.ocr_engine = ocr_engine
        self.confidence_threshold = confidence_threshold

    def _run_ocr(self, image):
        """Run OCR and return ``(text, quality, engine_name, variant_name)``."""
        if hasattr(self.ocr_engine, 'extract_with_meta'):
            # Engines exposing extract_with_meta report their own metadata;
            # missing keys fall back to neutral defaults.
            ocr_result = self.ocr_engine.extract_with_meta(image)
            return (
                ocr_result.get('text', ''),
                ocr_result.get('quality_score', 0.0),
                ocr_result.get('engine', 'unknown'),
                ocr_result.get('variant', 'unknown'),
            )
        # Plain engines: preprocess here and report placeholder metadata.
        preprocessed = prepare_for_ocr(image)
        return self.ocr_engine.extract(preprocessed), 0.0, 'standard', 'default'

    def _select_allergens(self, product_confidence, product_allergens, ingredient_allergens):
        """Pick the allergen source and return ``(strategy, allergens)``."""
        if product_confidence >= self.confidence_threshold and product_allergens:
            return 'product_match', product_allergens
        if ingredient_allergens:
            return 'ingredient_text', ingredient_allergens
        if product_allergens:
            # Below-threshold product match is still better than nothing.
            return 'product_match_low_conf', product_allergens
        return 'no_match', set()

    def detect(self, image, image_file):
        """Detect allergens in one image.

        Args:
            image: Image passed to the OCR engine and to the product matcher.
            image_file: Not used by this method; kept so existing callers that
                pass the file name keep working.

        Returns:
            dict with keys ``'allergens'``, ``'strategy'`` (one of
            ``'product_match'``, ``'ingredient_text'``,
            ``'product_match_low_conf'``, ``'no_match'``), ``'raw_text'``,
            ``'cleaned_text'``, ``'ocr_quality'``, ``'ocr_engine'`` and
            ``'ocr_variant'``.
        """
        ocr_text, ocr_quality, ocr_engine_used, ocr_variant = self._run_ocr(image)

        # Apply normalization
        ocr_clean = normalize_ocr_noise(clean_text(ocr_text))

        # Try product name matching (the matcher receives the OCR engine itself)
        result_product = self.product_matcher.detect_from_image(image, self.ocr_engine)
        product_confidence = result_product['confidence']
        product_allergens = result_product['allergens']

        # Try ingredient text OCR
        ingredient_allergens = self.ingredient_mapper.match(ocr_clean)

        strategy, allergens = self._select_allergens(
            product_confidence, product_allergens, ingredient_allergens
        )

        return {
            'allergens': allergens,
            'strategy': strategy,
            'raw_text': ocr_text,
            'cleaned_text': ocr_clean,
            'ocr_quality': ocr_quality,
            'ocr_engine': ocr_engine_used,
            'ocr_variant': ocr_variant
        }

# Initialize hybrid detector
hybrid_detector = HybridAllergenDetector(product_matcher, allergen_mapper, ocr_engine)

# Create output directories
output_dirs = {
    'raw_text': str(OCR_RAW_TEXT),
    'cleaned_text': str(OCR_CLEAN_TEXT),
    'allergen_matches': str(OCR_OUTPUT / 'allergen_matches'),
}
for dir_path in output_dirs.values():
    os.makedirs(dir_path, exist_ok=True)

# Process all images (sorted so the processing order is stable between runs)
raw_image_dir = str(RAW_IMAGES)
image_files = sorted(
    f for f in os.listdir(raw_image_dir)
    if f.lower().endswith(('.jpg', '.png', '.jpeg'))
)
print(f"Found {len(image_files)} images to process")

MAX_ERRORS_SHOWN = 10  # Cap on failure messages printed after the run

results = []          # One record per successfully processed image
errors = []           # "<file>: <reason>" for every image that failed
strategy_counts = {}  # strategy name -> number of images
quality_stats = []    # OCR quality score per processed image
start_time = datetime.now()

for idx, image_file in enumerate(image_files, 1):
    try:
        image_path = os.path.join(raw_image_dir, image_file)
        filename_base = os.path.splitext(image_file)[0]

        # Load image; cv2.imread returns None instead of raising on failure
        image = cv2.imread(image_path)
        if image is None:
            errors.append(f"{image_file}: Failed to read")
            continue

        # Run hybrid detection
        detection = hybrid_detector.detect(image, image_file)
        allergens = detection['allergens']
        strategy = detection['strategy']
        raw_text = detection['raw_text']
        cleaned_text = detection['cleaned_text']
        ocr_quality = detection.get('ocr_quality', 0.0)

        # Track statistics
        strategy_counts[strategy] = strategy_counts.get(strategy, 0) + 1
        quality_stats.append(ocr_quality)

        # Save outputs (one file per image in each output folder)
        with open(os.path.join(output_dirs['raw_text'], f"{filename_base}.txt"), 'w', encoding='utf-8') as f:
            f.write(raw_text)

        with open(os.path.join(output_dirs['cleaned_text'], f"{filename_base}.txt"), 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        with open(os.path.join(output_dirs['allergen_matches'], f"{filename_base}.json"), 'w', encoding='utf-8') as f:
            json.dump({
                'allergens': list(allergens),
                'strategy': strategy,
                'ocr_quality': ocr_quality,
                'ocr_engine': detection.get('ocr_engine', 'unknown'),
                'ocr_variant': detection.get('ocr_variant', 'unknown')
            }, f)

        results.append({
            'filename': image_file,
            'allergens_found': list(allergens),
            'strategy': strategy,
            'text_length': len(cleaned_text),
            'ocr_quality': ocr_quality
        })

        # Progress update
        if idx % BATCH_SIZE == 0:
            elapsed = (datetime.now() - start_time).total_seconds()
            # Guard against a zero elapsed time (coarse clocks / very fast runs);
            # without it the division error would be recorded as an image failure.
            rate = idx / elapsed if elapsed > 0 else 0.0
            avg_quality = sum(quality_stats) / len(quality_stats) if quality_stats else 0.0
            print(f"[{idx}/{len(image_files)}] {rate:.1f} img/s | Avg quality: {avg_quality:.3f} | Strategies: {strategy_counts}")

    except Exception as e:
        # Best-effort batch: one bad image must not abort the run, but the
        # failure is recorded and reported once the loop finishes.
        errors.append(f"{image_file}: {str(e)}")

# Report the outcome so failures are not silently dropped
total_elapsed = (datetime.now() - start_time).total_seconds()
print(f"\nProcessed {len(results)}/{len(image_files)} images in {total_elapsed:.1f}s ({len(errors)} failed)")
if errors:
    print(f"Failures (showing up to {MAX_ERRORS_SHOWN}):")
    for message in errors[:MAX_ERRORS_SHOWN]:
        print(f"  - {message}")

## Section C: Quality Assessment and Results

Summarize the allergen detection results per strategy, list the most frequently detected allergens, and save a processing summary.

In [None]:
from collections import Counter
import random

# Statistics
total_images = len(results)
allergen_found_count = sum(1 for r in results if r['allergens_found'])
all_allergens = []
for r in results:
    all_allergens.extend(r['allergens_found'])

allergen_freq = Counter(all_allergens)

# Strategy effectiveness: images per strategy, and how many of those found allergens
strategy_totals = Counter(r['strategy'] for r in results)
strategy_with_allergens = {}
for r in results:
    strategy = r['strategy']
    if r['allergens_found']:
        strategy_with_allergens[strategy] = strategy_with_allergens.get(strategy, 0) + 1

# Ratios are guarded so an empty run (no images, or every image failed)
# reports zeros instead of raising ZeroDivisionError.
detection_rate = allergen_found_count / total_images if total_images else 0.0
avg_text_length = sum(r['text_length'] for r in results) / total_images if total_images else 0.0
avg_allergens_per_image = len(all_allergens) / total_images if total_images else 0.0

print("\n" + "="*70)
print("HYBRID DETECTION ANALYSIS")
print("="*70)

if not total_images:
    print("\n[WARN] No images were processed successfully; all rates are reported as 0.")

print(f"\nDetection Statistics:")
print(f"  Total images: {total_images}")
print(f"  Images with allergens: {allergen_found_count} ({detection_rate*100:.1f}%)")
print(f"  Total allergen mentions: {len(all_allergens)}")

print(f"\nStrategy Effectiveness (% that found allergens):")
for strategy in sorted(strategy_with_allergens.keys()):
    count_total = strategy_totals[strategy]
    count_with_allergens = strategy_with_allergens[strategy]
    rate = count_with_allergens / count_total * 100 if count_total > 0 else 0
    print(f"  {strategy:25s}: {count_with_allergens:4d}/{count_total:4d} ({rate:5.1f}%)")

print(f"\nTop 8 Detected Allergens:")
for allergen, count in allergen_freq.most_common(8):
    print(f"  - {allergen:15s}: {count:5d} occurrences")

print(f"\nProcessing Performance:")
print(f"  Average text length: {avg_text_length:.0f} chars")
print(f"  Avg allergens per image: {avg_allergens_per_image:.2f}")

# Sample results
print(f"\n" + "="*70)
print("SAMPLE RESULTS (3 random images with allergens)")
print("="*70)

SAMPLE_SEED = 42  # Fixed seed so "Restart & Run All" shows the same samples
sample_rng = random.Random(SAMPLE_SEED)  # Local RNG; leaves the global random state untouched

samples = [r for r in results if r['allergens_found']]
if samples:
    for sample in sample_rng.sample(samples, min(3, len(samples))):
        print(f"\n{sample['filename']}")
        print(f"  Strategy: {sample['strategy']}")
        print(f"  Allergens: {', '.join(sample['allergens_found'])}")
        print(f"  Text length: {sample['text_length']} chars")

# Save analysis summary
summary = {
    'total_images': total_images,
    'images_with_allergens': allergen_found_count,
    'allergen_detection_rate': detection_rate,
    'total_allergen_mentions': len(all_allergens),
    'top_allergens': dict(allergen_freq.most_common(10)),
    'strategy_distribution': strategy_counts,
    'strategy_effectiveness': strategy_with_allergens,
    'timestamp': datetime.now().isoformat()
}

summary_path = str(OCR_OUTPUT / 'processing_summary.json')
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\n[OK] Summary saved: data/ocr_results/processing_summary.json")
print("="*70)