In [None]:
import os
import json
import time
import base64
import re
import requests
import pandas as pd
import logging

# =========================
# CONFIG
# =========================
# API key comes from the environment; it is None when the variable is unset.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
MODEL = "google/gemini-2.5-flash"

# Input images are read from IMAGE_DIR; every generated file goes to OUT_DIR.
IMAGE_DIR = "images"
OUT_DIR = "output"

# Create the output directory up front so later file writes cannot fail on it.
os.makedirs(OUT_DIR, exist_ok=True)

# Checkpoint, final exports and the per-page error log all live in OUT_DIR.
CKPT, OUT_JSON, OUT_CSV, ERROR_LOG = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("checkpoint.json", "result.json", "result.csv", "errors.txt")
)

# =========================
# LOGGING
# =========================
# Send every INFO-and-above record both to output/process.log and to the console.
logging.basicConfig(
    handlers=[
        logging.FileHandler(os.path.join(OUT_DIR, 'process.log'), encoding='utf-8'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)

# =========================
# PROMPT
# =========================
# System prompt sent with every request (used as the "system" message in
# call_openrouter). It ends with the JSON schema the reply must follow: a
# top-level "party" string plus a "records" list whose items carry "order",
# "name" and "address".
# NOTE(review): the non-ASCII text is expected to be Thai; if it renders as
# mojibake in your editor, confirm the file is stored as valid UTF-8 before
# running, because this string is sent to the model verbatim.
PROMPT = """‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏∑‡∏≠‡πÄ‡∏à‡πâ‡∏≤‡∏´‡∏ô‡πâ‡∏≤‡∏ó‡∏µ‡πà‡∏Ñ‡∏ì‡∏∞‡∏Å‡∏£‡∏£‡∏°‡∏Å‡∏≤‡∏£‡∏Å‡∏≤‡∏£‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ï‡∏±‡πâ‡∏á (‡∏Å‡∏Å‡∏ï.)
‡πÄ‡∏≠‡∏Å‡∏™‡∏≤‡∏£‡∏ô‡∏µ‡πâ‡πÄ‡∏õ‡πá‡∏ô‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠‡∏£‡∏≤‡∏ä‡∏Å‡∏≤‡∏£
‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡∏ú‡∏π‡πâ‡∏™‡∏°‡∏±‡∏Ñ‡∏£‡∏™‡∏°‡∏≤‡∏ä‡∏¥‡∏Å‡∏™‡∏†‡∏≤‡∏ú‡∏π‡πâ‡πÅ‡∏ó‡∏ô‡∏£‡∏≤‡∏©‡∏é‡∏£ ‡πÅ‡∏ö‡∏ö‡∏ö‡∏±‡∏ç‡∏ä‡∏µ‡∏£‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠

‡∏Å‡∏ï‡∏¥‡∏Å‡∏≤‡∏Å‡∏≤‡∏£‡∏≠‡πà‡∏≤‡∏ô:
1) ‡∏≠‡πà‡∏≤‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å‡∏†‡∏≤‡∏û‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏ô‡πâ‡∏≤
2) ‡∏û‡∏£‡∏£‡∏Ñ‡∏Å‡∏≤‡∏£‡πÄ‡∏°‡∏∑‡∏≠‡∏á:
   - ‡∏≠‡∏¢‡∏π‡πà‡∏ö‡∏ô‡∏´‡∏±‡∏ß‡∏Å‡∏£‡∏∞‡∏î‡∏≤‡∏©
   - ‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ "‡∏ï‡∏≤‡∏°‡∏ó‡∏µ‡πà‡∏û‡∏£‡∏£‡∏Ñ ‚Ä¶ ‡πÑ‡∏î‡πâ‡∏¢‡∏∑‡πà‡∏ô"
   - ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏û‡∏£‡∏£‡∏Ñ‡πÉ‡∏´‡∏°‡πà ‡πÉ‡∏´‡πâ‡πÉ‡∏ä‡πâ‡∏û‡∏£‡∏£‡∏Ñ‡∏à‡∏≤‡∏Å‡∏´‡∏ô‡πâ‡∏≤‡∏Å‡πà‡∏≠‡∏ô
3) ‡∏ú‡∏π‡πâ‡∏™‡∏°‡∏±‡∏Ñ‡∏£ 1 ‡∏Ñ‡∏ô = 1 record
4) ‡∏•‡∏≥‡∏î‡∏±‡∏ö‡∏ó‡∏µ‡πà = ‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç‡∏´‡∏ô‡πâ‡∏≤‡∏ä‡∏∑‡πà‡∏≠
5) ‡∏ä‡∏∑‡πà‡∏≠‚Äì‡∏™‡∏Å‡∏∏‡∏• ‡∏≠‡∏≤‡∏à‡∏Ç‡∏∂‡πâ‡∏ô‡∏´‡∏•‡∏≤‡∏¢‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î ‡πÉ‡∏´‡πâ‡∏£‡∏ß‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏ä‡πà‡∏≠‡∏á‡πÄ‡∏î‡∏µ‡∏¢‡∏ß
6) ‡∏ó‡∏µ‡πà‡∏≠‡∏¢‡∏π‡πà‡∏ï‡∏≤‡∏°‡∏ó‡∏∞‡πÄ‡∏ö‡∏µ‡∏¢‡∏ô‡∏ö‡πâ‡∏≤‡∏ô ‡∏≠‡∏≤‡∏à‡∏Ç‡∏∂‡πâ‡∏ô‡∏´‡∏•‡∏≤‡∏¢‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î ‡πÉ‡∏´‡πâ‡∏£‡∏ß‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏ä‡πà‡∏≠‡∏á‡πÄ‡∏î‡∏µ‡∏¢‡∏ß
7) ‡∏≠‡πà‡∏≤‡∏ô‡∏à‡∏≤‡∏Å‡∏ã‡πâ‡∏≤‡∏¢‡πÑ‡∏õ‡∏Ç‡∏ß‡∏≤ ‡∏ö‡∏ô‡∏•‡∏á‡∏•‡πà‡∏≤‡∏á

‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏î‡∏≤
‡∏ñ‡πâ‡∏≤‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏°‡πà‡∏≠‡∏≠‡∏Å ‡πÉ‡∏´‡πâ‡πÄ‡∏ß‡πâ‡∏ô‡∏Ñ‡πà‡∏≤‡∏ß‡πà‡∏≤‡∏á ""

‡∏ï‡∏≠‡∏ö‡∏Å‡∏•‡∏±‡∏ö‡πÄ‡∏õ‡πá‡∏ô JSON ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô ‡∏ï‡∏≤‡∏° schema ‡∏ô‡∏µ‡πâ:
{
  "party": "string",
  "records": [
    {
      "order": number,
      "name": "string",
      "address": "string"
    }
  ]
}"""

# =========================
# Request tuning: per-request HTTP timeout, pause after each processed page,
# and the default number of attempts per page.
MAX_RETRIES = 3
SLEEP_BETWEEN_PAGES = 2
TIMEOUT_SECONDS = 120

# HTTP headers sent with every OpenRouter request.
HEADERS = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json",
}

# -------------------------
def img_to_b64(path):
    """Return the contents of the file at ``path`` as a base64 text string.

    Any failure while reading or encoding is logged and then re-raised
    unchanged.
    """
    try:
        with open(path, "rb") as image_file:
            raw_bytes = image_file.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode()
    except Exception as exc:
        logging.error(f"Failed to read image {path}: {exc}")
        raise

# -------------------------
def extract_json(text):
    """Extract the first valid result object from a model response.

    The response may be bare JSON, JSON inside a Markdown code fence, or JSON
    surrounded by explanatory prose. Each ``{`` in the text is tried, in
    order, as the start of a JSON document; the first candidate that both
    parses and passes ``validate_result`` is returned. Unlike a greedy
    first-brace-to-last-brace match, this still works when the surrounding
    prose itself contains braces.

    Args:
        text: Raw message content returned by the model (may be ``None``).

    Returns:
        The parsed result dictionary. An empty or whitespace-only response
        yields ``{"party": None, "records": []}``.

    Raises:
        ValueError: If no candidate in ``text`` parses and validates.
    """
    if not text or not text.strip():
        logging.warning("Empty response received")
        return {"party": None, "records": []}

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON document beginning at ``start`` and
            # ignores whatever follows it (closing fence, trailing prose).
            parsed, _ = decoder.raw_decode(text, start)
            validate_result(parsed)
            return parsed
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass, so this covers
            # both parse failures and validation failures.
            logging.debug(f"Candidate at offset {start} failed: {e}")
        start = text.find("{", start + 1)

    # Log the problematic response
    logging.error(f"NO_JSON_FOUND in response: {text[:500]}")
    raise ValueError("Could not extract valid JSON from response")

# -------------------------
def validate_result(result):
    """Check that a parsed response has the expected overall shape.

    Structural problems (wrong container types, missing ``records``) raise
    ``ValueError``. Missing or empty per-record fields are only reported as
    warnings. Returns ``True`` when no structural problem is found.
    """
    if not isinstance(result, dict):
        raise ValueError("Result is not a dictionary")
    if "records" not in result:
        raise ValueError("Missing 'records' field")

    records = result["records"]
    if not isinstance(records, list):
        raise ValueError("'records' is not a list")

    for idx, rec in enumerate(records):
        if not isinstance(rec, dict):
            raise ValueError(f"Record {idx} is not a dictionary")

        # Check that the required fields are present; gaps are tolerated but logged.
        if "order" not in rec:
            logging.warning(f"Record {idx} missing 'order'")
        # A missing key and an empty value are reported the same way.
        if not rec.get("name"):
            logging.warning(f"Record {idx} missing or empty 'name': {rec}")
        if "address" not in rec:
            logging.warning(f"Record {idx} missing 'address'")

    return True

# -------------------------
def call_openrouter(image_path, last_party, max_retries=MAX_RETRIES):
    """Send one page image to OpenRouter and return the parsed extraction.

    The image is encoded once and posted together with the system prompt.
    HTTP 429 responses are retried with exponential backoff (60s, 120s, ...,
    capped at 300s); every other failure is retried after a linearly growing
    pause (10s, 20s, ...).

    Args:
        image_path: Path of the page image to send.
        last_party: Party name carried over from the previous page, or a
            falsy value when there is none. It is passed to the model as a
            hint in the user message.
        max_retries: Maximum number of HTTP attempts.

    Returns:
        The dictionary produced by ``extract_json`` for the model's reply.

    Raises:
        requests.exceptions.RequestException: The final attempt failed at the
            HTTP level (timeout, connection error, non-2xx status).
        ValueError: The final attempt returned a body that could not be
            turned into a valid result.
        RuntimeError: The attempts were exhausted by rate limiting, or
            ``max_retries`` is not positive.
        Exception: Whatever ``img_to_b64`` raises when the image cannot be
            read; this is not retried.
    """
    import mimetypes  # local import: only needed to label the data URL

    # Encode once: the bytes do not change between attempts, and an unreadable
    # file will not become readable by waiting.
    img_b64 = img_to_b64(image_path)
    # Label the data URL with the real image type (.jpg/.jpeg are accepted by
    # the caller too); fall back to PNG when the extension is unknown.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"

    # Build user message
    if last_party:
        user_text = f"‡∏û‡∏£‡∏£‡∏Ñ‡∏à‡∏≤‡∏Å‡∏´‡∏ô‡πâ‡∏≤‡∏Å‡πà‡∏≠‡∏ô: {last_party}\n\n‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏≠‡πà‡∏≤‡∏ô‡∏†‡∏≤‡∏û‡πÅ‡∏•‡∏∞‡πÅ‡∏¢‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ï‡∏≤‡∏° schema"
    else:
        user_text = "‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏≠‡πà‡∏≤‡∏ô‡∏†‡∏≤‡∏û‡πÅ‡∏•‡∏∞‡πÅ‡∏¢‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ï‡∏≤‡∏° schema"

    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_text},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{img_b64}"
                        }
                    }
                ]
            }
        ],
        "temperature": 0,
        # Raised because the prompt is long and a page may hold many records.
        "max_tokens": 2000
    }

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            r = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers=HEADERS,
                json=payload,
                timeout=TIMEOUT_SECONDS
            )

            # Handle rate limiting with exponential backoff
            if r.status_code == 429:
                if is_last_attempt:
                    # Sleeping here would only delay the inevitable failure.
                    logging.warning(f"Rate limit hit (attempt {attempt+1}/{max_retries}), no retries left")
                    break
                wait_time = min(60 * (2 ** attempt), 300)  # Max 5 minutes
                logging.warning(f"Rate limit hit (attempt {attempt+1}/{max_retries}), waiting {wait_time}s")
                time.sleep(wait_time)
                continue

            r.raise_for_status()

            response_data = r.json()
            try:
                content = response_data["choices"][0]["message"]["content"]
            except (KeyError, IndexError, TypeError) as e:
                # A 2xx body without choices[0].message.content (for example
                # an error payload) is reported with its content instead of a
                # bare KeyError.
                raise ValueError(
                    f"Unexpected response shape: {str(response_data)[:500]}"
                ) from e

            result = extract_json(content)
            logging.info(f"Successfully extracted {len(result.get('records', []))} records")
            return result

        except requests.exceptions.Timeout:
            logging.error(f"Timeout on attempt {attempt+1}/{max_retries}")
            if is_last_attempt:
                raise

        except requests.exceptions.RequestException as e:
            logging.error(f"Request error on attempt {attempt+1}/{max_retries}: {e}")
            if is_last_attempt:
                raise

        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass.
            logging.error(f"Parse error on attempt {attempt+1}/{max_retries}: {e}")
            if is_last_attempt:
                raise

        except Exception as e:
            logging.error(f"Unexpected error on attempt {attempt+1}/{max_retries}: {e}")
            if is_last_attempt:
                raise

        # Reached only after a failed, non-final attempt (the 429 path uses
        # ``continue`` / ``break`` above): pause a little longer each time.
        time.sleep(10 * (attempt + 1))

    raise RuntimeError(f"Max retries ({max_retries}) exceeded")

# -------------------------
def load_ckpt():
    """Return the rows stored in the checkpoint file.

    An empty list is returned when the checkpoint file does not exist or when
    reading it fails for any reason; a failure is logged as an error.
    """
    if not os.path.exists(CKPT):
        return []

    try:
        with open(CKPT, encoding="utf-8") as ckpt_file:
            saved_rows = json.load(ckpt_file)
        logging.info(f"Loaded checkpoint with {len(saved_rows)} existing records")
    except Exception as exc:
        logging.error(f"Failed to load checkpoint: {exc}")
        return []
    return saved_rows

# -------------------------
def save_ckpt(data):
    """Persist ``data`` to the checkpoint file without risking the old copy.

    The JSON is written to a sibling temporary file first and then moved over
    the checkpoint with ``os.replace``. If the process is interrupted while
    writing, the previous checkpoint is still intact instead of being left
    truncated (which ``load_ckpt`` would treat as "no progress").

    Args:
        data: JSON-serialisable rows to store.

    Failures are logged and swallowed so that a checkpoint problem never
    aborts the batch.
    """
    tmp_path = CKPT + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, CKPT)
        logging.debug(f"Checkpoint saved with {len(data)} records")
    except Exception as e:
        logging.error(f"Failed to save checkpoint: {e}")

# -------------------------
def log_error(page_idx, img_name, error):
    """Append one ``page,image,error`` line to the error log file.

    A failure to write the line is itself logged and otherwise ignored.
    """
    try:
        entry = f"{page_idx},{img_name},{str(error)}\n"
        with open(ERROR_LOG, "a", encoding="utf-8") as error_file:
            error_file.write(entry)
    except Exception as exc:
        logging.error(f"Failed to write error log: {exc}")

# =========================
def _export_results(rows, failed_pages=0):
    """Write ``rows`` to OUT_JSON and OUT_CSV, then log summary statistics.

    Args:
        rows: Flat list of record dicts to export.
        failed_pages: Number of pages that failed during this run; used only
            in the summary log.

    Raises:
        Exception: Any error raised while exporting is logged and re-raised.
    """
    try:
        # JSON export
        with open(OUT_JSON, "w", encoding="utf-8") as f:
            json.dump(rows, f, ensure_ascii=False, indent=2)
        logging.info(f"üìÑ Exported JSON: {OUT_JSON}")

        # CSV export
        if rows:
            df = pd.DataFrame(rows)
        else:
            # An empty list would give a frame with no columns at all, and the
            # summary below would fail on the missing 'party' column.
            logging.warning("No records were extracted; exporting empty files")
            df = pd.DataFrame(columns=["__page", "party", "order", "name", "address"])
        df = df.drop(columns="__page", errors='ignore')
        df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
        logging.info(f"üìä Exported CSV: {OUT_CSV}")

        # Summary statistics
        unique_parties = df['party'].dropna().unique()
        party_counts = df.groupby('party').size()

        logging.info("=" * 50)
        if failed_pages:
            logging.warning(f"COMPLETED WITH ERRORS: {failed_pages} page(s) failed, see {ERROR_LOG}")
        else:
            logging.info("‚úÖ COMPLETED SUCCESSFULLY")
        logging.info(f"Total records: {len(rows)}")
        logging.info(f"Total pages processed: {len({r['__page'] for r in rows})}")
        logging.info(f"Unique parties: {len(unique_parties)}")
        logging.info("\nRecords per party:")
        for party, count in party_counts.items():
            logging.info(f"  - {party}: {count} records")
        logging.info("=" * 50)

    except Exception as e:
        logging.error(f"Failed to export results: {e}")
        raise


def main():
    """Run the batch: extract every image in IMAGE_DIR and export the rows.

    Pages are identified by their 1-based position in the sorted file listing
    of IMAGE_DIR, so adding or removing images between runs shifts the page
    numbers stored in the checkpoint. Pages already present in the checkpoint
    are skipped. The party name is carried forward from the nearest preceding
    page, including pages restored from the checkpoint. A page that fails is
    recorded in the error log and the batch continues with the next page.
    """
    logging.info("=" * 50)
    logging.info("Starting OCR batch process - ‡∏Å‡∏Å‡∏ï. MP List Extraction")
    logging.info("=" * 50)

    # Load checkpoint
    rows = load_ckpt()
    # Party of the last record stored for each finished page. The carried
    # party must come from the page just before the one being processed, not
    # from the last row of the checkpoint (which may belong to a later page
    # when an earlier page failed and is being retried).
    page_party = {r["__page"]: r.get("party") for r in rows}
    done_pages = set(page_party)
    last_party = None
    failed_pages = 0

    # Get image list
    try:
        images = sorted(f for f in os.listdir(IMAGE_DIR) if f.lower().endswith((".png", ".jpg", ".jpeg")))
    except Exception as e:
        logging.error(f"Failed to read image directory: {e}")
        return

    if not images:
        logging.error(f"No images found in {IMAGE_DIR}")
        return

    total = len(images)
    logging.info(f"Found {total} images, {len(done_pages)} already processed")

    # Process each image
    for idx, img in enumerate(images, start=1):
        if idx in done_pages:
            logging.info(f"[{idx}/{total}] Skipping {img} (already done)")
            last_party = page_party[idx] or last_party
            continue

        if not OPENROUTER_API_KEY:
            # Without a key every request would fail and burn through all of
            # its retries; stop here but still export what the checkpoint has.
            logging.error("OPENROUTER_API_KEY is not set; cannot process remaining pages")
            break

        path = os.path.join(IMAGE_DIR, img)
        logging.info(f"[{idx}/{total}] Processing {img}")

        try:
            result = call_openrouter(path, last_party)

            # Get party name (use from result or carry forward from last page)
            party = result.get("party") or last_party

            if party != last_party and party:
                logging.info(f"üîÑ Party changed: {last_party} ‚Üí {party}")

            last_party = party

            # Add records
            page_rows = [
                {
                    "__page": idx,
                    "party": party,
                    "order": r.get("order"),
                    "name": r.get("name", ""),
                    "address": r.get("address", ""),
                }
                for r in result.get("records", [])
            ]
            rows.extend(page_rows)
            records_added = len(page_rows)

            logging.info(f"‚úÖ Added {records_added} records from page {idx}")

            # Rewriting the checkpoint is only useful when something changed.
            if records_added:
                save_ckpt(rows)

            # Rate limiting sleep
            time.sleep(SLEEP_BETWEEN_PAGES)

        except Exception as e:
            logging.error(f"‚ùå Failed to process {img}: {e}")
            log_error(idx, img, e)
            failed_pages += 1
            # Continue to next image instead of stopping
            continue

    # Pages recovered on a later run are appended after newer pages; restore
    # document order (the sort is stable, so order within a page is kept).
    rows.sort(key=lambda r: r["__page"])

    # Final save
    save_ckpt(rows)

    # Export results
    _export_results(rows, failed_pages)

# =========================
# Script entry point: run the batch, report Ctrl+C quietly, and log any other
# failure together with its traceback.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logging.info("\n‚ö† Process interrupted by user")
    except Exception as exc:
        # logging.exception logs at ERROR level and attaches the traceback.
        logging.exception(f"Fatal error: {exc}")

[1/140] OCR page_001.png


JSONDecodeError: Expecting value: line 1 column 1 (char 0)