<a href="https://colab.research.google.com/github/tanatet8/Colab_Script/blob/main/ThaiNovel_OCR_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
================================================================================
                OCR PROCESSING WITH API - AUTOMATED VERSION
                       ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ó‡∏≥ Corpus ‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡πÑ‡∏ó‡∏¢
================================================================================

Features:
1. Automated OCR ‚Üí LLM ‚Üí Clean corpus
2. ‡πÉ‡∏ä‡πâ GPT-4o-mini (‡∏ñ‡∏π‡∏Å‡∏™‡∏∏‡∏î) ‡∏´‡∏£‡∏∑‡∏≠ Claude Haiku
3. Quality validation & tracking
4. Save training pairs for fine-tuning

Requirements:
- pip install openai anthropic pandas tqdm
- API keys (OpenAI ‡∏´‡∏£‡∏∑‡∏≠ Anthropic)
================================================================================
"""

# ============================================
# 📌 Block 1: Setup & Import
# ============================================
import os
import re
import json
import time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive when running inside Colab; otherwise fall back to local paths.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except Exception:
    # Either google.colab is not importable (not running in Colab) or the
    # mount itself failed; in both cases local storage is used instead.
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit are no longer silently swallowed.
    IN_COLAB = False

print("‚úÖ Libraries loaded")

In [None]:
# ============================================
# 📌 Block 2: Configuration & API Setup
# ============================================
class Config:
    """Central settings for the API-based OCR clean-up pipeline."""

    # API credentials. Fill them in here, or leave them empty and provide the
    # OPENAI_API_KEY / ANTHROPIC_API_KEY environment variables instead.
    OPENAI_API_KEY = ""
    ANTHROPIC_API_KEY = ""

    # Active model. Alternatives offered by the settings menu:
    # "gpt-3.5-turbo" and "claude-3-haiku".
    MODEL = "gpt-4o-mini"

    # Storage root: Google Drive under Colab, a local ./OCR directory otherwise.
    BASE = './OCR' if not IN_COLAB else '/content/drive/MyDrive/OCR'

    # Working directories, all located directly under BASE.
    RAW_OCR_DIR = BASE + '/raw_ocr'
    CLEANED_DIR = BASE + '/cleaned'
    CORPUS_DIR = BASE + '/final_corpus'
    TRAINING_PAIRS_DIR = BASE + '/training_pairs'
    LOGS_DIR = BASE + '/logs'

    # Processing settings.
    MAX_PAGES_PER_BATCH = 5   # intended number of pages per API call
    MAX_RETRIES = 3           # retries after an API error
    TEMPERATURE = 0.1         # low value = more consistent output
    MAX_TOKENS = 4000         # maximum response length

    # Price in USD per 1,000 tokens, keyed by model name (cost tracking).
    PRICE_PER_1K_TOKENS = {
        'gpt-4o-mini': 0.00015,   # i.e. $0.15 per 1M tokens
        'gpt-3.5-turbo': 0.0005,
        'claude-3-haiku': 0.00025,
    }

# Make sure every working directory exists before anything reads or writes it.
for folder in (
    Config.RAW_OCR_DIR,
    Config.CLEANED_DIR,
    Config.CORPUS_DIR,
    Config.TRAINING_PAIRS_DIR,
    Config.LOGS_DIR,
):
    target_dir = Path(folder)
    target_dir.mkdir(parents=True, exist_ok=True)

print("‚úÖ Config loaded")

In [None]:
# ============================================
# 📌 Block 3: API Clients
# ============================================
class LLMClient:
    """Provider-agnostic LLM client for OpenAI and Anthropic.

    The provider is selected from ``Config.MODEL``: a name containing ``"gpt"``
    uses OpenAI, a name containing ``"claude"`` uses Anthropic. Token and cost
    totals of all calls are accumulated on ``total_tokens`` / ``total_cost``.
    """

    def __init__(self):
        """Create the provider client matching ``Config.MODEL``.

        Raises:
            ValueError: if the model name matches no supported provider, or
                no API key is available for the selected provider.
            ImportError: if the provider's SDK is not installed.
        """
        self.model = Config.MODEL
        self.client = None
        self.total_tokens = 0
        self.total_cost = 0

        # Pick the provider from the model name.
        if 'gpt' in self.model:
            self._init_openai()
        elif 'claude' in self.model:
            self._init_anthropic()
        else:
            # Fail fast: without this, client stayed None and the first API
            # call died with an unrelated AttributeError.
            raise ValueError(f"Unsupported model: {self.model!r}")

    def _init_openai(self):
        """Create the OpenAI client.

        The key comes from ``Config.OPENAI_API_KEY`` or, when that is empty,
        from the ``OPENAI_API_KEY`` environment variable.

        Raises:
            ImportError: if the ``openai`` package is not installed.
            ValueError: if no API key is available.
        """
        try:
            import openai
        except ImportError:
            print("‚ùå ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á: pip install openai")
            raise

        # Keep the key local (like _init_anthropic) instead of writing it to
        # the openai module global.
        api_key = Config.OPENAI_API_KEY or os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö OpenAI API key! ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÉ‡∏™‡πà‡πÉ‡∏ô Config")

        self.client = openai.OpenAI(api_key=api_key)
        print(f"‚úÖ OpenAI client ready (Model: {self.model})")

    def _init_anthropic(self):
        """Create the Anthropic client.

        The key comes from ``Config.ANTHROPIC_API_KEY`` or, when that is
        empty, from the ``ANTHROPIC_API_KEY`` environment variable.

        Raises:
            ImportError: if the ``anthropic`` package is not installed.
            ValueError: if no API key is available.
        """
        try:
            import anthropic
        except ImportError:
            print("‚ùå ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á: pip install anthropic")
            raise

        api_key = Config.ANTHROPIC_API_KEY or os.getenv('ANTHROPIC_API_KEY')
        if not api_key:
            raise ValueError("‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö Anthropic API key!")

        self.client = anthropic.Anthropic(api_key=api_key)
        print(f"‚úÖ Anthropic client ready (Model: {self.model})")

    def clean_ocr_text(self, text: str, page_num: int = 1) -> Dict:
        """Send OCR text to the LLM for correction and record the usage.

        Args:
            text: Raw OCR text to correct.
            page_num: Currently unused; kept so existing callers keep working.

        Returns:
            A dict with the keys ``'cleaned_text'`` (str), ``'tokens_used'``
            (int), ``'cost'`` (float, USD) and ``'model'`` (str).

        Raises:
            Exception: whatever the provider SDK raised once all retries
                have been used up.
        """
        prompt = self._create_prompt(text)

        # Dispatch to the provider that matches the model name.
        if 'gpt' in self.model:
            result = self._call_openai(prompt)
        else:
            result = self._call_anthropic(prompt)

        # Running totals across all calls made through this client.
        self.total_tokens += result['tokens_used']
        self.total_cost += result['cost']

        return result

    def _create_prompt(self, text: str) -> str:
        """Build the OCR-correction prompt with ``text`` embedded in it."""
        return f"""‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° OCR ‡∏à‡∏≤‡∏Å‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏ô‡∏µ‡πâ

‡∏Å‡∏é‡∏Å‡∏≤‡∏£‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç:
1. ‡πÅ‡∏Å‡πâ‡πÄ‡∏â‡∏û‡∏≤‡∏∞ typo ‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∞‡∏Å‡∏î‡∏ú‡∏¥‡∏î
2. ‡πÅ‡∏Å‡πâ‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡∏Ç‡∏≤‡∏î‡∏´‡∏≤‡∏¢/‡πÅ‡∏ï‡∏Å‡∏´‡∏±‡∏Å (‡πÄ‡∏ä‡πà‡∏ô "‡∏°‡∏≤ ‡∏Å‡∏≥‡∏•‡∏±‡∏á" ‚Üí "‡∏°‡∏≤‡∏Å‡∏≥‡∏•‡∏±‡∏á")
3. ‡∏•‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏°‡∏≤‡∏¢
4. ‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤ (‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡πÉ‡∏ô "...")
5. ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡πÉ‡∏´‡∏°‡πà
6. ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏°‡∏≤‡∏¢

‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° OCR:
{text}

‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏µ‡πà‡πÅ‡∏Å‡πâ‡πÅ‡∏•‡πâ‡∏ß:"""

    def _call_with_retry(self, request, provider: str,
                         max_retries: Optional[int] = None,
                         base_delay: float = 1.0):
        """Run ``request()`` and retry with exponential backoff on failure.

        The previous OpenAI retry code called ``_call_openai`` recursively
        from inside its own ``except`` block, so a persistent failure
        recursed without bound. This helper makes one initial attempt plus at
        most ``max_retries`` retries and then re-raises the last error.

        Args:
            request: Zero-argument callable performing the API call.
            provider: Provider name used in the error message.
            max_retries: Retries after the first failure; defaults to
                ``Config.MAX_RETRIES``.
            base_delay: Seconds slept before the first retry; doubled for
                each further retry.

        Returns:
            Whatever ``request()`` returns.

        Raises:
            Exception: the last error raised by ``request()``.
        """
        if max_retries is None:
            max_retries = Config.MAX_RETRIES

        attempt = 0
        while True:
            try:
                return request()
            except Exception as e:
                print(f"‚ùå {provider} API error: {e}")
                if attempt >= max_retries:
                    raise
                time.sleep(base_delay * (2 ** attempt))  # exponential backoff
                attempt += 1

    def _build_result(self, cleaned_text: str, tokens: int, default_price: float) -> Dict:
        """Assemble the result dict and compute the cost of one call.

        ``default_price`` (USD per 1,000 tokens) is used when the model has
        no entry in ``Config.PRICE_PER_1K_TOKENS``.
        """
        price_per_token = Config.PRICE_PER_1K_TOKENS.get(self.model, default_price) / 1000
        return {
            'cleaned_text': cleaned_text,
            'tokens_used': tokens,
            'cost': tokens * price_per_token,
            'model': self.model
        }

    def _call_openai(self, prompt: str) -> Dict:
        """Call the OpenAI chat-completions API (with bounded retries)."""
        def request():
            return self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏∑‡∏≠‡∏ú‡∏π‡πâ‡πÄ‡∏ä‡∏µ‡πà‡∏¢‡∏ß‡∏ä‡∏≤‡∏ç‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç OCR ‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢"},
                    {"role": "user", "content": prompt}
                ],
                temperature=Config.TEMPERATURE,
                max_tokens=Config.MAX_TOKENS
            )

        response = self._call_with_retry(request, "OpenAI")

        # content may be None; callers take len() of the cleaned text.
        cleaned_text = response.choices[0].message.content or ''
        tokens = response.usage.total_tokens
        return self._build_result(cleaned_text, tokens, 0.0005)

    def _call_anthropic(self, prompt: str) -> Dict:
        """Call the Anthropic messages API (with bounded retries)."""
        # NOTE(review): Anthropic's API normally expects a dated model id
        # (e.g. "claude-3-haiku-20240307"); confirm that the short name kept
        # in Config.MODEL is accepted.
        def request():
            return self.client.messages.create(
                model=self.model,
                max_tokens=Config.MAX_TOKENS,
                temperature=Config.TEMPERATURE,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

        response = self._call_with_retry(request, "Anthropic")

        cleaned_text = response.content[0].text
        tokens = response.usage.input_tokens + response.usage.output_tokens
        return self._build_result(cleaned_text, tokens, 0.00025)

    def get_usage_summary(self) -> Dict:
        """Summarise the API usage accumulated by this client.

        Returns:
            A dict with ``'total_tokens'``, ``'total_cost_usd'``,
            ``'total_cost_thb'`` (USD * 35, an approximation) and
            ``'pages_processed'`` (an estimate assuming 500 tokens per page).
        """
        return {
            'total_tokens': self.total_tokens,
            'total_cost_usd': self.total_cost,
            'total_cost_thb': self.total_cost * 35,  # approximate exchange rate
            'pages_processed': self.total_tokens // 500  # ~500 tokens per page
        }

print("‚úÖ LLM Client ready")

In [None]:

# ============================================
# 📌 Block 4: OCR Processor
# ============================================
class OCRProcessor:
    """Main pipeline: raw OCR file -> LLM correction -> validated clean file."""

    def __init__(self):
        """Create the LLM client and empty statistics / training-pair stores."""
        self.llm = LLMClient()
        self.stats = {
            'processed': 0,
            'failed': 0,
            'total_cost': 0
        }
        self.training_pairs = []

    def process_file(self, file_path: Path) -> Dict:
        """Clean one OCR file, validate the result and save it.

        Text longer than 3000 characters is split into chunks that are sent
        to the LLM one by one. The cleaned text is always written to
        ``Config.CLEANED_DIR`` (with a ``WARNING_`` prefix when validation
        fails) so it can be reviewed.

        Args:
            file_path: Path of the raw OCR text file (UTF-8).

        Returns:
            On success: ``{'success': True, 'cleaned_path': str,
            'tokens': int, 'cost': float, 'validation': dict}``, where
            ``tokens`` and ``cost`` are totals over every API call made for
            this file. On any error: ``{'success': False, 'error': str}``.
        """
        print(f"\nüìÑ Processing: {file_path.name}")

        try:
            raw_text = file_path.read_text(encoding='utf-8')

            # Usage for THIS file, summed over all chunks. Previously only
            # the last chunk's tokens/cost were reported.
            file_tokens = 0
            file_cost = 0.0

            if len(raw_text) > 3000:
                # Large file: clean it chunk by chunk.
                chunks = self._split_text(raw_text)
                cleaned_chunks = []

                for i, chunk in enumerate(chunks, 1):
                    print(f"   Chunk {i}/{len(chunks)}...")
                    result = self.llm.clean_ocr_text(chunk, i)
                    cleaned_chunks.append(result['cleaned_text'])
                    file_tokens += result.get('tokens_used', 0)
                    file_cost += result.get('cost', 0)
                    time.sleep(1)  # rate limiting

                cleaned_text = '\n\n'.join(cleaned_chunks)
            else:
                # Small file: send everything in one call.
                result = self.llm.clean_ocr_text(raw_text)
                cleaned_text = result['cleaned_text']
                file_tokens = result.get('tokens_used', 0)
                file_cost = result.get('cost', 0)

            # === VALIDATION ===
            print(f"   üîç Validating...")
            validation = QualityValidator.enhanced_validate(
                raw_text,
                cleaned_text,
                file_path.name
            )
            status = validation['status']

            # Show the validation outcome.
            if status == 'FAIL':
                print(f"   ‚ùå VALIDATION FAILED:")
                for issue in validation['issues']:
                    print(f"      {issue}")
            elif status == 'WARNING':
                print(f"   ‚ö†Ô∏è VALIDATION WARNING:")
                if validation['warnings']:
                    print(f"      {validation['warnings'][0]}")
                if validation['suspicious']:
                    print(f"      {validation['suspicious'][0]}")
            else:
                print(f"   ‚úÖ Validation passed (score: {validation['score']:.2f})")

            # Persist a validation report whenever something was flagged.
            if status in ['FAIL', 'WARNING']:
                val_report_path = QualityValidator.save_validation_report(validation)
                print(f"   üìä Validation report: {val_report_path.name}")

            # Save the result even when validation failed, for later review.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Failed validations are marked with a WARNING_ prefix.
            if status == 'FAIL':
                clean_filename = f"WARNING_{file_path.stem}_clean_{timestamp}.txt"
            else:
                clean_filename = f"{file_path.stem}_clean_{timestamp}.txt"

            clean_path = Path(Config.CLEANED_DIR) / clean_filename
            clean_path.write_text(cleaned_text, encoding='utf-8')

            # Keep a training pair only for files that passed validation.
            if status != 'FAIL':
                self.training_pairs.append({
                    'input': raw_text[:1000],
                    'output': cleaned_text[:1000],
                    'source': file_path.name,
                    'timestamp': timestamp,
                    'validation_score': validation['score']
                })

            # Update statistics (total_cost was never updated before).
            self.stats['processed'] += 1
            self.stats['total_cost'] += file_cost
            if status == 'FAIL':
                self.stats['validation_failed'] = self.stats.get('validation_failed', 0) + 1
            elif status == 'WARNING':
                self.stats['validation_warning'] = self.stats.get('validation_warning', 0) + 1

            print(f"   ‚úÖ Saved: {clean_filename}")
            print(f"   üí∞ Cost: ${file_cost:.4f}")

            return {
                'success': True,
                'cleaned_path': str(clean_path),
                'tokens': file_tokens,
                'cost': file_cost,
                'validation': validation
            }

        except Exception as e:
            # Per-file boundary: report the failure and let a batch continue.
            print(f"   ‚ùå Error: {e}")
            self.stats['failed'] += 1
            return {'success': False, 'error': str(e)}

    def _split_long_paragraph(self, para: str, max_chars: int) -> List[str]:
        """Break one oversize paragraph into pieces of at most ``max_chars``.

        Lines are packed greedily and re-joined with a newline; a single line
        that is itself longer than ``max_chars`` is cut into fixed-size
        slices.
        """
        pieces = []
        buffer = []
        buffer_len = 0

        for line in para.split('\n'):
            # Hard-slice a line that alone exceeds the limit.
            segments = [line[i:i + max_chars] for i in range(0, len(line), max_chars)] or [line]
            for segment in segments:
                # +1 accounts for the newline that joins it to the buffer.
                needed = len(segment) + (1 if buffer else 0)
                if buffer and buffer_len + needed > max_chars:
                    pieces.append('\n'.join(buffer))
                    buffer = []
                    buffer_len = 0
                    needed = len(segment)
                buffer.append(segment)
                buffer_len += needed

        if buffer:
            pieces.append('\n'.join(buffer))

        return pieces

    def _split_text(self, text: str, max_chars: int = 2500) -> List[str]:
        """Split long text into chunks of roughly ``max_chars`` characters.

        Paragraphs (separated by a blank line) are packed greedily into
        chunks. A paragraph longer than ``max_chars`` used to be emitted as
        one oversize chunk; it is now split on line boundaries first, so text
        without blank lines no longer ends up in a single huge request. Note
        that such pieces are separated by a blank line once the caller
        re-joins the cleaned chunks.

        Args:
            text: Text to split.
            max_chars: Target chunk size. The two-character paragraph
                separators are not counted, so a chunk built from several
                paragraphs can exceed it slightly.

        Returns:
            The list of chunks, in order.
        """
        # Expand oversize paragraphs into smaller units first.
        units = []
        for para in text.split('\n\n'):
            if max_chars > 0 and len(para) > max_chars:
                units.extend(self._split_long_paragraph(para, max_chars))
            else:
                units.append(para)

        chunks = []
        current_chunk = []
        current_length = 0

        for unit in units:
            unit_length = len(unit)

            if current_length + unit_length > max_chars and current_chunk:
                # Chunk is full: store it and start a new one.
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = [unit]
                current_length = unit_length
            else:
                current_chunk.append(unit)
                current_length += unit_length

        # Last (possibly partial) chunk.
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        return chunks

    def process_batch(self, file_pattern: str = "*.txt", limit: Optional[int] = None):
        """Process several files from ``Config.RAW_OCR_DIR``.

        Args:
            file_pattern: Glob pattern of the files to process.
            limit: Maximum number of files (``None`` or 0 = all).

        Returns:
            The list of per-file result dicts from :meth:`process_file`.
        """
        raw_dir = Path(Config.RAW_OCR_DIR)
        # Sorted so that `limit` selects a reproducible subset; glob order is
        # otherwise arbitrary.
        files = sorted(raw_dir.glob(file_pattern))

        if limit:
            files = files[:limit]

        print(f"\nüöÄ Processing {len(files)} files...")
        print("=" * 50)

        results = []
        for file_path in tqdm(files, desc="Processing"):
            results.append(self.process_file(file_path))

            # Rate limiting between files.
            time.sleep(0.5)

        self._print_summary(results)
        self._save_training_pairs()

        return results

    def _print_summary(self, results: List[Dict]):
        """Print totals and a validation breakdown for a batch of results."""
        successful = [r for r in results if r.get('success')]
        total_tokens = sum(r.get('tokens', 0) for r in successful)
        total_cost = sum(r.get('cost', 0) for r in successful)

        # Count results per validation status.
        validation_stats = {
            'PASS': 0,
            'WARNING': 0,
            'FAIL': 0
        }

        for r in successful:
            if 'validation' in r:
                status = r['validation'].get('status', 'UNKNOWN')
                validation_stats[status] = validation_stats.get(status, 0) + 1

        print("\n" + "=" * 50)
        print("üìä PROCESSING SUMMARY")
        print("=" * 50)
        print(f"‚úÖ Success: {len(successful)}/{len(results)}")
        print(f"‚ùå Failed: {len(results) - len(successful)}")
        print(f"üî§ Total tokens: {total_tokens:,}")
        print(f"üí∞ Total cost: ${total_cost:.4f} (~{total_cost*35:.2f} ‡∏ö‡∏≤‡∏ó)")

        # Validation summary.
        print(f"\nüìã Validation Summary:")
        print(f"   ‚úÖ Passed: {validation_stats.get('PASS', 0)}")
        print(f"   ‚ö†Ô∏è Warnings: {validation_stats.get('WARNING', 0)}")
        print(f"   ‚ùå Failed: {validation_stats.get('FAIL', 0)}")

        # Point the user at the reports when anything was flagged.
        if validation_stats.get('WARNING', 0) > 0:
            print(f"\nüí° ‡∏°‡∏µ {validation_stats['WARNING']} ‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà‡∏Ñ‡∏ß‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö")
            print(f"   ‡∏î‡∏π validation reports ‡πÉ‡∏ô: {Config.LOGS_DIR}")

        if validation_stats.get('FAIL', 0) > 0:
            print(f"\n‚ö†Ô∏è ‡∏°‡∏µ {validation_stats['FAIL']} ‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà validation ‡πÑ‡∏°‡πà‡∏ú‡πà‡∏≤‡∏ô")
            print(f"   ‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏´‡∏•‡πà‡∏≤‡∏ô‡∏µ‡πâ‡∏°‡∏µ prefix 'WARNING_' ‡πÉ‡∏ô cleaned folder")

        print(f"\nüìÅ Cleaned files saved to: {Config.CLEANED_DIR}")

        # API usage accumulated by the client.
        usage = self.llm.get_usage_summary()
        print(f"\nüìà API Usage:")
        print(f"   Model: {Config.MODEL}")
        print(f"   Tokens: {usage['total_tokens']:,}")
        print(f"   Cost: ${usage['total_cost_usd']:.4f} (~{usage['total_cost_thb']:.2f} ‡∏ö‡∏≤‡∏ó)")

    def _save_training_pairs(self):
        """Write the collected training pairs to a dated JSONL file.

        Does nothing when no pairs were collected. The file lives in
        ``Config.TRAINING_PAIRS_DIR`` and is opened in write mode, so an
        existing file for the same date is replaced.
        """
        if not self.training_pairs:
            return

        pairs_file = Path(Config.TRAINING_PAIRS_DIR) / f"pairs_{datetime.now().strftime('%Y%m%d')}.jsonl"

        with open(pairs_file, 'w', encoding='utf-8') as f:
            for pair in self.training_pairs:
                f.write(json.dumps(pair, ensure_ascii=False) + '\n')

        print(f"üíæ Training pairs saved: {pairs_file}")

print("‚úÖ OCR Processor ready")

In [None]:
# ============================================
# 📌 Block 5: Enhanced Quality Validator
# ============================================
class QualityValidator:
    """Heuristic quality checks that compare cleaned text with its raw OCR source."""

    @staticmethod
    def validate(raw_text: str, cleaned_text: str) -> Dict:
        """Run the basic checks: length change and quoted-span count.

        Args:
            raw_text: Original OCR text.
            cleaned_text: Text returned by the LLM.

        Returns:
            A dict with ``'valid'`` (no issues found), ``'score'`` (1.0 minus
            0.2 per issue and 0.05 per warning, clamped to 0..1),
            ``'issues'``, ``'warnings'``, ``'length_ratio'`` and
            ``'length_change_percent'``. For empty ``raw_text`` the ratio and
            the change are both 0.
        """
        issues = []
        warning_msgs = []

        # 1. Length: more than 20% change is an issue, more than 10% a warning.
        len_ratio = len(cleaned_text) / len(raw_text) if raw_text else 0
        len_change = (len(cleaned_text) - len(raw_text)) / len(raw_text) * 100 if raw_text else 0

        if len_ratio < 0.8:
            issues.append(f"‚ö†Ô∏è ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡πâ‡∏ô‡∏•‡∏á {abs(len_change):.1f}% (‡∏≠‡∏≤‡∏à‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡∏•‡∏ö‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤)")
        elif len_ratio > 1.2:
            issues.append(f"‚ö†Ô∏è ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß‡∏Ç‡∏∂‡πâ‡∏ô {len_change:.1f}% (‡∏≠‡∏≤‡∏à‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤)")
        elif len_ratio < 0.9:
            warning_msgs.append(f"üìù ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡πâ‡∏ô‡∏•‡∏á {abs(len_change):.1f}%")
        elif len_ratio > 1.1:
            warning_msgs.append(f"üìù ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß‡∏Ç‡∏∂‡πâ‡∏ô {len_change:.1f}%")

        # 2. Quoted spans ("..."): a large difference suggests lost dialogue.
        raw_quotes = len(re.findall(r'"[^"]*"', raw_text))
        clean_quotes = len(re.findall(r'"[^"]*"', cleaned_text))
        quote_diff = abs(raw_quotes - clean_quotes)

        if quote_diff > 3:
            issues.append(f"‚ö†Ô∏è ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô quotes ‡∏ï‡πà‡∏≤‡∏á‡∏Å‡∏±‡∏ô‡∏°‡∏≤‡∏Å ({raw_quotes} ‚Üí {clean_quotes})")
        elif quote_diff > 1:
            warning_msgs.append(f"üìù ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô quotes ‡∏ï‡πà‡∏≤‡∏á‡∏Å‡∏±‡∏ô ({raw_quotes} ‚Üí {clean_quotes})")

        # 3. Score.
        score = 1.0
        score -= len(issues) * 0.2
        score -= len(warning_msgs) * 0.05
        score = max(0, min(1, score))

        return {
            'valid': len(issues) == 0,
            'score': score,
            'issues': issues,
            'warnings': warning_msgs,
            'length_ratio': len_ratio,
            'length_change_percent': len_change
        }

    @staticmethod
    def enhanced_validate(raw_text: str, cleaned_text: str, filename: str = "") -> Dict:
        """Run the detailed validation and build a full report.

        Adds word-level changes, common-pattern checks and a short diff
        sample on top of :meth:`validate`.

        Args:
            raw_text: Original OCR text.
            cleaned_text: Text returned by the LLM.
            filename: Source file name recorded in the report.

        Returns:
            A report dict with ``'filename'``, ``'status'`` (``'FAIL'`` when
            there are issues, ``'WARNING'`` when there are warnings or
            suspicious changes, otherwise ``'PASS'``), ``'score'``,
            ``'stats'``, ``'issues'``, ``'warnings'``, ``'suspicious'`` (at
            most five entries), ``'diff_sample'`` and ``'timestamp'``.
        """
        issues = []
        warning_msgs = []
        suspicious_changes = []

        # 1. Basic validation.
        basic_result = QualityValidator.validate(raw_text, cleaned_text)
        issues.extend(basic_result['issues'])
        warning_msgs.extend(basic_result['warnings'])

        # 2. Whitespace-separated tokens that were added or removed.
        raw_words = set(raw_text.split())
        clean_words = set(cleaned_text.split())

        added_words = clean_words - raw_words
        removed_words = raw_words - clean_words

        # 3. Suspicious additions. Sorted so the report (and which five
        #    entries survive the cut below) does not depend on set order.
        for word in sorted(added_words):
            if len(word) > 15:
                # Tokens longer than 15 characters are treated as suspicious.
                suspicious_changes.append(f"‚ûï ‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏Ñ‡∏≥‡∏¢‡∏≤‡∏ß: '{word}'")
            elif word.isascii() and len(word) > 3 and word.lower() not in ['okay', 'yes', 'no']:
                # ASCII words are not expected in a Thai novel.
                suspicious_changes.append(f"‚ûï ‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏†‡∏≤‡∏©‡∏≤‡∏≠‡∏±‡∏á‡∏Å‡∏§‡∏©: '{word}'")

        # Many removed tokens at once is also suspicious.
        if len(removed_words) > 20:
            suspicious_changes.append(f"‚ûñ ‡∏Ñ‡∏≥‡∏´‡∏≤‡∏¢‡πÑ‡∏õ {len(removed_words)} ‡∏Ñ‡∏≥")

        # 4. Patterns that the LLM tends to alter.
        pattern_changes = QualityValidator._check_common_patterns(raw_text, cleaned_text)
        if pattern_changes:
            warning_msgs.extend(pattern_changes)

        # 5. Diff sample.
        diff_sample = QualityValidator._get_diff_sample(raw_text, cleaned_text)

        # 6. Report.
        status = 'PASS'
        if issues:
            status = 'FAIL'
        elif warning_msgs or suspicious_changes:
            status = 'WARNING'

        # Counted outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        raw_quote_chars = raw_text.count('"')
        clean_quote_chars = cleaned_text.count('"')

        report = {
            'filename': filename,
            'status': status,
            'score': basic_result['score'],
            'stats': {
                'length_change': f"{basic_result['length_change_percent']:+.1f}%",
                'words_added': len(added_words),
                'words_removed': len(removed_words),
                'quotes_change': f"{raw_quote_chars} ‚Üí {clean_quote_chars}"
            },
            'issues': issues,
            'warnings': warning_msgs,
            'suspicious': suspicious_changes[:5],  # only the first five
            'diff_sample': diff_sample,
            'timestamp': datetime.now().isoformat()
        }

        return report

    @staticmethod
    def _check_common_patterns(raw_text: str, cleaned_text: str) -> List[str]:
        """Return warnings for patterns that frequently change during cleaning.

        Checks the repetition mark (counted with and without surrounding
        spaces) and the number of digit runs.
        """
        warning_msgs = []

        # Repetition mark: space-delimited occurrences vs. all the others.
        raw_yamok_space = raw_text.count(' ‡πÜ ')
        clean_yamok_space = cleaned_text.count(' ‡πÜ ')
        raw_yamok_no_space = raw_text.count('‡πÜ') - raw_yamok_space
        clean_yamok_no_space = cleaned_text.count('‡πÜ') - clean_yamok_space

        if raw_yamok_space != clean_yamok_space or raw_yamok_no_space != clean_yamok_no_space:
            warning_msgs.append(f"üìù ‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö '‡πÜ' ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô (space: {raw_yamok_space}‚Üí{clean_yamok_space}, no-space: {raw_yamok_no_space}‚Üí{clean_yamok_no_space})")

        # Digit runs: a difference of more than two is reported.
        raw_numbers = len(re.findall(r'\d+', raw_text))
        clean_numbers = len(re.findall(r'\d+', cleaned_text))
        if abs(raw_numbers - clean_numbers) > 2:
            warning_msgs.append(f"üìù ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô ({raw_numbers} ‚Üí {clean_numbers})")

        return warning_msgs

    @staticmethod
    def _get_diff_sample(raw_text: str, cleaned_text: str, max_lines: int = 3) -> List[str]:
        """Return up to ``max_lines`` changed lines from the first 500 characters.

        Added lines and removed lines get different marker prefixes; each
        line is truncated to 100 characters.
        """
        import difflib

        # Only the head of each text is compared, line by line.
        raw_lines = raw_text[:500].split('\n')
        clean_lines = cleaned_text[:500].split('\n')

        diff = difflib.unified_diff(
            raw_lines,
            clean_lines,
            lineterm='',
            n=0
        )

        changes = []
        for line in diff:
            # Skip the '+++' / '---' file header lines of the unified diff.
            if line.startswith('+') and not line.startswith('+++'):
                changes.append(f"‚úÖ {line[1:][:100]}")  # limit to 100 chars
            elif line.startswith('-') and not line.startswith('---'):
                changes.append(f"‚ùå {line[1:][:100]}")

        return changes[:max_lines]

    @staticmethod
    def save_validation_report(report: Dict, output_dir: str = None):
        """Write ``report`` as a JSON file and return its path.

        Args:
            report: Report dict as produced by :meth:`enhanced_validate`.
            output_dir: Target directory; defaults to ``Config.LOGS_DIR``.
                It is created if missing.

        Returns:
            ``Path`` of the written file, named
            ``validation_<filename without .txt>_<timestamp>.json``.
        """
        if output_dir is None:
            output_dir = Config.LOGS_DIR

        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Build the file name.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = report.get('filename', 'unknown').replace('.txt', '')
        report_file = Path(output_dir) / f"validation_{filename}_{timestamp}.json"

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)

        return report_file

print("‚úÖ Enhanced Quality Validator ready")

In [None]:
# ============================================
# 📌 Block 6: Main Menu
# ============================================
def main_menu():
    """Run the interactive console menu until the user chooses Exit.

    One ``OCRProcessor`` is created up front and shared by all menu actions:
    single-file processing, batch processing, usage report, sample test,
    statistics and model settings.
    """

    processor = OCRProcessor()

    while True:
        print("""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë     OCR PROCESSING WITH API v1.0            ‚ïë
‚ïë          Automated Thai Novel OCR           ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù

[1] üöÄ Process ‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß
[2] üì¶ Process ‡∏´‡∏•‡∏≤‡∏¢‡πÑ‡∏ü‡∏•‡πå (Batch)
[3] üí∞ Check API usage & cost
[4] üîß Test with sample text
[5] üìä View statistics
[6] ‚öôÔ∏è Settings
[7] ‚ùå Exit

        """)

        choice = input("Select (1-7): ").strip()

        if choice == '1':
            # Process a single file.
            print("\nüìÑ Single File Processing")
            print("-" * 40)

            # Sorted so the numbering is stable between runs.
            files = sorted(Path(Config.RAW_OCR_DIR).glob("*.txt"))

            if not files:
                print("‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô raw_ocr/")
                input("\nPress Enter to continue...")
                continue

            print("‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå:")
            for i, f in enumerate(files[:10], 1):
                print(f"  [{i}] {f.name}")

            file_idx = input("\n‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÑ‡∏ü‡∏•‡πå (number): ").strip()

            # Parse and range-check explicitly: a bare `except` used to hide
            # every error, and "0" or a negative number silently selected a
            # file counted from the end of the list.
            try:
                position = int(file_idx)
            except ValueError:
                position = 0

            if 1 <= position <= len(files):
                processor.process_file(files[position - 1])
            else:
                print("‚ùå Invalid selection")

            input("\nPress Enter to continue...")

        elif choice == '2':
            # Batch processing.
            print("\nüì¶ Batch Processing")
            print("-" * 40)

            limit_text = input("‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà‡∏à‡∏∞ process (Enter = ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î): ").strip()

            # Non-numeric or negative input used to crash or slice oddly.
            try:
                limit = int(limit_text) if limit_text else None
            except ValueError:
                limit = -1

            if limit is not None and limit < 0:
                print("‚ùå Invalid selection")
                input("\nPress Enter to continue...")
                continue

            confirm = input(f"\n‚ö†Ô∏è ‡∏à‡∏∞ process {limit or '‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î'} ‡πÑ‡∏ü‡∏•‡πå ‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£? (y/n): ")

            if confirm.lower() == 'y':
                processor.process_batch(limit=limit)
            else:
                print("‚ùå Cancelled")

            input("\nPress Enter to continue...")

        elif choice == '3':
            # API usage report.
            print("\nüí∞ API Usage & Cost")
            print("-" * 40)

            usage = processor.llm.get_usage_summary()
            print(f"Model: {Config.MODEL}")
            print(f"Total tokens: {usage['total_tokens']:,}")
            print(f"Total cost: ${usage['total_cost_usd']:.4f}")
            print(f"Total cost (THB): ~{usage['total_cost_thb']:.2f} ‡∏ö‡∏≤‡∏ó")
            print(f"Est. pages: ~{usage['pages_processed']}")

            input("\nPress Enter to continue...")

        elif choice == '4':
            # Send a built-in sample through the LLM.
            print("\nüîß Test with Sample")
            print("-" * 40)

            sample = """‡∏´‡∏≠‡∏õ‡∏£‡∏∞‡∏ä‡∏∏‡∏°‡πÄ‡∏õ‡πá‡∏ô‡∏≠‡∏≤‡∏Ñ‡∏≤‡∏£‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏´‡∏ô‡πâ‡∏≤‡∏ï‡∏≤‡∏Ñ‡∏•‡πâ‡∏≤‡∏¢‡∏ö‡πâ‡∏≤‡∏ô‡∏ä‡∏±‡πâ‡∏ô‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏ó‡∏±‡πà‡∏ß‡πÑ‡∏õ
‡πÅ‡∏ï‡πà‡∏°‡∏µ‡∏Ç‡∏ô‡∏≤‡∏î‡πÉ‡∏´‡∏ç‡πà‡∏Å‡∏ß‡πà‡∏≤‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢ ‡∏ä‡∏≤‡∏¢‡∏´‡∏ç‡∏¥‡∏á‡πÉ‡∏ô‡∏ä‡∏∏‡∏î‡πÑ‡∏ß‡πâ‡∏ó‡∏∏‡∏Å‡∏Ç‡πå‡∏´‡∏•‡∏≤‡∏¢‡∏Ñ‡∏ô
‡πÄ‡∏î‡∏¥‡∏ô‡∏Ç‡∏ß‡∏±‡∏Å‡πÑ‡∏Ç‡∏ß‡πà‡πÑ‡∏õ‡∏°‡∏≤‡∏ó‡πà‡∏≤‡∏°‡∏Å‡∏•‡∏≤‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ß‡∏∏‡πà‡∏ô‡∏ß‡∏≤‡∏¢"""

            print("Sample text:")
            print(sample)
            print("\nProcessing...")

            # An API failure should return to the menu, not end the session.
            try:
                result = processor.llm.clean_ocr_text(sample)
            except Exception as e:
                print(f"‚ùå API test failed: {e}")
            else:
                print("\nCleaned text:")
                print(result['cleaned_text'])
                print(f"\nTokens: {result['tokens_used']}")
                print(f"Cost: ${result['cost']:.4f}")

            input("\nPress Enter to continue...")

        elif choice == '5':
            # File counts and this session's success rate.
            print("\nüìä Statistics")
            print("-" * 40)

            raw_files = len(list(Path(Config.RAW_OCR_DIR).glob("*.txt")))
            clean_files = len(list(Path(Config.CLEANED_DIR).glob("*.txt")))

            print(f"Raw OCR files: {raw_files}")
            print(f"Cleaned files: {clean_files}")
            print(f"Success rate: {processor.stats['processed']}/{processor.stats['processed'] + processor.stats['failed']}")

            input("\nPress Enter to continue...")

        elif choice == '6':
            # Show settings and optionally switch the model.
            print("\n‚öôÔ∏è Settings")
            print("-" * 40)
            print(f"Current model: {Config.MODEL}")
            print(f"Temperature: {Config.TEMPERATURE}")
            print(f"Max tokens: {Config.MAX_TOKENS}")

            change = input("\nChange model? (y/n): ")
            if change.lower() == 'y':
                print("\nAvailable models:")
                print("[1] gpt-4o-mini (cheapest)")
                print("[2] gpt-3.5-turbo")
                print("[3] claude-3-haiku")

                model_choice = input("Select: ").strip()
                new_model = {
                    '1': 'gpt-4o-mini',
                    '2': 'gpt-3.5-turbo',
                    '3': 'claude-3-haiku',
                }.get(model_choice)

                if new_model is None:
                    # Previously an invalid choice still reported a change.
                    print("‚ùå Invalid choice")
                else:
                    previous_model = Config.MODEL
                    previous_llm = processor.llm
                    Config.MODEL = new_model
                    try:
                        new_llm = LLMClient()  # reinitialise for the new model
                    except (ValueError, ImportError) as e:
                        # Missing key or SDK: keep the working client.
                        Config.MODEL = previous_model
                        print(f"‚ùå Error: {e}")
                    else:
                        # Carry the usage totals over so the cost report
                        # still covers the whole session.
                        new_llm.total_tokens = previous_llm.total_tokens
                        new_llm.total_cost = previous_llm.total_cost
                        processor.llm = new_llm
                        print(f"‚úÖ Model changed to: {Config.MODEL}")

            input("\nPress Enter to continue...")

        elif choice == '7':
            print("\nüëã Goodbye!")
            break

        else:
            print("‚ùå Invalid choice")

print("‚úÖ Main menu ready")

In [None]:
# ============================================
# 📌 Block 7: Quick Start Functions
# ============================================

def quick_setup():
    """Collect an API key interactively if none is configured, then test it.

    When neither ``Config.OPENAI_API_KEY`` nor ``Config.ANTHROPIC_API_KEY`` is
    set, the user is asked to pick a provider and type a key. The key and a
    matching default model are stored on ``Config``. Afterwards a single
    ``LLMClient.clean_ocr_text`` request is sent to check the connection.

    Returns:
        bool: True when the test request succeeded; False when no key was
        entered or the test request raised an exception.
    """
    # Local import: getpass reads the key without echoing it, so the secret
    # does not end up in the terminal scroll-back or saved notebook output.
    from getpass import getpass

    print("\nüîß Quick Setup")
    print("=" * 50)

    # Only prompt when no key has been configured for either provider.
    if not Config.OPENAI_API_KEY and not Config.ANTHROPIC_API_KEY:
        print("\n‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö API key!")
        print("\n‡∏ß‡∏¥‡∏ò‡∏µ‡πÉ‡∏™‡πà API key:")
        print("1. ‡πÅ‡∏Å‡πâ‡πÉ‡∏ô Config class ‡∏î‡πâ‡∏≤‡∏ô‡∏ö‡∏ô")
        print("2. ‡∏´‡∏£‡∏∑‡∏≠ set environment variable:")

        # Menu choice -> (display name, Config attribute for the key, default model).
        providers = {
            '1': ("OpenAI", "OPENAI_API_KEY", 'gpt-4o-mini'),
            '2': ("Anthropic", "ANTHROPIC_API_KEY", 'claude-3-haiku'),
        }
        provider_prompt = "\n‡πÉ‡∏ä‡πâ [1] OpenAI ‡∏´‡∏£‡∏∑‡∏≠ [2] Anthropic? : "

        provider = input(provider_prompt).strip()
        # Re-prompt rather than silently treating every unrecognised answer
        # (typo, empty input, ...) as "Anthropic".
        while provider not in providers:
            print("‚ùå Invalid choice")
            provider = input(provider_prompt).strip()

        label, key_attr, model = providers[provider]
        key = getpass(f"Enter {label} API key: ").strip()
        if not key:
            # An empty key can never authenticate; skip the pointless request.
            print("‚ùå No API key entered")
            return False

        setattr(Config, key_attr, key)
        Config.MODEL = model

    # Test connection with a tiny request (the Thai text means "test API").
    print("\nüîç Testing API connection...")
    try:
        client = LLMClient()
        result = client.clean_ocr_text("‡∏ó‡∏î‡∏™‡∏≠‡∏ö API")
        print("‚úÖ API connection successful!")
        print(f"   Model: {Config.MODEL}")
        print(f"   Test cost: ${result['cost']:.4f}")
        return True
    except Exception as e:
        # Boundary handler: any failure (bad key, network, missing SDK) is
        # reported to the user and turned into a False result.
        print(f"‚ùå API test failed: {e}")
        return False


def process_single_file_quick(filename: str):
    """Run the OCR processor on a single file and print a short report.

    Args:
        filename: Name of the file, resolved relative to ``Config.RAW_OCR_DIR``.

    Returns:
        None. The outcome (output path and cost, or the error) is printed.
    """
    # The processor is created first, before the existence check.
    ocr = OCRProcessor()
    source = Path(Config.RAW_OCR_DIR).joinpath(filename)

    if not source.exists():
        print(f"‚ùå File not found: {filename}")
        return

    outcome = ocr.process_file(source)

    if not outcome['success']:
        print(f"\n‚ùå Failed: {outcome.get('error')}")
        return

    print("\n‚úÖ Success!")
    print(f"   Output: {outcome['cleaned_path']}")
    # 35 is the hard-coded factor used for the approximate baht figure
    # (presumably a USD-to-THB exchange rate).
    usd = outcome['cost']
    print(f"   Cost: ${usd:.4f} (~{usd*35:.2f} ‡∏ö‡∏≤‡∏ó)")

In [None]:
# ============================================
# 📌 Block 8: Main Execution
# ============================================

# Script entry point: show the banner, make sure an API key is available
# (running the interactive quick setup if not), then start the menu loop.
if __name__ == "__main__":
    print("""
    ================================================================================
                      OCR PROCESSING WITH API v1.0
                         Automated Thai Novel OCR
    ================================================================================

    üéØ Features:
       - Automated OCR cleaning with GPT/Claude
       - Cost tracking & optimization
       - Quality validation
       - Training pairs collection

    üí∞ Estimated cost:
       - GPT-4o-mini: ~0.01 ‡∏ö‡∏≤‡∏ó/‡∏´‡∏ô‡πâ‡∏≤
       - 100 ‡∏´‡∏ô‡πâ‡∏≤ = ~1 ‡∏ö‡∏≤‡∏ó
       - 1,000 ‡∏´‡∏ô‡πâ‡∏≤ = ~10 ‡∏ö‡∏≤‡∏ó

    """)

    # Run the quick setup only when no API key has been configured yet.
    if not Config.OPENAI_API_KEY and not Config.ANTHROPIC_API_KEY:
        print("üìù ‡∏ï‡πâ‡∏≠‡∏á setup API key ‡∏Å‡πà‡∏≠‡∏ô")
        if quick_setup():
            print("\n‚úÖ Setup complete! Ready to use")
        else:
            print("\n‚ùå Setup failed. Please check API key")
            # `exit` is a convenience injected by the `site` module for
            # interactive shells and is not guaranteed to exist or to raise.
            # Raising SystemExit directly reliably stops execution here so the
            # menu below is never started without a working key.
            # NOTE(review): notebook kernels may rebind `exit` - confirm.
            raise SystemExit(1)

    # Run main menu
    print("\nüöÄ Starting main menu...")
    main_menu()

    print("\nüéâ Thank you for using OCR Processor!")