<a href="https://colab.research.google.com/github/tanatet8/Colab_Script/blob/main/ThaiNovel_OCR_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
================================================================================
                OCR PROCESSING WITH API v3.0 - FIXED VERSION
                       Enhanced Thai Novel OCR Processor
================================================================================
"""

# ============================================
# üìå Block 1: Setup & Import
# ============================================
import os
import re
import json
import time
import random
import unicodedata
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Mount Drive (‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Colab)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except:
    IN_COLAB = False

print("‚úÖ Libraries loaded")

Mounted at /content/drive
‚úÖ Libraries loaded


In [2]:
# ============================================
# üìå Block 2: API Key Loading & Configuration
# ============================================

def load_api_key():
    """‡πÇ‡∏´‡∏•‡∏î API key ‡∏à‡∏≤‡∏Å‡πÑ‡∏ü‡∏•‡πå (‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö‡∏ó‡∏±‡πâ‡∏á .env ‡πÅ‡∏•‡∏∞ .env.txt)"""
    base_path = "/content/drive/MyDrive/OCR" if IN_COLAB else "./OCR"

    # ‡∏•‡∏≠‡∏á‡∏´‡∏≤‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏™‡∏≠‡∏á‡πÅ‡∏ö‡∏ö
    possible_files = [
        f"{base_path}/openai.env",
        f"{base_path}/openai.env.txt"
    ]

    for env_path in possible_files:
        try:
            # ‡∏≠‡πà‡∏≤‡∏ô‡∏Ñ‡πà‡∏≤ key
            with open(env_path, "r") as f:
                key = f.read().strip()

            if key and key.startswith('sk-'):
                # ‡∏ï‡∏±‡πâ‡∏á environment variable
                os.environ["OPENAI_API_KEY"] = key
                print(f"‚úÖ OpenAI API key loaded from: {Path(env_path).name}")
                return key
            else:
                print(f"‚ö†Ô∏è Invalid API key format in {Path(env_path).name}")
                continue

        except FileNotFoundError:
            continue
        except Exception as e:
            print(f"‚ùå Error reading {env_path}: {e}")
            continue

    # ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏î ‡πÜ
    print("‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå API key ‡∏´‡∏£‡∏∑‡∏≠‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡πÑ‡∏°‡πà‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á")
    print("   ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏î‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏ô‡∏∂‡πà‡∏á:")
    for path in possible_files:
        print(f"   - {path}")
    print("   ‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤: sk-xxxxxxxxxxxxxxxxxxxxxxxx")
    return None

class Config:
    """Configuration ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö API Processing"""

    # ‡πÇ‡∏´‡∏•‡∏î API key ‡∏à‡∏≤‡∏Å‡πÑ‡∏ü‡∏•‡πå
    OPENAI_API_KEY = load_api_key()
    ANTHROPIC_API_KEY = ""  # ‡πÉ‡∏™‡πà Anthropic API key ‡∏ñ‡πâ‡∏≤‡πÉ‡∏ä‡πâ Claude

    # ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å Model
    MODEL = "gpt-4o-mini"  # ‡∏ñ‡∏π‡∏Å‡∏™‡∏∏‡∏î ‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥!
    # MODEL = "gpt-3.5-turbo"
    # MODEL = "claude-3-haiku"

    # Paths (Google Drive)
    BASE = '/content/drive/MyDrive/OCR' if IN_COLAB else './OCR'

    RAW_OCR_DIR = f'{BASE}/raw_ocr'
    CLEANED_DIR = f'{BASE}/cleaned'
    CORPUS_DIR = f'{BASE}/final_corpus'
    TRAINING_PAIRS_DIR = f'{BASE}/training_pairs'
    LOGS_DIR = f'{BASE}/logs'

    # Processing settings
    MAX_PAGES_PER_BATCH = 8  # ‡∏•‡∏î‡∏•‡∏á‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏ñ‡∏µ‡∏¢‡∏£
    MAX_RETRIES = 3
    TEMPERATURE = 0.05  # ‡∏•‡∏î‡∏•‡∏á‡πÉ‡∏´‡πâ consistent ‡∏°‡∏≤‡∏Å‡∏Ç‡∏∂‡πâ‡∏ô
    MAX_TOKENS = 8000
    CONTEXT_OVERLAP = 100  # ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£‡∏ó‡∏µ‡πà overlap ‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á chunk

    # Cost tracking (‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏Ç‡∏∂‡πâ‡∏ô - ‡πÅ‡∏¢‡∏Å input/output)
    PRICE_PER_1K_TOKENS = {
        'gpt-4o-mini': {'input': 0.00015, 'output': 0.0006},  # $0.15/$0.60 per 1M
        'gpt-3.5-turbo': {'input': 0.0005, 'output': 0.0015},
        'claude-3-haiku': {'input': 0.00025, 'output': 0.00125}
    }

# ‡∏™‡∏£‡πâ‡∏≤‡∏á folders
for folder in [Config.RAW_OCR_DIR, Config.CLEANED_DIR, Config.CORPUS_DIR,
               Config.TRAINING_PAIRS_DIR, Config.LOGS_DIR]:
    Path(folder).mkdir(parents=True, exist_ok=True)

print("‚úÖ Config loaded")

‚úÖ OpenAI API key loaded from: openai.env
‚úÖ Config loaded


In [3]:
# ============================================
# üìå Block 3: Usage Logger
# ============================================
class UsageLogger:
    """‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô API ‡πÄ‡∏õ‡πá‡∏ô CSV ‡∏û‡∏£‡πâ‡∏≠‡∏° detailed tracking"""

    def __init__(self):
        self.log_file = Path(Config.LOGS_DIR) / "usage.csv"
        self.filename_map_file = Path(Config.LOGS_DIR) / "filename_mapping.json"
        self._init_csv()

    def _init_csv(self):
        """‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå CSV ‡∏ñ‡πâ‡∏≤‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏°‡∏µ"""
        if not self.log_file.exists():
            columns = [
                'timestamp', 'original_filename', 'clean_filename', 'pages_count',
                'model', 'input_tokens', 'output_tokens', 'total_tokens',
                'cost_usd', 'cost_thb', 'processing_time_sec', 'retry_count',
                'validation_status'
            ]
            df = pd.DataFrame(columns=columns)
            df.to_csv(self.log_file, index=False, encoding='utf-8')

    def log_usage(self, original_filename: str, clean_filename: str, pages_count: int,
                  model: str, input_tokens: int, output_tokens: int, cost_usd: float,
                  processing_time: float = 0, retry_count: int = 0,
                  validation_status: str = 'PASS'):
        """‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô"""

        new_row = {
            'timestamp': datetime.now().isoformat(),
            'original_filename': original_filename,
            'clean_filename': clean_filename,
            'pages_count': pages_count,
            'model': model,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'total_tokens': input_tokens + output_tokens,
            'cost_usd': cost_usd,
            'cost_thb': cost_usd * 35,  # ‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì
            'processing_time_sec': processing_time,
            'retry_count': retry_count,
            'validation_status': validation_status
        }

        # ‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏•‡∏á CSV
        df = pd.DataFrame([new_row])
        df.to_csv(self.log_file, mode='a', header=False, index=False, encoding='utf-8')

        # ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å filename mapping
        self._save_filename_mapping(original_filename, clean_filename)

    def _save_filename_mapping(self, original: str, cleaned: str):
        """‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å mapping ‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏¥‡∏°‡πÅ‡∏•‡∏∞‡πÉ‡∏´‡∏°‡πà"""
        mapping = {}
        if self.filename_map_file.exists():
            try:
                with open(self.filename_map_file, 'r', encoding='utf-8') as f:
                    mapping = json.load(f)
            except:
                pass

        mapping[cleaned] = {
            'original': original,
            'timestamp': datetime.now().isoformat()
        }

        with open(self.filename_map_file, 'w', encoding='utf-8') as f:
            json.dump(mapping, f, ensure_ascii=False, indent=2)

    def get_summary(self) -> Dict:
        """‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î"""
        try:
            df = pd.read_csv(self.log_file, encoding='utf-8')

            if df.empty:
                return {'total_files': 0, 'total_cost_usd': 0, 'total_tokens': 0}

            return {
                'total_files': len(df),
                'total_pages': df['pages_count'].sum(),
                'total_tokens': df['total_tokens'].sum(),
                'input_tokens': df['input_tokens'].sum(),
                'output_tokens': df['output_tokens'].sum(),
                'total_cost_usd': df['cost_usd'].sum(),
                'total_cost_thb': df['cost_thb'].sum(),
                'avg_cost_per_page': df['cost_usd'].sum() / df['pages_count'].sum() if df['pages_count'].sum() > 0 else 0,
                'avg_processing_time': df['processing_time_sec'].mean(),
                'most_used_model': df['model'].mode()[0] if len(df) > 0 else 'N/A',
                'validation_stats': df['validation_status'].value_counts().to_dict() if 'validation_status' in df.columns else {}
            }
        except Exception as e:
            print(f"Warning: Error reading usage log: {e}")
            return {'total_files': 0, 'total_cost_usd': 0, 'total_tokens': 0}

print("‚úÖ Enhanced Usage Logger ready")

‚úÖ Enhanced Usage Logger ready


In [4]:
# ============================================
# üìå Block 4: Thai Text Utilities
# ============================================
class ThaiTextUtils:
    """Utilities ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢"""

    @staticmethod
    def normalize_unicode(text: str) -> str:
        """Normalize Unicode characters"""
        # ‡∏Å‡∏≥‡∏à‡∏±‡∏î zero-width characters
        text = re.sub(r'[\u200b\ufeff\u00a0]', '', text)
        # Normalize Unicode form
        text = unicodedata.normalize('NFC', text)
        # ‡∏à‡∏±‡∏î‡∏£‡∏∞‡πÄ‡∏ö‡∏µ‡∏¢‡∏ö‡∏ß‡∏£‡∏£‡∏ì‡∏¢‡∏∏‡∏Å‡∏ï‡πå‡πÑ‡∏ó‡∏¢ (‡∏ñ‡πâ‡∏≤‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô)
        text = re.sub(r'([‡∏Å-‡∏Æ])([‡∏¥‡∏µ‡∏∂‡∏∑‡∏∏‡∏π])([‡πà‡πâ‡πä‡πã])', r'\1\3\2', text)
        return text

    @staticmethod
    def count_thai_quotes(text: str) -> Dict[str, int]:
        """‡∏ô‡∏±‡∏ö‡∏à‡∏≥‡∏ô‡∏ß‡∏ô '‡∏Ñ‡∏π‡πà' ‡∏Ç‡∏≠‡∏á‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏´‡∏°‡∏≤‡∏¢‡∏Ñ‡∏≥‡∏û‡∏π‡∏î (pairs)"""
        eng_double = text.count('"') // 2
        eng_single = text.count("'") // 2
        thai_double = (text.count('\u201c') + text.count('\u201d')) // 2  # ‚Äú ‚Äù
        thai_single = (text.count('\u2018') + text.count('\u2019')) // 2  # ‚Äò ‚Äô
        return {
            'english_double': eng_double,
            'english_single': eng_single,
            'thai_double': thai_double,
            'thai_single': thai_single,
        }

    @staticmethod
    def fix_yamok_spacing(text: str) -> str:
        """‡πÅ‡∏Å‡πâ‡∏õ‡∏±‡∏ç‡∏´‡∏≤‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ ‡πÜ (‡πÑ‡∏°‡πâ‡∏¢‡∏°‡∏Å)"""
        text = re.sub(r'\s+‡πÜ\s+', '‡πÜ ', text)
        text = re.sub(r'\s+‡πÜ(?=\s|$)', '‡πÜ', text)
        return text

    @staticmethod
    def detect_word_breaks(text: str) -> List[str]:
        """‡∏ï‡∏£‡∏ß‡∏à‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡πÅ‡∏ï‡∏Å‡∏´‡∏±‡∏Å (‡πÄ‡∏ä‡πà‡∏ô '‡∏°‡∏≤ ‡∏Å‡∏≥‡∏•‡∏±‡∏á')"""
        broken_patterns = [
            r'([‡∏Å-‡∏Æ])\s+([‡∏Å-‡∏Æ‡∏¥‡∏µ‡∏∂‡∏∑‡∏∏‡∏π‡πç‡πå‡πà‡πâ‡πä‡πã])',      # ‡∏û‡∏¢‡∏±‡∏ç‡∏ä‡∏ô‡∏∞ + ‡πÄ‡∏ß‡πâ‡∏ô‡∏ß‡∏£‡∏£‡∏Ñ + ‡∏™‡∏£‡∏∞/‡∏ß‡∏£‡∏£‡∏ì‡∏¢‡∏∏‡∏Å‡∏ï‡πå
            r'([‡∏Å-‡∏Æ‡∏¥‡∏µ‡∏∂‡∏∑‡∏∏‡∏π])\s+([‡∏Å-‡∏Æ]{1}(?![‡∏Å-‡∏Æ]))', # ‡∏Ñ‡∏≥‡∏™‡∏±‡πâ‡∏ô ‡πÜ ‡∏ó‡∏µ‡πà‡πÅ‡∏¢‡∏Å
        ]
        issues = []
        for pattern in broken_patterns:
            matches = re.findall(pattern, text)
            issues.extend([f"{m[0]} {m[1]}" for m in matches])
        return list(set(issues))

print("‚úÖ Thai Text Utilities ready")

‚úÖ Thai Text Utilities ready


In [5]:
# ============================================
# üìå Block 5: Multi-page Parser (XML Enhanced)
# ============================================
class MultiPageParser:
    """‡πÅ‡∏¢‡∏Å‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏≤‡∏¢‡∏´‡∏ô‡πâ‡∏≤‡∏û‡∏£‡πâ‡∏≠‡∏° XML processing"""

    @staticmethod
    def parse_multipage_file(content: str) -> List[Dict]:
        """‡πÅ‡∏¢‡∏Å‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏´‡∏•‡∏≤‡∏¢‡∏´‡∏ô‡πâ‡∏≤‡∏≠‡∏≠‡∏Å‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£"""
        pages = []
        page_sections = re.split(r'--- PAGE:\s*(\d+)\s*---', content)
        if len(page_sections) > 1:
            for i in range(1, len(page_sections), 2):
                if i + 1 < len(page_sections):
                    page_num = int(page_sections[i])
                    page_content = page_sections[i + 1]
                    raw_text = ""
                    cleaned_text = ""
                    raw_match = re.search(
                        r'--- RAW ---(.*?)(?=--- CLEANED ---|--- PAGE:|$)',
                        page_content, re.DOTALL
                    )
                    if raw_match:
                        raw_text = raw_match.group(1)  # keep as-is
                    else:
                        raw_text = page_content
                    cleaned_match = re.search(
                        r'--- CLEANED ---(.*?)(?=--- PAGE:|$)',
                        page_content, re.DOTALL
                    )
                    if cleaned_match:
                        cleaned_text = cleaned_match.group(1).strip()
                        cleaned_text = ThaiTextUtils.normalize_unicode(cleaned_text)
                    pages.append({
                        'page_num': page_num,
                        'raw_text': raw_text,
                        'cleaned_text': cleaned_text
                    })
        return pages

    @staticmethod
    def parse_xml_result(xml_content: str) -> List[Dict]:
        """‡πÅ‡∏¢‡∏Å‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ XML tags <page id="X">"""
        pages = []
        page_matches = re.findall(
            r'<page\s+id=[\"\'](\d+)[\"\']>(.*?)</page>',
            xml_content, re.DOTALL | re.IGNORECASE
        )
        for page_num_str, content in page_matches:
            content = ThaiTextUtils.normalize_unicode(content.strip())
            pages.append({
                'page_num': int(page_num_str),
                'cleaned_text': content
            })
        return pages

    @staticmethod
    def extract_metadata(content: str) -> Dict:
        """‡πÅ‡∏¢‡∏Å metadata ‡∏à‡∏≤‡∏Å header ‡∏Ç‡∏≠‡∏á‡πÑ‡∏ü‡∏•‡πå"""
        metadata = {}
        patterns = {
            'book_title': r'### üìò ‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠.*?:\s*(.*)',
            'chapter': r'### üßæ Chapter:\s*(.*)',
            'sub_chapter': r'### üîñ Sub-Chapter:\s*(.*)',
            'format': r'### üìÇ Format:\s*(.*)',
            'purpose': r'### üß† Purpose:\s*(.*)'
        }
        for key, pattern in patterns.items():
            match = re.search(pattern, content)
            if match:
                metadata[key] = match.group(1).strip()
        return metadata

print("‚úÖ Multi-page Parser ready")

‚úÖ Multi-page Parser ready


In [6]:
# ============================================
# üìå Block 6: Enhanced LLM Client with Metadata Analysis
# ============================================
class LLMClient:
    """Universal LLM Client with enhanced error handling ‡πÅ‡∏•‡∏∞ metadata analysis"""

    def __init__(self):
        self.model = Config.MODEL
        self.client = None
        self.total_tokens = 0
        self.total_cost = 0
        self.usage_logger = UsageLogger()
        if 'gpt' in self.model:
            self._init_openai()
        elif 'claude' in self.model:
            self._init_anthropic()

    def _init_openai(self):
        """Initialize OpenAI client (SDK v1.0+)"""
        try:
            from openai import OpenAI
            if not Config.OPENAI_API_KEY:
                raise ValueError("‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö OpenAI API key! ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡πÑ‡∏ü‡∏•‡πå key")
            self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
            print(f"‚úÖ OpenAI client ready (Model: {self.model})")
        except ImportError:
            print("‚ùå ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á: pip install --upgrade openai>=1.0.0")
            raise

    def _init_anthropic(self):
        """Initialize Anthropic client"""
        try:
            import anthropic
            api_key = Config.ANTHROPIC_API_KEY or os.getenv('ANTHROPIC_API_KEY')
            if not api_key:
                raise ValueError("‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö Anthropic API key!")
            self.client = anthropic.Anthropic(api_key=api_key)
            print(f"‚úÖ Anthropic client ready (Model: {self.model})")
        except ImportError:
            print("‚ùå ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á: pip install anthropic")
            raise

    def _create_multipage_xml_prompt(self, pages: List[Dict]) -> str:
        """‡∏™‡∏£‡πâ‡∏≤‡∏á prompt ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç OCR ‡∏´‡∏•‡∏≤‡∏¢‡∏´‡∏ô‡πâ‡∏≤"""
        page_texts = [f"--- ‡∏´‡∏ô‡πâ‡∏≤‡∏ó‡∏µ‡πà {p['page_num']} ---\n{p['raw_text']}" for p in pages]
        combined_text = "\n\n".join(page_texts)
        return f"""‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° OCR ‡∏à‡∏≤‡∏Å‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡∏´‡∏•‡∏≤‡∏¢‡∏´‡∏ô‡πâ‡∏≤‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏ô‡∏µ‡πâ

üö® ‡∏Å‡∏é‡∏Å‡∏≤‡∏£‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç (‡πÄ‡∏Ç‡πâ‡∏°‡∏á‡∏ß‡∏î):
1. ‡πÅ‡∏Å‡πâ‡πÄ‡∏â‡∏û‡∏≤‡∏∞ typo ‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∞‡∏Å‡∏î‡∏ú‡∏¥‡∏î
2. ‡πÅ‡∏Å‡πâ‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡∏Ç‡∏≤‡∏î‡∏´‡∏≤‡∏¢/‡πÅ‡∏ï‡∏Å‡∏´‡∏±‡∏Å (‡πÄ‡∏ä‡πà‡∏ô "‡∏°‡∏≤ ‡∏Å‡∏≥‡∏•‡∏±‡∏á" ‚Üí "‡∏°‡∏≤‡∏Å‡∏≥‡∏•‡∏±‡∏á")
3. ‡∏•‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏°‡∏≤‡∏¢ (‡πÄ‡∏ä‡πà‡∏ô ‡∏ï‡∏±‡∏ß ‡∏Å ‡∏≠ ‡∏¢ ‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß ‡πÜ)
4. ‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤ (‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡πÉ‡∏ô "...")
5. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ‡πÉ‡∏´‡∏°‡πà
6. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡∏•‡∏ö‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ
7. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏°‡∏≤‡∏¢
8. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡∏™‡∏£‡∏∏‡∏õ/‡πÄ‡∏£‡∏µ‡∏¢‡∏á‡πÉ‡∏´‡∏°‡πà
9. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡πÅ‡∏Å‡πâ‡∏ä‡∏∑‡πà‡∏≠‡∏ï‡∏±‡∏ß‡∏•‡∏∞‡∏Ñ‡∏£/‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà
10. ‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏•‡∏≥‡∏î‡∏±‡∏ö‡∏Ñ‡∏≥‡πÄ‡∏î‡∏¥‡∏°‡πÉ‡∏´‡πâ‡∏°‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î

üìã ‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå (‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö):
‡∏™‡πà‡∏á‡∏Ñ‡∏∑‡∏ô‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô:

<page num="1">
‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏µ‡πà‡πÅ‡∏Å‡πâ‡πÅ‡∏•‡πâ‡∏ß‡∏´‡∏ô‡πâ‡∏≤ 1
</page>
<page num="2">
‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏µ‡πà‡πÅ‡∏Å‡πâ‡πÅ‡∏•‡πâ‡∏ß‡∏´‡∏ô‡πâ‡∏≤ 2
</page>

‡∏´‡πâ‡∏≤‡∏°‡πÉ‡∏™‡πà‡∏Ñ‡∏≥‡∏≠‡∏ò‡∏¥‡∏ö‡∏≤‡∏¢ ‡∏Ñ‡∏≥‡∏ô‡∏≥ ‡∏´‡∏£‡∏∑‡∏≠‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏≠‡∏∑‡πà‡∏ô‡πÉ‡∏î

üìù ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° OCR:
{combined_text}

üîÑ ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå:"""

    def _create_single_page_prompt(self, text: str) -> str:
        """‡∏™‡∏£‡πâ‡∏≤‡∏á prompt ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç OCR ‡∏´‡∏ô‡πâ‡∏≤‡πÄ‡∏î‡∏µ‡∏¢‡∏ß"""
        return f"""‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° OCR ‡∏à‡∏≤‡∏Å‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏ô‡∏µ‡πâ

üö® ‡∏Å‡∏é‡∏Å‡∏≤‡∏£‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç (‡πÄ‡∏Ç‡πâ‡∏°‡∏á‡∏ß‡∏î):
1. ‡πÅ‡∏Å‡πâ‡πÄ‡∏â‡∏û‡∏≤‡∏∞ typo ‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∞‡∏Å‡∏î‡∏ú‡∏¥‡∏î
2. ‡πÅ‡∏Å‡πâ‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡∏Ç‡∏≤‡∏î‡∏´‡∏≤‡∏¢/‡πÅ‡∏ï‡∏Å‡∏´‡∏±‡∏Å (‡πÄ‡∏ä‡πà‡∏ô "‡∏°‡∏≤ ‡∏Å‡∏≥‡∏•‡∏±‡∏á" ‚Üí "‡∏°‡∏≤‡∏Å‡∏≥‡∏•‡∏±‡∏á")
3. ‡∏•‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏°‡∏≤‡∏¢
4. ‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤ (‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡πÉ‡∏ô "...")
5. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ‡πÉ‡∏´‡∏°‡πà
6. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡∏•‡∏ö‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ
7. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏°‡∏≤‡∏¢
8. ‚ùå ‡∏´‡πâ‡∏≤‡∏°‡πÅ‡∏Å‡πâ‡∏ä‡∏∑‡πà‡∏≠‡∏ï‡∏±‡∏ß‡∏•‡∏∞‡∏Ñ‡∏£/‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà

‡∏™‡πà‡∏á‡∏Ñ‡∏∑‡∏ô‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏µ‡πà‡πÅ‡∏Å‡πâ‡πÅ‡∏•‡πâ‡∏ß ‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ñ‡∏≥‡∏≠‡∏ò‡∏¥‡∏ö‡∏≤‡∏¢

‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° OCR:
{text}

‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏µ‡πà‡πÅ‡∏Å‡πâ‡πÅ‡∏•‡πâ‡∏ß:"""

    def _create_metadata_analysis_prompt(self, page_text: str, page_num: int, book_info: Dict) -> str:
        """‡∏™‡∏£‡πâ‡∏≤‡∏á prompt ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå metadata ‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏´‡∏ô‡πâ‡∏≤"""
        return f"""‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏ô‡∏µ‡πâ‡πÅ‡∏•‡∏∞‡∏™‡∏Å‡∏±‡∏î metadata ‡∏ï‡∏≤‡∏°‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ó‡∏µ‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î

üìö ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠:
- ‡∏ä‡∏∑‡πà‡∏≠‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á: {book_info.get('book_title', '‡πÑ‡∏°‡πà‡∏£‡∏∞‡∏ö‡∏∏')}
- ‡∏ö‡∏ó: {book_info.get('chapter', '‡πÑ‡∏°‡πà‡∏£‡∏∞‡∏ö‡∏∏')}
- ‡∏´‡∏ô‡πâ‡∏≤: {page_num}

üìù ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡∏´‡∏ô‡πâ‡∏≤:
{page_text[:2500]}  # ‡∏à‡∏≥‡∏Å‡∏±‡∏î 2500 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏õ‡∏£‡∏∞‡∏´‡∏¢‡∏±‡∏î token

üéØ ‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡πÅ‡∏•‡∏∞‡∏ï‡∏≠‡∏ö‡πÄ‡∏õ‡πá‡∏ô JSON ‡∏î‡∏±‡∏á‡∏ô‡∏µ‡πâ (‡∏´‡πâ‡∏≤‡∏°‡πÉ‡∏™‡πà comment ‡∏´‡∏£‡∏∑‡∏≠‡∏Ñ‡∏≥‡∏≠‡∏ò‡∏¥‡∏ö‡∏≤‡∏¢):
{{
  "tone": ["‡∏≠‡∏≤‡∏£‡∏°‡∏ì‡πå‡∏´‡∏•‡∏±‡∏Å‡∏Ç‡∏≠‡∏á‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ ‡πÄ‡∏ä‡πà‡∏ô dramatic, tense, romantic"],
  "tags": ["‡πÅ‡∏ó‡πá‡∏Å‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏ä‡∏±‡∏î‡πÄ‡∏à‡∏ô‡πÉ‡∏´‡πâ‡πÉ‡∏ä‡πâ general"],
  "characters": ["‡∏ä‡∏∑‡πà‡∏≠‡∏ï‡∏±‡∏ß‡∏•‡∏∞‡∏Ñ‡∏£‡∏ó‡∏µ‡πà‡∏õ‡∏£‡∏≤‡∏Å‡∏è‡πÉ‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô"],
  "places": ["‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà‡πÉ‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ"],
  "objects": ["‡∏ß‡∏±‡∏ï‡∏ñ‡∏∏‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡πÉ‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ"],
  "dialogue_pairs": ‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ñ‡∏π‡πà‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤,
  "char_count": ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£‡πÇ‡∏î‡∏¢‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì,
  "word_count": ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ñ‡∏≥‡πÇ‡∏î‡∏¢‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì,
  "paragraph_count": ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏¢‡πà‡∏≠‡∏´‡∏ô‡πâ‡∏≤,
  "style_notes": "‡∏™‡∏≥‡∏ô‡∏ß‡∏ô‡∏Å‡∏≤‡∏£‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏±‡∏á‡∏´‡∏ß‡∏∞‡∏Ç‡∏≠‡∏á‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ (1-2 ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î)",
  "summary": "‡∏™‡∏£‡∏∏‡∏õ 1 ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏¥‡∏ô 25 ‡∏Ñ‡∏≥",
  "anomalies": "‡∏™‡∏¥‡πà‡∏á‡∏ú‡∏¥‡∏î‡∏õ‡∏Å‡∏ï‡∏¥‡∏ñ‡πâ‡∏≤‡∏°‡∏µ ‡∏´‡∏£‡∏∑‡∏≠ null",
  "confidence": 0.85
}}

‡∏ï‡∏≠‡∏ö‡πÄ‡∏â‡∏û‡∏≤‡∏∞ JSON ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô ‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ñ‡∏≥‡∏≠‡∏ò‡∏¥‡∏ö‡∏≤‡∏¢:"""

    def _call_api_with_retry(self, prompt: str, filename: str, retry_count: int) -> Dict:
        """‡πÄ‡∏£‡∏µ‡∏¢‡∏Å API ‡∏û‡∏£‡πâ‡∏≠‡∏° retry mechanism"""
        for attempt in range(Config.MAX_RETRIES + 1):
            try:
                if 'gpt' in self.model:
                    return self._call_openai(prompt)
                else:
                    return self._call_anthropic(prompt)
            except Exception as e:
                error_type = type(e).__name__
                print(f"   ‚ùå API error (attempt {attempt + 1}): {error_type}")
                if attempt < Config.MAX_RETRIES:
                    base_delay = 2 ** attempt
                    jitter = random.uniform(0.5, 1.5)
                    delay = base_delay * jitter
                    print(f"   ‚è± Retrying in {delay:.1f}s...")
                    time.sleep(delay)
                else:
                    raise Exception(f"API failed after {Config.MAX_RETRIES + 1} attempts: {e}")

    def _call_openai(self, prompt: str) -> Dict:
        """‡πÄ‡∏£‡∏µ‡∏¢‡∏Å OpenAI API"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏∑‡∏≠‡∏ú‡∏π‡πâ‡πÄ‡∏ä‡∏µ‡πà‡∏¢‡∏ß‡∏ä‡∏≤‡∏ç‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢ ‡∏ï‡∏≠‡∏ö‡∏ï‡∏≤‡∏°‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ó‡∏µ‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÄ‡∏Ñ‡∏£‡πà‡∏á‡∏Ñ‡∏£‡∏±‡∏î"},
                {"role": "user", "content": prompt}
            ],
            temperature=Config.TEMPERATURE,
            max_tokens=Config.MAX_TOKENS
        )
        cleaned_text = response.choices[0].message.content
        total_tokens = response.usage.total_tokens
        input_tokens = getattr(response.usage, 'prompt_tokens', total_tokens // 2)
        output_tokens = getattr(response.usage, 'completion_tokens', total_tokens // 2)
        prices = Config.PRICE_PER_1K_TOKENS.get(self.model, {'input': 0.0005, 'output': 0.0015})
        input_cost = (input_tokens / 1000) * prices['input']
        output_cost = (output_tokens / 1000) * prices['output']
        total_cost = input_cost + output_cost
        return {
            'cleaned_text': cleaned_text,
            'tokens_used': total_tokens,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'cost': total_cost,
            'model': self.model
        }

    def _call_anthropic(self, prompt: str) -> Dict:
        """‡πÄ‡∏£‡∏µ‡∏¢‡∏Å Anthropic API"""
        response = self.client.messages.create(
            model=self.model,
            max_tokens=Config.MAX_TOKENS,
            temperature=Config.TEMPERATURE,
            messages=[{"role": "user", "content": prompt}]
        )
        cleaned_text = response.content[0].text
        input_tokens = getattr(response.usage, 'input_tokens', 0)
        output_tokens = getattr(response.usage, 'output_tokens', 0)
        total_tokens = input_tokens + output_tokens
        prices = Config.PRICE_PER_1K_TOKENS.get(self.model, {'input': 0.00025, 'output': 0.00125})
        input_cost = (input_tokens / 1000) * prices['input']
        output_cost = (output_tokens / 1000) * prices['output']
        total_cost = input_cost + output_cost
        return {
            'cleaned_text': cleaned_text,
            'tokens_used': total_tokens,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'cost': total_cost,
            'model': self.model
        }

    def analyze_page_metadata(self, page_text: str, page_num: int, book_info: Dict, filename: str = "") -> Dict:
        """
        ‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå metadata ‡∏Ç‡∏≠‡∏á‡∏´‡∏ô‡πâ‡∏≤‡πÄ‡∏î‡∏µ‡∏¢‡∏ß

        Args:
            page_text: ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå
            page_num: ‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏•‡∏Ç‡∏´‡∏ô‡πâ‡∏≤
            book_info: ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠ (title, chapter, etc.)
            filename: ‡∏ä‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö logging

        Returns:
            Dict ‡∏ó‡∏µ‡πà‡∏°‡∏µ metadata ‡∏Ç‡∏≠‡∏á‡∏´‡∏ô‡πâ‡∏≤
        """
        start_time = time.time()

        # ‡∏™‡∏£‡πâ‡∏≤‡∏á prompt
        prompt = self._create_metadata_analysis_prompt(page_text, page_num, book_info)

        # ‡πÄ‡∏£‡∏µ‡∏¢‡∏Å API
        result = self._call_api_with_retry(prompt, f"{filename}_meta_p{page_num}", 0)

        # ‡∏û‡∏¢‡∏≤‡∏¢‡∏≤‡∏° parse JSON ‡∏à‡∏≤‡∏Å response
        try:
            import json
            # ‡∏•‡∏ö markdown code block ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ
            response_text = result['cleaned_text']
            response_text = response_text.replace('```json\n', '').replace('\n```', '')
            response_text = response_text.replace('```', '')

            metadata = json.loads(response_text)

            # ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö required fields
            required_fields = ['tone', 'tags', 'characters', 'places', 'objects',
                             'dialogue_pairs', 'style_notes', 'summary', 'confidence']
            for field in required_fields:
                if field not in metadata:
                    metadata[field] = [] if field in ['tone', 'tags', 'characters', 'places', 'objects'] else ""

        except json.JSONDecodeError as e:
            print(f"   ‚ö†Ô∏è Failed to parse metadata for page {page_num}: {e}")
            # Return default metadata
            metadata = {
                'tone': ['unknown'],
                'tags': ['general'],
                'characters': [],
                'places': [],
                'objects': [],
                'dialogue_pairs': 0,
                'char_count': len(page_text),
                'word_count': len(page_text.split()),
                'paragraph_count': page_text.count('\n\n') + 1,
                'style_notes': '‡πÑ‡∏°‡πà‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡πÑ‡∏î‡πâ',
                'summary': '‡πÑ‡∏°‡πà‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏™‡∏£‡∏∏‡∏õ‡πÑ‡∏î‡πâ',
                'anomalies': 'JSON parsing failed',
                'confidence': 0.0
            }

        # ‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•
        processing_time = time.time() - start_time
        metadata['processing_time'] = processing_time
        metadata['tokens_used'] = result.get('tokens_used', 0)
        metadata['cost'] = result.get('cost', 0)

        # Log usage
        self.usage_logger.log_usage(
            original_filename=filename,
            clean_filename=f"{filename}_metadata_p{page_num}",
            pages_count=1,
            model=self.model,
            input_tokens=result.get('input_tokens', 0),
            output_tokens=result.get('output_tokens', 0),
            cost_usd=result.get('cost', 0),
            processing_time=processing_time,
            retry_count=0,
            validation_status='META_ANALYSIS'
        )

        self.total_tokens += result.get('tokens_used', 0)
        self.total_cost += result.get('cost', 0)

        return metadata

    def clean_multipage_ocr(self, pages: List[Dict], filename: str = "") -> Dict:
        """‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç OCR ‡∏´‡∏•‡∏≤‡∏¢‡∏´‡∏ô‡πâ‡∏≤‡∏û‡∏£‡πâ‡∏≠‡∏°‡∏Å‡∏±‡∏ô"""
        start_time = time.time()
        retry_count = 0
        prompt = self._create_multipage_xml_prompt(pages)
        result = self._call_api_with_retry(prompt, filename, retry_count)
        cleaned_pages = self._parse_xml_result(result['cleaned_text'], pages)
        processing_time = time.time() - start_time
        self.total_tokens += result['tokens_used']
        self.total_cost += result['cost']
        self.usage_logger.log_usage(
            original_filename=filename,
            clean_filename=f"{filename}_processed",
            pages_count=len(pages),
            model=self.model,
            input_tokens=result.get('input_tokens', 0),
            output_tokens=result.get('output_tokens', 0),
            cost_usd=result['cost'],
            processing_time=processing_time,
            retry_count=retry_count
        )
        return {
            'cleaned_pages': cleaned_pages,
            'tokens_used': result['tokens_used'],
            'cost': result['cost'],
            'processing_time': processing_time,
            'input_tokens': result.get('input_tokens', 0),
            'output_tokens': result.get('output_tokens', 0)
        }

    def clean_ocr_text(self, text: str, filename: str = "", page_num: int = 1) -> Dict:
        """‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç OCR ‡∏´‡∏ô‡πâ‡∏≤‡πÄ‡∏î‡∏µ‡∏¢‡∏ß"""
        start_time = time.time()
        retry_count = 0
        prompt = self._create_single_page_prompt(text)
        result = self._call_api_with_retry(prompt, filename, retry_count)
        processing_time = time.time() - start_time
        self.total_tokens += result['tokens_used']
        self.total_cost += result['cost']
        self.usage_logger.log_usage(
            original_filename=filename,
            clean_filename=f"{filename}_processed_p{page_num}",
            pages_count=1,
            model=self.model,
            input_tokens=result.get('input_tokens', 0),
            output_tokens=result.get('output_tokens', 0),
            cost_usd=result['cost'],
            processing_time=processing_time,
            retry_count=retry_count
        )
        return result

    def _parse_xml_result(self, cleaned_text: str, original_pages: List[Dict]) -> List[Dict]:
        """‡πÅ‡∏¢‡∏Å‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏à‡∏≤‡∏Å XML tags"""
        cleaned_pages = []
        xml_pages = MultiPageParser.parse_xml_result(cleaned_text)
        xml_dict = {p['page_num']: p['cleaned_text'] for p in xml_pages}
        for original_page in original_pages:
            page_num = original_page['page_num']
            cleaned_content = xml_dict.get(page_num, "")
            if not cleaned_content:
                print(f"   ‚ö†Ô∏è Warning: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏´‡∏ô‡πâ‡∏≤ {page_num}")
                cleaned_content = original_page['raw_text']
            cleaned_pages.append({
                'page_num': page_num,
                'raw_text': original_page['raw_text'],
                'cleaned_text': cleaned_content
            })
        return cleaned_pages

    def generate_text(self, prompt: str, filename: str = "", system: str = "You are a helpful assistant.") -> Dict:
        """Generate text ‡∏ó‡∏±‡πà‡∏ß‡πÑ‡∏õ"""
        start_time = time.time()
        retry_count = 0
        for attempt in range(Config.MAX_RETRIES + 1):
            try:
                if 'gpt' in self.model:
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=[{"role": "system", "content": system},
                                  {"role": "user", "content": prompt}],
                        temperature=Config.TEMPERATURE,
                        max_tokens=Config.MAX_TOKENS
                    )
                    result_text = response.choices[0].message.content
                    total_tokens = response.usage.total_tokens
                    input_tokens = getattr(response.usage, 'prompt_tokens', total_tokens // 2)
                    output_tokens = getattr(response.usage, 'completion_tokens', total_tokens // 2)
                else:
                    response = self.client.messages.create(
                        model=self.model,
                        max_tokens=Config.MAX_TOKENS,
                        temperature=Config.TEMPERATURE,
                        messages=[{"role": "user", "content": prompt}]
                    )
                    result_text = response.content[0].text
                    input_tokens = getattr(response.usage, 'input_tokens', 0)
                    output_tokens = getattr(response.usage, 'output_tokens', 0)
                    total_tokens = input_tokens + output_tokens

                prices = Config.PRICE_PER_1K_TOKENS.get(self.model, {'input': 0.0005, 'output': 0.0015})
                input_cost = (input_tokens / 1000) * prices['input']
                output_cost = (output_tokens / 1000) * prices['output']
                total_cost = input_cost + output_cost

                processing_time = time.time() - start_time
                self.usage_logger.log_usage(
                    original_filename=filename,
                    clean_filename=f"{filename}_generated",
                    pages_count=1,
                    model=self.model,
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    cost_usd=total_cost,
                    processing_time=processing_time,
                    retry_count=retry_count
                )
                self.total_tokens += total_tokens
                self.total_cost += total_cost
                return {
                    'text': result_text,
                    'tokens_used': total_tokens,
                    'input_tokens': input_tokens,
                    'output_tokens': output_tokens,
                    'cost': total_cost,
                    'processing_time': processing_time
                }
            except Exception as e:
                retry_count += 1
                if attempt < Config.MAX_RETRIES:
                    base_delay = 2 ** attempt
                    jitter = random.uniform(0.5, 1.5)
                    delay = base_delay * jitter
                    print(f"   Retry {attempt + 1}/{Config.MAX_RETRIES} in {delay:.1f}s...")
                    time.sleep(delay)
                else:
                    raise Exception(f"Generate text failed: {e}")

    def get_usage_summary(self) -> Dict:
        """‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô API"""
        file_summary = self.usage_logger.get_summary()
        return {
            'total_tokens': self.total_tokens,
            'total_cost_usd': self.total_cost,
            'total_cost_thb': self.total_cost * 35,
            'pages_processed': self.total_tokens // 500,
            'session_files': file_summary.get('total_files', 0),
            'session_pages': file_summary.get('total_pages', 0),
            'avg_cost_per_page': file_summary.get('avg_cost_per_page', 0),
            'detailed_stats': file_summary
        }

print("‚úÖ Enhanced LLM Client with Metadata Analysis ready")

‚úÖ Enhanced LLM Client with Metadata Analysis ready


In [7]:
# ============================================
# üìå Block 7: Enhanced Text Chunker
# ============================================
class EnhancedChunker:
    """‡πÅ‡∏ö‡πà‡∏á‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏î‡πâ‡∏ß‡∏¢ context overlap ‡πÅ‡∏•‡∏∞ smart boundary detection"""

    @staticmethod
    def smart_chunk_text(text: str, max_chars: int = 2500) -> List[str]:
        """‡πÅ‡∏ö‡πà‡∏á‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ç‡∏â‡∏•‡∏≤‡∏î‡∏î‡πâ‡∏ß‡∏¢ context overlap"""
        sentence_endings = ['.', '!', '?', '‚Ä¶', '"', '"']
        paragraph_break = '\n\n'

        chunks = []
        current_pos = 0
        half = max_chars // 2

        while current_pos < len(text):
            chunk_end = min(current_pos + max_chars, len(text))
            if chunk_end < len(text):
                para_pos = text.rfind(paragraph_break, current_pos, chunk_end)
                if para_pos > current_pos + half:
                    chunk_end = para_pos + len(paragraph_break)
                else:
                    best_pos = -1
                    for ending in sentence_endings:
                        pos = text.rfind(ending, current_pos + half, chunk_end)
                        if pos > best_pos:
                            best_pos = pos + len(ending)
                    if best_pos > current_pos:
                        chunk_end = best_pos

            chunk = text[current_pos:chunk_end]

            if chunks and Config.CONTEXT_OVERLAP > 0:
                overlap_start = max(0, current_pos - Config.CONTEXT_OVERLAP)
                overlap_text = text[overlap_start:current_pos]
                if overlap_text:
                    chunk = f"[‡∏ö‡∏£‡∏¥‡∏ö‡∏ó‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤: {overlap_text[-50:]}...]\n\n{chunk}"

            chunks.append(chunk)
            prev_pos = current_pos
            current_pos = chunk_end
            if current_pos == prev_pos and current_pos < len(text):
                current_pos += 1

        return chunks

    @staticmethod
    def merge_chunks_with_dedup(chunks: List[str]) -> str:
        """‡∏£‡∏ß‡∏° chunks ‡πÇ‡∏î‡∏¢‡∏•‡∏ö overlap ‡∏ã‡πâ‡∏≥"""
        if not chunks:
            return ""
        if len(chunks) == 1:
            return chunks[0]

        result = chunks[0]
        for i in range(1, len(chunks)):
            chunk = chunks[i]
            if chunk.startswith('[‡∏ö‡∏£‡∏¥‡∏ö‡∏ó‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤:'):
                cut = chunk.find(']\n\n')
                if cut != -1:
                    chunk = chunk[cut + 4:]
            result += chunk
        return result

print("‚úÖ Enhanced Chunker ready")

‚úÖ Enhanced Chunker ready


In [8]:
# ============================================
# üìå Block 8: Thai-specific Validator
# ============================================
class ThaiValidator:
    """‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡πÄ‡∏â‡∏û‡∏≤‡∏∞"""

    @staticmethod
    def validate_thai_text(raw_text: str, cleaned_text: str, filename: str = "") -> Dict:
        """‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡πÅ‡∏ö‡∏ö‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢"""
        issues, warnings = [], []

        # 1) ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß
        len_ratio = len(cleaned_text) / len(raw_text) if raw_text else 0
        len_change = (len(cleaned_text) - len(raw_text)) / len(raw_text) * 100 if raw_text else 0
        if len_ratio < 0.7:
            issues.append(f"‚ö†Ô∏è ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡πâ‡∏ô‡∏•‡∏á‡∏°‡∏≤‡∏Å {abs(len_change):.1f}% (‡∏≠‡∏≤‡∏à‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡∏•‡∏ö‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤)")
        elif len_ratio > 1.3:
            issues.append(f"‚ö†Ô∏è ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß‡∏Ç‡∏∂‡πâ‡∏ô‡∏°‡∏≤‡∏Å {len_change:.1f}% (‡∏≠‡∏≤‡∏à‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤)")
        elif len_ratio < 0.85:
            warnings.append(f"üìù ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡πâ‡∏ô‡∏•‡∏á {abs(len_change):.1f}%")
        elif len_ratio > 1.15:
            warnings.append(f"üìù ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß‡∏Ç‡∏∂‡πâ‡∏ô {len_change:.1f}%")

        # 2) ‡∏≠‡∏±‡∏ç‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®
        raw_quotes = ThaiTextUtils.count_thai_quotes(raw_text)
        clean_quotes = ThaiTextUtils.count_thai_quotes(cleaned_text)
        total_raw = sum(raw_quotes.values())
        total_clean = sum(clean_quotes.values())
        if abs(total_raw - total_clean) > 3:
            issues.append(f"‚ö†Ô∏è ‡∏≠‡∏±‡∏ç‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏°‡∏≤‡∏Å ({total_raw} ‚Üí {total_clean})")
        elif abs(total_raw - total_clean) > 1:
            warnings.append(f"üìù ‡∏≠‡∏±‡∏ç‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô ({total_raw} ‚Üí {total_clean})")

        # 3) ‡πÑ‡∏°‡πâ‡∏¢‡∏°‡∏Å
        raw_yamok = raw_text.count('‡πÜ')
        clean_yamok = cleaned_text.count('‡πÜ')
        if abs(raw_yamok - clean_yamok) > 2:
            warnings.append(f"üìù ‡πÑ‡∏°‡πâ‡∏¢‡∏°‡∏Å (‡πÜ) ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô ({raw_yamok} ‚Üí {clean_yamok})")

        # 4) ‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç
        raw_numbers = len(re.findall(r'\d+', raw_text))
        clean_numbers = len(re.findall(r'\d+', cleaned_text))
        if abs(raw_numbers - clean_numbers) > 3:
            issues.append(f"‚ö†Ô∏è ‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏°‡∏≤‡∏Å ({raw_numbers} ‚Üí {clean_numbers})")
        elif abs(raw_numbers - clean_numbers) > 1:
            warnings.append(f"üìù ‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô ({raw_numbers} ‚Üí {clean_numbers})")

        # 5) ‡∏Ñ‡∏≥‡πÅ‡∏ï‡∏Å‡∏´‡∏±‡∏Å
        broken_words_raw = ThaiTextUtils.detect_word_breaks(raw_text)
        broken_words_clean = ThaiTextUtils.detect_word_breaks(cleaned_text)
        if len(broken_words_clean) > len(broken_words_raw) * 0.5:
            warnings.append(f"üìù ‡∏¢‡∏±‡∏á‡∏°‡∏µ‡∏Ñ‡∏≥‡πÅ‡∏ï‡∏Å‡∏´‡∏±‡∏Å: {broken_words_clean[:3]}")

        # Score & Status
        score = 1.0 - 0.3*len(issues) - 0.1*len(warnings)
        score = max(0, min(1, score))
        status = 'PASS'
        if issues:
            status = 'FAIL'
        elif warnings:
            status = 'WARNING'

        return {
            'filename': filename,
            'status': status,
            'score': score,
            'issues': issues,
            'warnings': warnings,
            'stats': {
                'length_change': f"{len_change:+.1f}%",
                'quotes_change': f"{total_raw} ‚Üí {total_clean}",
                'yamok_change': f"{raw_yamok} ‚Üí {clean_yamok}",
                'numbers_change': f"{raw_numbers} ‚Üí {clean_numbers}",
                'broken_words_remaining': len(broken_words_clean)
            },
            'timestamp': datetime.now().isoformat()
        }

print("‚úÖ Thai Validator ready")

‚úÖ Thai Validator ready


In [9]:
# ============================================
# üìå Block 9: Enhanced OCR Processor with Metadata
# ============================================
class OCRProcessor:
    """Main processor with metadata analysis ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏´‡∏ô‡πâ‡∏≤"""

    def __init__(self, analyze_metadata: bool = True):
        """
        Args:
            analyze_metadata: ‡∏ñ‡πâ‡∏≤ True ‡∏à‡∏∞‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå metadata ‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏´‡∏ô‡πâ‡∏≤ (‡∏°‡∏µ‡∏Ñ‡πà‡∏≤‡πÉ‡∏ä‡πâ‡∏à‡πà‡∏≤‡∏¢‡πÄ‡∏û‡∏¥‡πà‡∏°)
        """
        self.llm = LLMClient()
        self.chunker = EnhancedChunker()
        self.validator = ThaiValidator()
        self.analyze_metadata = analyze_metadata
        self.stats = {
            'processed': 0,
            'failed': 0,
            'validation_pass': 0,
            'validation_warning': 0,
            'validation_fail': 0,
            'multipage_files': 0,
            'single_page_files': 0,
            'metadata_analyzed': 0
        }
        self.training_pairs = []
        self.metadata_collection = []  # ‡πÄ‡∏Å‡πá‡∏ö metadata ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö analysis

    # ---------- Top-level ----------
    def process_file(self, file_path: Path) -> Dict:
        """Process ‡πÑ‡∏ü‡∏•‡πå OCR ‡∏û‡∏£‡πâ‡∏≠‡∏° metadata analysis"""
        print(f"\nüìÑ Processing: {file_path.name}")
        if self.analyze_metadata:
            print("   üîç Metadata analysis: ENABLED")
        try:
            raw_content = file_path.read_text(encoding='utf-8')
            if "--- PAGE:" in raw_content:
                return self._process_multipage_file(file_path, raw_content)
            else:
                normalized_content = ThaiTextUtils.normalize_unicode(raw_content)
                return self._process_single_page_file(file_path, normalized_content)
        except Exception as e:
            print(f"   ‚ùå Error: {e}")
            self.stats['failed'] += 1
            return {'success': False, 'error': str(e)}

    # ---------- Multi-page with Metadata ----------
    def _process_multipage_file(self, file_path: Path, content: str) -> Dict:
        """Process multi-page file ‡∏û‡∏£‡πâ‡∏≠‡∏° metadata analysis"""
        print("   üìñ Multi-page file detected")
        metadata = MultiPageParser.extract_metadata(content)
        pages = MultiPageParser.parse_multipage_file(content)
        print(f"   üìä Found {len(pages)} pages")
        if metadata:
            print(f"   üìò Book: {metadata.get('book_title', 'Unknown')}")
            print(f"   üßæ Chapter: {metadata.get('chapter', 'Unknown')}")

        total_chars = sum(len(p['raw_text']) for p in pages)
        estimated_tokens = total_chars * 0.75
        print(f"   üìè Total content: {total_chars:,} chars (~{estimated_tokens:,.0f} tokens)")

        # Process cleaning
        if len(pages) <= Config.MAX_PAGES_PER_BATCH:
            print(f"   üöÄ Processing all {len(pages)} pages in one batch...")
            result = self.llm.clean_multipage_ocr(pages, file_path.name)
            cleaned_pages = result['cleaned_pages']
            total_cost = result['cost']
            total_tokens = result['tokens_used']
            processing_time = result.get('processing_time', 0)
        else:
            print(f"   üì¶ Processing in batches (max {Config.MAX_PAGES_PER_BATCH} pages/batch)...")
            cleaned_pages, total_cost, total_tokens, processing_time = [], 0, 0, 0
            for i in range(0, len(pages), Config.MAX_PAGES_PER_BATCH):
                batch = pages[i:i + Config.MAX_PAGES_PER_BATCH]
                batch_num = i // Config.MAX_PAGES_PER_BATCH + 1
                print(f"      Batch {batch_num}: Pages {batch[0]['page_num']}-{batch[-1]['page_num']}")
                result = self.llm.clean_multipage_ocr(batch, f"{file_path.name}_batch_{batch_num}")
                cleaned_pages.extend(result['cleaned_pages'])
                total_cost += result['cost']
                total_tokens += result['tokens_used']
                processing_time += result.get('processing_time', 0)
                if i + Config.MAX_PAGES_PER_BATCH < len(pages):
                    time.sleep(2)

        # Analyze metadata for each page (‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡∏¥‡∏î‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô)
        page_metadata_dict = {}
        metadata_cost = 0
        metadata_tokens = 0

        if self.analyze_metadata:
            print("   üîç Analyzing metadata for each page...")
            for page in cleaned_pages:
                page_num = page['page_num']
                print(f"      Analyzing page {page_num}...")

                # ‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå metadata ‡∏à‡∏≤‡∏Å cleaned text
                page_meta = self.llm.analyze_page_metadata(
                    page_text=page['cleaned_text'],
                    page_num=page_num,
                    book_info=metadata,
                    filename=file_path.name
                )

                page_metadata_dict[page_num] = page_meta
                metadata_cost += page_meta.get('cost', 0)
                metadata_tokens += page_meta.get('tokens_used', 0)
                self.stats['metadata_analyzed'] += 1

                # ‡πÄ‡∏Å‡πá‡∏ö‡πÑ‡∏ß‡πâ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö analysis ‡∏†‡∏≤‡∏¢‡∏´‡∏•‡∏±‡∏á
                self.metadata_collection.append({
                    'file': file_path.name,
                    'page': page_num,
                    'metadata': page_meta
                })

                # ‡∏´‡∏ô‡πà‡∏ß‡∏á‡πÄ‡∏ß‡∏•‡∏≤‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÑ‡∏°‡πà‡πÉ‡∏´‡πâ API rate limit
                time.sleep(0.5)

            print(f"   ‚úÖ Metadata analysis complete: {metadata_tokens:,} tokens, ${metadata_cost:.4f}")
            total_cost += metadata_cost
            total_tokens += metadata_tokens

        # Validation
        validation_results = []
        for page in cleaned_pages:
            if page.get('raw_text') and page.get('cleaned_text'):
                val = self.validator.validate_thai_text(
                    page['raw_text'], page['cleaned_text'], f"{file_path.name}_p{page['page_num']}"
                )
                validation_results.append(val)

        validation_summary = self._summarize_validation(validation_results)

        # ---------- ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå output ----------
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        clean_filename = f"{file_path.stem}_clean_{timestamp}.txt"
        clean_path = Path(Config.CLEANED_DIR) / clean_filename

        output_content = []
        if metadata:
            if 'book_title' in metadata:
                output_content.append(f"### üìò ‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠ (Book Title): {metadata['book_title']}")
            if 'chapter' in metadata:
                output_content.append(f"### üßæ Chapter: {metadata['chapter']}")
            if 'sub_chapter' in metadata:
                output_content.append(f"### üîñ Sub-Chapter: {metadata['sub_chapter']}")
            output_content.append(f"### üìÇ Format: CLEANED (v3.0)")
            if 'purpose' in metadata:
                output_content.append(f"### üß† Purpose: {metadata['purpose']}")
        output_content.append(f"### ‚öôÔ∏è Processing: {datetime.now().isoformat()}")
        output_content.append(f"### üìä Stats: {len(cleaned_pages)} pages, {total_tokens:,} tokens, ${total_cost:.4f}")
        output_content.append(f"### ‚úÖ Validation: {validation_summary}")
        output_content.append("")

        # ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏´‡∏ô‡πâ‡∏≤‡∏û‡∏£‡πâ‡∏≠‡∏° metadata
        for page in cleaned_pages:
            page_num = page['page_num']
            output_content.append(f"--- PAGE: {page_num} ---")

            # ‡πÄ‡∏û‡∏¥‡πà‡∏° metadata ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ
            if page_num in page_metadata_dict:
                meta = page_metadata_dict[page_num]
                output_content.append(f"### üìò Book: {metadata.get('book_title', '')}")
                output_content.append(f"### üßæ Chapter: {metadata.get('chapter', '')}")
                output_content.append(f"### üìÑ Page: {page_num}")
                output_content.append(f"üó£Ô∏è Tone: {', '.join(meta.get('tone', ['unknown']))}")
                output_content.append(f"üè∑Ô∏è Tags: {', '.join(meta.get('tags', ['general']))}")
                output_content.append(f"üë• Characters: {', '.join(meta.get('characters', []))}")
                output_content.append(f"üìç Places: {', '.join(meta.get('places', []))}")
                output_content.append(f"üî∏ Objects: {', '.join(meta.get('objects', []))}")
                output_content.append(f"üí¨ Dialogue Pairs: {meta.get('dialogue_pairs', 0)}")
                output_content.append(
                    f"üìä Stats: Chars‚âà{meta.get('char_count', 0)} | Words‚âà{meta.get('word_count', 0)} | Paragraphs‚âà{meta.get('paragraph_count', 0)}"
                )
                output_content.append(f"‚úèÔ∏è Style Notes: {meta.get('style_notes', '')}")
                output_content.append(f"üìù One-line Summary: {meta.get('summary', '')}")
                if meta.get('anomalies'):
                    output_content.append(f"‚ö†Ô∏è Anomalies: {meta.get('anomalies')}")
                output_content.append(f"Confidence: {meta.get('confidence', 0.0)}")

            output_content.append("--- RAW ---")
            output_content.append(page['raw_text'])
            output_content.append("--- CLEANED ---")
            output_content.append(page['cleaned_text'])
            output_content.append("")

        clean_path.write_text('\n'.join(output_content), encoding='utf-8')

        # ‡πÄ‡∏Å‡πá‡∏ö training pairs (‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ó‡∏µ‡πà validation ‡πÑ‡∏°‡πà FAIL)
        for page, val in zip(cleaned_pages, validation_results):
            if page.get('raw_text') and page.get('cleaned_text') and val['status'] != 'FAIL':
                self.training_pairs.append({
                    'input': page['raw_text'][:1000],
                    'output': page['cleaned_text'][:1000],
                    'source': f"{file_path.name}_page_{page['page_num']}",
                    'timestamp': timestamp,
                    'validation_score': val['score'],
                    'metadata': metadata,
                    'page_metadata': page_metadata_dict.get(page['page_num'], {})
                })

        # Update stats
        self.stats['processed'] += 1
        self.stats['multipage_files'] += 1
        for val in validation_results:
            self.stats[f"validation_{val['status'].lower()}"] += 1

        print(f"   ‚úÖ Saved: {clean_filename}")
        print(f"   üìÑ Pages: {len(cleaned_pages)}")
        print(f"   üî§ Tokens: {total_tokens:,}")
        print(f"   ‚è± Time: {processing_time:.1f}s")
        print(f"   üí∞ Cost: ${total_cost:.4f} (~{total_cost*35:.2f} ‡∏ö‡∏≤‡∏ó)")
        print(f"   üìã Validation: {validation_summary}")
        if self.analyze_metadata:
            print(f"   üîç Metadata analyzed: {len(page_metadata_dict)} pages")

        return {
            'success': True,
            'cleaned_path': str(clean_path),
            'pages_count': len(cleaned_pages),
            'tokens': total_tokens,
            'cost': total_cost,
            'processing_time': processing_time,
            'validation_summary': validation_summary,
            'metadata': metadata,
            'page_metadata': page_metadata_dict if self.analyze_metadata else {}
        }

    # ---------- Single-page with Metadata ----------
    def _process_single_page_file(self, file_path: Path, content: str) -> Dict:
        """Process single page file ‡∏û‡∏£‡πâ‡∏≠‡∏° metadata analysis"""
        print("   üìÑ Single page file")

        # Clean OCR
        if len(content) > 3000:
            chunks = self.chunker.smart_chunk_text(content, 2500)
            cleaned_chunks, total_cost, total_tokens, processing_time = [], 0, 0, 0
            print(f"   üì¶ Split into {len(chunks)} smart chunks")
            for i, chunk in enumerate(chunks, 1):
                print(f"      Chunk {i}/{len(chunks)}...")
                result = self.llm.clean_ocr_text(chunk, filename=f"{file_path.name}_chunk_{i}")
                cleaned_chunks.append(result['cleaned_text'])
                total_cost += result.get('cost', 0)
                total_tokens += result.get('tokens_used', 0)
                processing_time += result.get('processing_time', 0)
                time.sleep(1)
            cleaned_text = self.chunker.merge_chunks_with_dedup(cleaned_chunks)
        else:
            result = self.llm.clean_ocr_text(content, filename=file_path.name)
            cleaned_text = result['cleaned_text']
            total_cost = result['cost']
            total_tokens = result['tokens_used']
            processing_time = result.get('processing_time', 0)

        # Analyze metadata ‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡∏¥‡∏î‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô
        page_metadata = {}
        if self.analyze_metadata:
            print("   üîç Analyzing metadata...")
            page_metadata = self.llm.analyze_page_metadata(
                page_text=cleaned_text,
                page_num=1,
                book_info={'book_title': file_path.stem},
                filename=file_path.name
            )
            total_cost += page_metadata.get('cost', 0)
            total_tokens += page_metadata.get('tokens_used', 0)
            self.stats['metadata_analyzed'] += 1

        # Validation
        validation_result = self.validator.validate_thai_text(content, cleaned_text, file_path.name)

        # Create output file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        clean_filename = (f"WARNING_{file_path.stem}_clean_{timestamp}.txt"
                          if validation_result['status'] == 'FAIL'
                          else f"{file_path.stem}_clean_{timestamp}.txt")
        clean_path = Path(Config.CLEANED_DIR) / clean_filename

        # Build output content
        output_lines = []
        # Header
        output_lines.append(f"### üìò File: {file_path.name}")
        output_lines.append(f"### üìÇ Format: CLEANED (v3.0)")
        output_lines.append(f"### ‚öôÔ∏è Processing: {datetime.now().isoformat()}")
        output_lines.append(f"### üìä Stats: {total_tokens:,} tokens, ${total_cost:.4f}")
        output_lines.append(f"### ‚úÖ Validation: {validation_result['status']} (score: {validation_result['score']:.2f})")
        output_lines.append("")

        # Metadata ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ
        if page_metadata:
            output_lines.append("--- METADATA ---")
            output_lines.append(f"üó£Ô∏è Tone: {', '.join(page_metadata.get('tone', ['unknown']))}")
            output_lines.append(f"üè∑Ô∏è Tags: {', '.join(page_metadata.get('tags', ['general']))}")
            output_lines.append(f"üë• Characters: {', '.join(page_metadata.get('characters', []))}")
            output_lines.append(f"üìç Places: {', '.join(page_metadata.get('places', []))}")
            output_lines.append(f"üî∏ Objects: {', '.join(page_metadata.get('objects', []))}")
            output_lines.append(f"üí¨ Dialogue Pairs: {page_metadata.get('dialogue_pairs', 0)}")
            output_lines.append(
                f"üìä Stats: Chars‚âà{page_metadata.get('char_count', 0)} | Words‚âà{page_metadata.get('word_count', 0)} | Paragraphs‚âà{page_metadata.get('paragraph_count', 0)}"
            )
            output_lines.append(f"‚úèÔ∏è Style Notes: {page_metadata.get('style_notes', '')}")
            output_lines.append(f"üìù One-line Summary: {page_metadata.get('summary', '')}")
            if page_metadata.get('anomalies'):
                output_lines.append(f"‚ö†Ô∏è Anomalies: {page_metadata.get('anomalies')}")
            output_lines.append(f"Confidence: {page_metadata.get('confidence', 0.0)}")
            output_lines.append("")

        # Content
        output_lines.append("--- CLEANED ---")
        output_lines.append(cleaned_text)

        clean_path.write_text('\n'.join(output_lines), encoding='utf-8')

        # Save training pair
        if validation_result['status'] != 'FAIL':
            self.training_pairs.append({
                'input': content[:1000],
                'output': cleaned_text[:1000],
                'source': file_path.name,
                'timestamp': timestamp,
                'validation_score': validation_result['score'],
                'metadata': page_metadata if self.analyze_metadata else {}
            })

        # Update stats
        self.stats['processed'] += 1
        self.stats['single_page_files'] += 1
        self.stats[f"validation_{validation_result['status'].lower()}"] += 1

        print(f"   ‚úÖ Saved: {clean_filename}")
        print(f"   üìã Validation: {validation_result['status']} (score: {validation_result['score']:.2f})")
        print(f"   üí∞ Cost: ${total_cost:.4f}")
        if self.analyze_metadata:
            print(f"   üîç Metadata analyzed: 1 page")

        return {
            'success': True,
            'cleaned_path': str(clean_path),
            'tokens': total_tokens,
            'cost': total_cost,
            'processing_time': processing_time,
            'validation_result': validation_result,
            'metadata': page_metadata if self.analyze_metadata else {}
        }

    # ---------- Helper Methods ----------
    def _summarize_validation(self, validation_results: List[Dict]) -> str:
        """‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£ validation"""
        if not validation_results:
            return "No validation data"
        pass_count = sum(1 for v in validation_results if v['status'] == 'PASS')
        warning_count = sum(1 for v in validation_results if v['status'] == 'WARNING')
        fail_count = sum(1 for v in validation_results if v['status'] == 'FAIL')
        avg_score = sum(v['score'] for v in validation_results) / len(validation_results)
        return f"‚úÖ{pass_count} ‚ö†Ô∏è{warning_count} ‚ùå{fail_count} (avg: {avg_score:.2f})"

    # ---------- Batch Processing ----------
    def process_batch(self, file_pattern: str = "*.txt", limit: int = None, analyze_metadata: bool = None) -> Dict:
        """
        Process ‡∏´‡∏•‡∏≤‡∏¢‡πÑ‡∏ü‡∏•‡πå‡∏à‡∏≤‡∏Å RAW_OCR_DIR

        Args:
            file_pattern: ‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•
            limit: ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î
            analyze_metadata: override ‡∏Å‡∏≤‡∏£‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤ metadata analysis
        """
        if analyze_metadata is not None:
            self.analyze_metadata = analyze_metadata

        raw_dir = Path(Config.RAW_OCR_DIR)
        files = sorted(list(raw_dir.glob(file_pattern)))
        if limit:
            files = files[:limit]

        print(f"\nüß∫ Batch: {len(files)} files found (pattern='{file_pattern}')")
        if self.analyze_metadata:
            print("   üîç Metadata analysis: ENABLED")
        else:
            print("   ‚ö° Metadata analysis: DISABLED (faster, cheaper)")

        results = []
        for i, f in enumerate(files, 1):
            print(f"\n[{i}/{len(files)}] {f.name}")
            res = self.process_file(f)
            results.append({'file': f.name, **res})

        print("\nüèÅ Batch done.")
        return {
            'count': len(results),
            'results': results,
            'stats': self.stats
        }

    # ---------- Export Functions ----------
    def export_training_pairs(self, out_name: str = None) -> str:
        """‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å training pairs ‡πÄ‡∏õ‡πá‡∏ô JSONL"""
        if not self.training_pairs:
            print("‚ö†Ô∏è ‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏°‡∏µ training_pairs ‡πÉ‡∏´‡πâ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å")
            return ""
        out_dir = Path(Config.TRAINING_PAIRS_DIR)
        out_dir.mkdir(parents=True, exist_ok=True)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_name = out_name or f"training_pairs_{stamp}.jsonl"
        out_path = out_dir / out_name
        with open(out_path, 'w', encoding='utf-8') as f:
            for row in self.training_pairs:
                f.write(json.dumps(row, ensure_ascii=False) + "\n")
        print(f"‚úÖ Exported training pairs: {out_path.name} ({len(self.training_pairs)} rows)")
        return str(out_path)

    def export_metadata_analysis(self, out_name: str = None) -> str:
        """‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å metadata analysis ‡πÄ‡∏õ‡πá‡∏ô JSON"""
        if not self.metadata_collection:
            print("‚ö†Ô∏è ‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏°‡∏µ metadata ‡πÉ‡∏´‡πâ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å")
            return ""
        out_dir = Path(Config.CORPUS_DIR)
        out_dir.mkdir(parents=True, exist_ok=True)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_name = out_name or f"metadata_analysis_{stamp}.json"
        out_path = out_dir / out_name
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(self.metadata_collection, f, ensure_ascii=False, indent=2)
        print(f"‚úÖ Exported metadata analysis: {out_path.name} ({len(self.metadata_collection)} pages)")
        return str(out_path)

    def get_usage_summary(self) -> Dict:
        """‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô"""
        return self.llm.get_usage_summary()

print("‚úÖ Enhanced OCR Processor with Metadata ready")

‚úÖ Enhanced OCR Processor with Metadata ready


In [10]:
# ============================================
# üìå Block 10: Corpus Builder
# ============================================
class CorpusBuilder:
    """
    ‡∏£‡∏ß‡∏°‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô CLEANED_DIR ‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏≠‡∏£‡πå‡∏õ‡∏±‡∏™‡πÄ‡∏î‡∏µ‡∏¢‡∏ß
    - ‡∏î‡∏∂‡∏á‡πÄ‡∏°‡∏ó‡∏≤‡∏î‡∏≤‡∏ï‡πâ‡∏≤‡∏à‡∏≤‡∏Å‡∏´‡∏±‡∏ß‡πÑ‡∏ü‡∏•‡πå (### ... )
    - ‡πÄ‡∏Å‡πá‡∏ö‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤ CLEANED ‡∏ï‡πà‡∏≠‡∏´‡∏ô‡πâ‡∏≤
    - ‡πÅ‡∏õ‡∏•‡∏á‡πÄ‡∏õ‡πá‡∏ô JSONL/CSV ‡∏û‡∏£‡πâ‡∏≠‡∏°‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥‡∏û‡∏∑‡πâ‡∏ô‡∏ê‡∏≤‡∏ô
    """

    def __init__(self):
        self.cleaned_dir = Path(Config.CLEANED_DIR)
        self.corpus_dir = Path(Config.CORPUS_DIR)
        self.corpus_dir.mkdir(parents=True, exist_ok=True)

    def _parse_cleaned_file(self, path: Path) -> Dict:
        """‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå CLEANED ‡∏´‡∏ô‡∏∂‡πà‡∏á‡πÑ‡∏ü‡∏•‡πå ‡πÅ‡∏•‡πâ‡∏ß‡∏Ñ‡∏∑‡∏ô metadata + pages"""
        text = path.read_text(encoding='utf-8', errors='ignore')
        meta = MultiPageParser.extract_metadata(text)

        # ‡πÅ‡∏¢‡∏Å‡∏´‡∏ô‡πâ‡∏≤ (‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏≤‡∏¢‡∏´‡∏ô‡πâ‡∏≤‡πÅ‡∏ö‡∏ö‡∏°‡∏µ --- PAGE: ... ---)
        pages = MultiPageParser.parse_multipage_file(text)
        if not pages:
            # ‡πÄ‡∏õ‡πá‡∏ô single text: ‡πÉ‡∏ä‡πâ‡∏ó‡∏±‡πâ‡∏á‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏õ‡πá‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡πÄ‡∏î‡∏µ‡∏¢‡∏ß
            pages = [{
                'page_num': 1,
                'raw_text': "",
                'cleaned_text': ThaiTextUtils.normalize_unicode(text)
            }]

        # ‡∏£‡∏ß‡∏°‡πÄ‡∏â‡∏û‡∏≤‡∏∞ CLEANED
        all_cleaned = []
        for p in pages:
            ct = p.get('cleaned_text', '')
            if ct:
                all_cleaned.append(ct)

        return {
            'file': path.name,
            'meta': meta,
            'pages': pages,
            'cleaned_joined': "\n".join(all_cleaned).strip()
        }

    def build(self, pattern: str = "*.txt", out_stem: str = None) -> Dict:
        """‡∏£‡∏ß‡∏°‡∏ó‡∏∏‡∏Å‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô CLEANED_DIR ‡∏ó‡∏µ‡πà‡∏ï‡∏£‡∏á pattern ‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏≠‡∏£‡πå‡∏õ‡∏±‡∏™"""
        files = sorted(self.cleaned_dir.glob(pattern))
        if not files:
            print("‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô CLEANED_DIR")
            return {'count': 0, 'jsonl': '', 'csv': ''}

        data_rows = []
        jsonl_rows = []
        total_chars = 0
        total_pages = 0

        for f in files:
            parsed = self._parse_cleaned_file(f)
            book = parsed['meta'].get('book_title', '')
            chapter = parsed['meta'].get('chapter', '')
            sub = parsed['meta'].get('sub_chapter', '')
            text_joined = parsed['cleaned_joined']
            n_pages = len(parsed['pages'])

            total_pages += n_pages
            total_chars += len(text_joined)

            # ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö JSONL (‡∏´‡∏ô‡∏∂‡πà‡∏á‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏ô‡∏∂‡πà‡∏á‡πÅ‡∏ñ‡∏ß)
            jsonl_rows.append({
                'source_file': parsed['file'],
                'book_title': book,
                'chapter': chapter,
                'sub_chapter': sub,
                'pages': n_pages,
                'text': text_joined
            })

            # ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö CSV (‡πÄ‡∏Å‡πá‡∏ö‡∏™‡∏±‡πâ‡∏ô ‡πÜ)
            data_rows.append({
                'source_file': parsed['file'],
                'book_title': book,
                'chapter': chapter,
                'sub_chapter': sub,
                'pages': n_pages,
                'chars': len(text_joined)
            })

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_stem = out_stem or f"corpus_{stamp}"
        jsonl_path = self.corpus_dir / f"{out_stem}.jsonl"
        csv_path = self.corpus_dir / f"{out_stem}.csv"
        stats_path = self.corpus_dir / f"{out_stem}_stats.json"

        # ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô JSONL
        with open(jsonl_path, "w", encoding="utf-8") as jf:
            for row in jsonl_rows:
                jf.write(json.dumps(row, ensure_ascii=False) + "\n")

        # ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô CSV
        pd.DataFrame(data_rows).to_csv(csv_path, index=False, encoding='utf-8')

        # ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥
        stats = {
            'files': len(files),
            'pages': total_pages,
            'chars': total_chars,
            'avg_chars_per_file': (total_chars / len(files)) if files else 0,
            'avg_pages_per_file': (total_pages / len(files)) if files else 0,
            'created_at': datetime.now().isoformat()
        }
        with open(stats_path, "w", encoding="utf-8") as sf:
            json.dump(stats, sf, ensure_ascii=False, indent=2)

        print(f"‚úÖ Corpus JSONL: {jsonl_path.name}")
        print(f"‚úÖ Corpus CSV:   {csv_path.name}")
        print(f"üìà Stats:        {stats_path.name}")
        print(f"üì¶ Files: {stats['files']}, Pages: {stats['pages']}, Chars: {stats['chars']:,}")

        return {
            'count': len(files),
            'jsonl': str(jsonl_path),
            'csv': str(csv_path),
            'stats': stats
        }

print("‚úÖ Corpus Builder ready")

‚úÖ Corpus Builder ready


In [11]:
# ============================================
# üìå Block 11: Quick Start Functions
# ============================================

def quick_setup():
    """Enhanced setup with validation"""
    print("\nüîß Enhanced Quick Setup v3.0")
    print("=" * 50)

    # 1) ‡∏ï‡∏£‡∏ß‡∏à API Keys
    if not Config.OPENAI_API_KEY:
        print("‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö OpenAI API key")
        print(f"   ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå: {Config.BASE}/openai.env ‡∏´‡∏£‡∏∑‡∏≠ openai.env.txt")
        print("   ‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤: sk-xxxxxxxxxxxxxxxxxxxxxxxx")
    else:
        print("‚úÖ OpenAI API key: OK")

    # 2) ‡∏ï‡∏£‡∏ß‡∏à‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå
    print("\nüìÅ Directories")
    for p in [Config.RAW_OCR_DIR, Config.CLEANED_DIR, Config.CORPUS_DIR, Config.TRAINING_PAIRS_DIR, Config.LOGS_DIR]:
        Path(p).mkdir(parents=True, exist_ok=True)
        print(f"   - {p} ‚úîÔ∏è")

    # 3) ‡∏™‡∏£‡∏∏‡∏õ‡∏Ñ‡∏≠‡∏ô‡∏ü‡∏¥‡∏Å
    print("\n‚öôÔ∏è Config")
    print(f"   Model: {Config.MODEL}")
    print(f"   Max pages per batch: {Config.MAX_PAGES_PER_BATCH}")
    print(f"   Temperature: {Config.TEMPERATURE}")
    print(f"   Context overlap: {Config.CONTEXT_OVERLAP}")
    print("\n‚úÖ Setup complete.")


def quick_process_sample(filename: str):
    """
    ‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏à‡∏≤‡∏Å RAW_OCR_DIR ‡πÇ‡∏î‡∏¢‡∏ä‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏µ‡πà‡∏£‡∏∞‡∏ö‡∏∏
    """
    path = Path(Config.RAW_OCR_DIR) / filename
    if not path.exists():
        print(f"‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå: {path}")
        return
    proc = OCRProcessor()
    result = proc.process_file(path)
    print("\nüì¶ Result (single):")
    print(json.dumps({k: v for k, v in result.items() if k != 'metadata'}, ensure_ascii=False, indent=2))
    return result


def quick_batch(pattern: str = "*.txt", limit: int = None):
    """
    ‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡∏´‡∏•‡∏≤‡∏¢‡πÑ‡∏ü‡∏•‡πå‡∏à‡∏≤‡∏Å RAW_OCR_DIR ‡∏î‡πâ‡∏ß‡∏¢ pattern ‡∏ó‡∏µ‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î
    """
    proc = OCRProcessor()
    summary = proc.process_batch(file_pattern=pattern, limit=limit)
    # export training pairs (‡∏ñ‡πâ‡∏≤‡∏°‡∏µ)
    out_pairs = proc.export_training_pairs()
    print("\nüßæ Usage summary (session):")
    print(json.dumps(proc.get_usage_summary(), ensure_ascii=False, indent=2))
    return {'batch': summary, 'training_pairs': out_pairs}


def quick_build_corpus(pattern: str = "*.txt", out_stem: str = None):
    """
    ‡∏£‡∏ß‡∏°‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô CLEANED_DIR ‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏≠‡∏£‡πå‡∏õ‡∏±‡∏™ JSONL/CSV
    """
    builder = CorpusBuilder()
    return builder.build(pattern=pattern, out_stem=out_stem)


def show_usage_log_summary():
    """
    ‡πÅ‡∏™‡∏î‡∏á‡∏™‡∏£‡∏∏‡∏õ usage ‡∏à‡∏≤‡∏Å‡πÑ‡∏ü‡∏•‡πå logs/usage.csv ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
    """
    logger = UsageLogger()
    s = logger.get_summary()
    print("\nüìä Usage Log Summary (all runs)")
    print(json.dumps(s, ensure_ascii=False, indent=2))
    return s

print("‚úÖ Quick Start Functions ready")

‚úÖ Quick Start Functions ready


In [None]:
# ============================================
# üìå Block 12: Main / CLI Runner (Colab-safe, with guards)
# ============================================
import argparse
from pathlib import Path

# ---------- Helpers ----------
def _fallback_usage_summary():
    """‡∏™‡∏£‡∏∏‡∏õ usage ‡πÅ‡∏ö‡∏ö fallback ‡πÄ‡∏°‡∏∑‡πà‡∏≠ UsageLogger ‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏û‡∏£‡πâ‡∏≠‡∏°"""
    # ‡πÄ‡∏î‡∏≤ path ‡∏ó‡∏±‡πâ‡∏á 2 ‡πÇ‡∏´‡∏°‡∏î (Colab / Local)
    candidates = [
        Path("/content/drive/MyDrive/OCR/logs/usage.csv"),
        Path("./OCR/logs/usage.csv"),
    ]
    log_file = next((p for p in candidates if p.exists()), None)

    print("\nüìí Usage Log Summary (All time) [fallback]")
    print("=" * 50)
    if not log_file:
        print("‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå usage.csv ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå logs")
        return
    try:
        import pandas as pd
        df = pd.read_csv(log_file, encoding="utf-8")
        if df.empty:
            print("‚ÑπÔ∏è usage.csv ‡∏ß‡πà‡∏≤‡∏á‡πÄ‡∏õ‡∏•‡πà‡∏≤")
            return
        total_files = len(df)
        total_pages = df["pages_count"].sum() if "pages_count" in df.columns else 0
        total_tokens = df["total_tokens"].sum() if "total_tokens" in df.columns else 0
        total_cost_usd = df["cost_usd"].sum() if "cost_usd" in df.columns else 0.0
        total_cost_thb = df["cost_thb"].sum() if "cost_thb" in df.columns else total_cost_usd * 35
        avg_cost_per_page = (total_cost_usd / total_pages) if total_pages else 0

        print(f"Total files         : {total_files}")
        print(f"Total pages         : {total_pages}")
        print(f"Total tokens        : {total_tokens:,}")
        print(f"Total cost (USD)    : ${total_cost_usd:.4f}")
        print(f"Total cost (THB)    : ~{total_cost_thb:.2f}")
        print(f"Avg cost per page   : ${avg_cost_per_page:.4f}")
    except Exception as e:
        print(f"‚ö†Ô∏è ‡∏≠‡πà‡∏≤‡∏ô usage.csv ‡πÑ‡∏°‡πà‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à: {e}")


def show_usage_log_summary():
    """‡πÅ‡∏™‡∏î‡∏á‡∏™‡∏£‡∏∏‡∏õ usage ‡πÇ‡∏î‡∏¢‡∏û‡∏¢‡∏≤‡∏¢‡∏≤‡∏°‡πÉ‡∏ä‡πâ UsageLogger ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ ‡πÑ‡∏°‡πà‡∏á‡∏±‡πâ‡∏ô fallback"""
    if "UsageLogger" in globals():
        print("\nüìí Usage Log Summary (All time)")
        print("=" * 50)
        ul = UsageLogger()
        s = ul.get_summary()
        print(f"Total files         : {s.get('total_files',0)}")
        print(f"Total pages         : {s.get('total_pages',0)}")
        print(f"Total tokens        : {s.get('total_tokens',0):,}")
        print(f"Input tokens        : {s.get('input_tokens',0):,}")
        print(f"Output tokens       : {s.get('output_tokens',0):,}")
        print(f"Total cost (USD)    : ${s.get('total_cost_usd',0):.4f}")
        print(f"Total cost (THB)    : ~{s.get('total_cost_thb',0):.2f}")
        print(f"Avg cost per page   : ${s.get('avg_cost_per_page',0):.4f}")
        print(f"Most used model     : {s.get('most_used_model','N/A')}")
        print(f"Validation stats    : {s.get('validation_stats',{})}")
    else:
        _fallback_usage_summary()


def quick_sample(sample_name: str):
    print("\n‚ñ∂Ô∏è Quick Sample Mode")
    print("=" * 50)
    if not sample_name:
        print("‚ö†Ô∏è ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏£‡∏∞‡∏ö‡∏∏ --sample ‡∏ä‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå ‡πÄ‡∏ä‡πà‡∏ô --sample sample.txt")
        return
    if "Config" not in globals() or "OCRProcessor" not in globals():
        print("‚ö†Ô∏è ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏£‡∏±‡∏ô‡∏ö‡∏•‡πá‡∏≠‡∏Å 1‚Äì11 ‡πÉ‡∏´‡πâ‡∏Ñ‡∏£‡∏ö‡∏Å‡πà‡∏≠‡∏ô (‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ Config, OCRProcessor)")
        return
    file_path = Path(Config.RAW_OCR_DIR) / sample_name
    if not file_path.exists():
        print(f"‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå: {file_path}")
        return
    proc = OCRProcessor()
    result = proc.process_file(file_path)
    if result.get('success'):
        print(f"\n‚úÖ ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô: {result['cleaned_path']}")
    else:
        print(f"\n‚ùå ‡∏•‡πâ‡∏°‡πÄ‡∏´‡∏•‡∏ß: {result.get('error','unknown error')}")


def quick_batch(limit: int = None):
    print("\nüöÄ Batch Mode")
    print("=" * 50)
    if "Config" not in globals() or "OCRProcessor" not in globals():
        print("‚ö†Ô∏è ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏£‡∏±‡∏ô‡∏ö‡∏•‡πá‡∏≠‡∏Å 1‚Äì11 ‡πÉ‡∏´‡πâ‡∏Ñ‡∏£‡∏ö‡∏Å‡πà‡∏≠‡∏ô (‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ Config, OCRProcessor)")
        return
    raw_dir = Path(Config.RAW_OCR_DIR)
    files = sorted(raw_dir.glob("*.txt"))
    if not files:
        print(f"‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô {raw_dir}")
        return
    if limit is not None:
        files = files[:max(0, int(limit))]
    print(f"üìÅ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå {len(files)} ‡πÑ‡∏ü‡∏•‡πå")
    proc = OCRProcessor()
    for i, f in enumerate(files, 1):
        print(f"\n[{i}/{len(files)}] {f.name}")
        _ = proc.process_file(f)
    usage = proc.get_usage_summary()
    print("\nüìä ‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô (Session)")
    print("-" * 50)
    print(f"Total tokens     : {usage['total_tokens']:,}")
    print(f"Total cost (USD) : ${usage['total_cost_usd']:.4f}")
    print(f"Total cost (THB) : ~{usage['total_cost_thb']:.2f}")
    print(f"Files processed  : {usage['session_files']}")
    print(f"Pages processed  : {usage['session_pages']}")
    print(f"Avg cost/page    : ${usage['avg_cost_per_page']:.4f}")


def quick_build_corpus(merge_name: str = None):
    print("\nüß± Build Corpus")
    print("=" * 50)
    if "Config" not in globals():
        print("‚ö†Ô∏è ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏£‡∏±‡∏ô‡∏ö‡∏•‡πá‡∏≠‡∏Å 1‚Äì11 ‡πÉ‡∏´‡πâ‡∏Ñ‡∏£‡∏ö‡∏Å‡πà‡∏≠‡∏ô (‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ Config)")
        return
    cleaned_dir = Path(Config.CLEANED_DIR)
    out_dir = Path(Config.CORPUS_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    files = sorted(cleaned_dir.glob("*.txt"))
    if not files:
        print(f"‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô {cleaned_dir}")
        return
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    corpus_name = merge_name.strip() if merge_name else f"corpus_{ts}.txt"
    out_path = out_dir / corpus_name
    print(f"üìÅ ‡∏£‡∏ß‡∏°‡πÑ‡∏ü‡∏•‡πå‡∏à‡∏≤‡∏Å: {cleaned_dir}")
    print(f"üìù ‡∏õ‡∏•‡∏≤‡∏¢‡∏ó‡∏≤‡∏á: {out_path}")
    with out_path.open("w", encoding="utf-8") as w:
        for f in files:
            try:
                txt = f.read_text(encoding="utf-8")
                w.write(f"\n\n===== FILE: {f.name} =====\n\n")
                w.write(txt)
            except Exception as e:
                print(f"‚ö†Ô∏è ‡∏Ç‡πâ‡∏≤‡∏°‡πÑ‡∏ü‡∏•‡πå {f.name}: {e}")
    print(f"‚úÖ ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô: {out_path}")


# ---------- CLI ----------
parser = argparse.ArgumentParser(description="Enhanced Thai Novel OCR Processor - Main Runner")
parser.add_argument("--sample", type=str, help="‡∏ä‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô RAW_OCR_DIR ‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏• (‡πÄ‡∏ä‡πà‡∏ô sample.txt)")
parser.add_argument("--batch", action="store_true", help="‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡πÉ‡∏ô RAW_OCR_DIR")
parser.add_argument("--limit", type=int, default=None, help="‡∏à‡∏≥‡∏Å‡∏±‡∏î‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô batch")
parser.add_argument("--build-corpus", action="store_true", help="‡∏£‡∏ß‡∏°‡πÑ‡∏ü‡∏•‡πå CLEANED ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡πÄ‡∏õ‡πá‡∏ô corpus")
parser.add_argument("--corpus-name", type=str, default=None, help="‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå corpus ‡πÄ‡∏≠‡∏á (‡πÄ‡∏ä‡πà‡∏ô my_corpus.txt)")
parser.add_argument("--summary", action="store_true", help="‡πÅ‡∏™‡∏î‡∏á‡∏™‡∏£‡∏∏‡∏õ usage log ‡∏™‡∏∞‡∏™‡∏°‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î")

# ‡πÉ‡∏ä‡πâ parse_known_args ‡∏Å‡∏±‡∏ô -f kernel.json ‡πÉ‡∏ô Colab
args, _ = parser.parse_known_args()

def _main():
    ran_any = False
    if args.sample:
        ran_any = True
        quick_sample(args.sample)
    if args.batch:
        ran_any = True
        print("\nüì¶ ‡πÇ‡∏´‡∏°‡∏î Batch")
        quick_batch(limit=args.limit)
    if args.build_corpus:
        ran_any = True
        print("\nüìö ‡∏£‡∏ß‡∏° Corpus")
        quick_build_corpus(merge_name=args.corpus_name)
    if args.summary or not ran_any:
        if not ran_any:
            print("\n‚ÑπÔ∏è ‡πÑ‡∏°‡πà‡∏™‡πà‡∏á argument ‡πÉ‡∏î ‡πÜ ‡∏à‡∏∞‡πÅ‡∏™‡∏î‡∏á‡∏™‡∏£‡∏∏‡∏õ‡πÉ‡∏´‡πâ‡∏Å‡πà‡∏≠‡∏ô")
        print("\nüìà ‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô")
        show_usage_log_summary()

if __name__ == "__main__":
    _main()


‚ÑπÔ∏è ‡πÑ‡∏°‡πà‡∏™‡πà‡∏á argument ‡πÉ‡∏î ‡πÜ ‡∏à‡∏∞‡πÅ‡∏™‡∏î‡∏á‡∏™‡∏£‡∏∏‡∏õ‡πÉ‡∏´‡πâ‡∏Å‡πà‡∏≠‡∏ô

üìà ‡∏™‡∏£‡∏∏‡∏õ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô

üìí Usage Log Summary (All time)
Total files         : 0
Total pages         : 0
Total tokens        : 0
Input tokens        : 0
Output tokens       : 0
Total cost (USD)    : $0.0000
Total cost (THB)    : ~0.00
Avg cost per page   : $0.0000
Most used model     : N/A
Validation stats    : {}


In [12]:
# üöÄ Run me
processor = OCRProcessor(analyze_metadata=True)  # True = ‡∏°‡∏µ metadata, False = ‡πÅ‡∏Ñ‡πà clean
result = processor.process_file(Path(Config.RAW_OCR_DIR) / "Namiya.txt")

print("\nüìä Run Summary:")
print("Success:", result.get('success'))
if result.get('success'):
    print("Output file:", result.get('cleaned_path'))
    print("Cost: $", result.get('cost'))
    print("Tokens used:", result.get('tokens'))

show_usage_log_summary()

‚úÖ OpenAI client ready (Model: gpt-4o-mini)

üìÑ Processing: Namiya.txt
   üîç Metadata analysis: ENABLED
   üìñ Multi-page file detected
   üìä Found 5 pages
   üìò Book: (‡∏õ‡∏≤‡∏è‡∏¥‡∏´‡∏≤‡∏£‡∏¢‡πå‡∏£‡πâ‡∏≤‡∏ô‡∏ä‡∏≥‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏∏‡∏ì‡∏ô‡∏≤‡∏°‡∏¥‡∏¢‡∏∞)
   üßæ Chapter: (‡∏ö‡∏ó‡∏ó‡∏µ‡πà 2 ‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡∏´‡∏µ‡∏ö‡πÄ‡∏û‡∏•‡∏á‡∏õ‡∏≤‡∏Å‡πÉ‡∏ô‡∏¢‡∏≤‡∏°‡∏î‡∏∂‡∏Å‡∏™‡∏á‡∏±‡∏î 2-8)
   üìè Total content: 4,250 chars (~3,188 tokens)
   üöÄ Processing all 5 pages in one batch...
   üîç Analyzing metadata for each page...
      Analyzing page 1...
      Analyzing page 2...
      Analyzing page 3...
      Analyzing page 4...
      Analyzing page 4...
   ‚úÖ Metadata analysis complete: 4,992 tokens, $0.0012
   ‚úÖ Saved: Namiya_clean_20250829_134344.txt
   üìÑ Pages: 5
   üî§ Tokens: 9,301
   ‚è± Time: 32.0s
   üí∞ Cost: $0.0027 (~0.09 ‡∏ö‡∏≤‡∏ó)
   üìã Validation: ‚úÖ0 ‚ö†Ô∏è5 ‚ùå0 (avg: 0.90)
   üîç Metadata analyzed: 4 pages

üìä Run Summary:
Success: True
Output file: /content/d

TypeError: Object of type int64 is not JSON serializable