<a href="https://colab.research.google.com/github/tanatet8/Colab_Script/blob/main/ThaiNovel_OCR_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
================================================================================
                OCR PROCESSING WITH API - AUTOMATED VERSION
                       สำหรับทำ Corpus นิยายไทย
================================================================================

Features:
1. Automated OCR → LLM → Clean corpus
2. ใช้ GPT-4o-mini (ถูกสุด) หรือ Claude Haiku
3. Quality validation & tracking
4. Save training pairs for fine-tuning

Requirements:
- pip install openai anthropic pandas tqdm
- API keys (OpenAI หรือ Anthropic)
================================================================================
"""

# ============================================
# 📌 Block 1: Setup & Import
# ============================================
import os
import re
import json
import time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive when running inside Colab; otherwise fall back to local paths.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except ImportError:
    # google.colab cannot be imported, so this is treated as a non-Colab run.
    IN_COLAB = False
except Exception as mount_error:
    # Still best-effort, but say why Drive is unavailable instead of hiding it.
    # (A bare `except:` would also have swallowed KeyboardInterrupt/SystemExit.)
    print(f"⚠️ Google Drive mount failed: {mount_error}")
    IN_COLAB = False

print("✅ Libraries loaded")

In [None]:
# ============================================
# 📌 Block 2: Configuration & API Setup
# ============================================
class Config:
    """Settings for the API-based OCR cleaning pipeline.

    All values are class attributes and are read directly as ``Config.NAME``.
    """

    # API keys. They default to the matching environment variable so that a key
    # exported in the environment is visible everywhere ``Config`` is checked;
    # replace the expression with a literal string to hard-code a key instead.
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")  # only needed for Claude

    # Model selection (uncomment the one to use)
    MODEL = "gpt-4o-mini"  # cheapest, recommended
    # MODEL = "gpt-3.5-turbo"
    # MODEL = "claude-3-haiku"
    # NOTE(review): Anthropic model IDs are usually date-suffixed
    # (e.g. "claude-3-haiku-20240307") — confirm "claude-3-haiku" is accepted.

    # Paths (Google Drive when in Colab, a local ./OCR folder otherwise)
    BASE = '/content/drive/MyDrive/OCR' if IN_COLAB else './OCR'

    RAW_OCR_DIR = f'{BASE}/raw_ocr'
    CLEANED_DIR = f'{BASE}/cleaned'
    CORPUS_DIR = f'{BASE}/final_corpus'
    TRAINING_PAIRS_DIR = f'{BASE}/training_pairs'
    LOGS_DIR = f'{BASE}/logs'

    # Processing settings
    MAX_PAGES_PER_BATCH = 5  # pages per API call
    MAX_RETRIES = 3  # retries after an API error
    TEMPERATURE = 0.1  # low = consistent output
    MAX_TOKENS = 4000  # max response length

    # Cost tracking: USD per 1K tokens, applied to the total token count.
    # NOTE(review): a single rate per model is used for input and output
    # tokens alike — confirm against the providers' current price lists.
    PRICE_PER_1K_TOKENS = {
        'gpt-4o-mini': 0.00015,  # $0.15 per 1M
        'gpt-3.5-turbo': 0.0005,
        'claude-3-haiku': 0.00025
    }

# Make sure every working folder exists before any file is read or written
for folder in (
    Config.RAW_OCR_DIR,
    Config.CLEANED_DIR,
    Config.CORPUS_DIR,
    Config.TRAINING_PAIRS_DIR,
    Config.LOGS_DIR,
):
    # Creates missing parents too; an existing folder is left untouched
    os.makedirs(folder, exist_ok=True)

print("✅ Config loaded")

In [None]:
# ============================================
# 📌 Block 3: API Clients
# ============================================
class LLMClient:
    """Provider-agnostic LLM client for OpenAI and Anthropic chat models.

    The provider is chosen from ``Config.MODEL``: names containing ``gpt`` go
    to OpenAI and names containing ``claude`` go to Anthropic. Token counts and
    estimated cost are accumulated on the instance across calls.
    """

    def __init__(self):
        """Create the provider client that matches ``Config.MODEL``.

        Raises:
            ValueError: If the model name matches no supported provider, or
                the provider's API key is missing.
            ImportError: If the provider's SDK is not installed.
        """
        self.model = Config.MODEL
        self.client = None
        self.total_tokens = 0
        self.total_cost = 0

        # Pick the provider from the model name
        if 'gpt' in self.model:
            self._init_openai()
        elif 'claude' in self.model:
            self._init_anthropic()
        else:
            # Fail here rather than later with an AttributeError on a None client
            raise ValueError(f"❌ Unsupported model: {self.model}")

    def _init_openai(self):
        """Build the OpenAI client from ``Config`` or ``OPENAI_API_KEY``."""
        try:
            import openai
        except ImportError:
            print("❌ ต้องติดตั้ง: pip install openai")
            raise

        # Config value first, environment variable as the fallback
        api_key = Config.OPENAI_API_KEY or os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("❌ ไม่พบ OpenAI API key! กรุณาใส่ใน Config")

        self.client = openai.OpenAI(api_key=api_key)
        print(f"✅ OpenAI client ready (Model: {self.model})")

    def _init_anthropic(self):
        """Build the Anthropic client from ``Config`` or ``ANTHROPIC_API_KEY``."""
        try:
            import anthropic
        except ImportError:
            print("❌ ต้องติดตั้ง: pip install anthropic")
            raise

        api_key = Config.ANTHROPIC_API_KEY or os.getenv('ANTHROPIC_API_KEY')
        if not api_key:
            raise ValueError("❌ ไม่พบ Anthropic API key!")

        self.client = anthropic.Anthropic(api_key=api_key)
        print(f"✅ Anthropic client ready (Model: {self.model})")

    def clean_ocr_text(self, text: str, page_num: int = 1) -> Dict:
        """Send OCR text to the LLM for correction and record the usage.

        Args:
            text: Raw OCR text to correct.
            page_num: Accepted for caller compatibility; currently unused.

        Returns:
            A dict with keys ``cleaned_text`` (str), ``tokens_used`` (int),
            ``cost`` (float, estimated USD) and ``model`` (str).

        Raises:
            Exception: Whatever the provider SDK raised once all retries
                are exhausted.
        """
        prompt = self._create_prompt(text)

        # Dispatch by provider
        if 'gpt' in self.model:
            result = self._call_openai(prompt)
        else:
            result = self._call_anthropic(prompt)

        # Track cumulative usage
        self.total_tokens += result['tokens_used']
        self.total_cost += result['cost']

        return result

    def _create_prompt(self, text: str) -> str:
        """Build the (Thai) OCR-correction prompt around ``text``."""
        return f"""แก้ไขข้อความ OCR จากนิยายภาษาไทยต่อไปนี้

กฎการแก้ไข:
1. แก้เฉพาะ typo และการสะกดผิด
2. แก้คำที่ขาดหาย/แตกหัก (เช่น "มา กำลัง" → "มากำลัง")
3. ลบตัวอักษรเดี่ยวที่ไม่มีความหมาย
4. รักษารูปแบบบทสนทนา (คำพูดใน "...")
5. ห้ามเพิ่มเนื้อหาใหม่
6. ห้ามเปลี่ยนความหมาย

ข้อความ OCR:
{text}

ข้อความที่แก้แล้ว:"""

    def _call_with_retries(self, request, provider: str):
        """Run ``request()`` with bounded exponential backoff.

        One initial attempt plus at most ``Config.MAX_RETRIES`` retries are
        made, sleeping 1s, 2s, 4s, ... between attempts. The last error is
        re-raised when every attempt fails.
        """
        max_retries = max(0, Config.MAX_RETRIES)
        for attempt in range(max_retries + 1):
            try:
                return request()
            except Exception as e:
                print(f"❌ {provider} API error: {e}")
                if attempt >= max_retries:
                    raise
                time.sleep(2 ** attempt)  # Exponential backoff

    def _build_result(self, cleaned_text, tokens: int, default_price_per_1k: float) -> Dict:
        """Package a response, pricing ``tokens`` at the model's per-1K rate."""
        price_per_token = Config.PRICE_PER_1K_TOKENS.get(self.model, default_price_per_1k) / 1000
        return {
            'cleaned_text': cleaned_text,
            'tokens_used': tokens,
            'cost': tokens * price_per_token,
            'model': self.model
        }

    def _call_openai(self, prompt: str) -> Dict:
        """Call the OpenAI chat completions API and return the result dict."""
        def request():
            return self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "คุณคือผู้เชี่ยวชาญแก้ไข OCR ภาษาไทย"},
                    {"role": "user", "content": prompt}
                ],
                temperature=Config.TEMPERATURE,
                max_tokens=Config.MAX_TOKENS
            )

        response = self._call_with_retries(request, "OpenAI")
        return self._build_result(
            response.choices[0].message.content,
            response.usage.total_tokens,
            0.0005
        )

    def _call_anthropic(self, prompt: str) -> Dict:
        """Call the Anthropic messages API and return the result dict."""
        # NOTE(review): Anthropic model IDs are usually date-suffixed
        # (e.g. 'claude-3-haiku-20240307') — confirm self.model is accepted as-is.
        def request():
            return self.client.messages.create(
                model=self.model,
                max_tokens=Config.MAX_TOKENS,
                temperature=Config.TEMPERATURE,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

        response = self._call_with_retries(request, "Anthropic")
        return self._build_result(
            response.content[0].text,
            response.usage.input_tokens + response.usage.output_tokens,
            0.00025
        )

    def get_usage_summary(self) -> Dict:
        """Summarise the cumulative API usage of this client."""
        return {
            'total_tokens': self.total_tokens,
            'total_cost_usd': self.total_cost,
            'total_cost_thb': self.total_cost * 35,  # rough USD→THB rate
            'pages_processed': self.total_tokens // 500  # assumes ~500 tokens/page
        }

print("✅ LLM Client ready")

In [None]:

# ============================================
# 📌 Block 4: OCR Processor
# ============================================
class OCRProcessor:
    """Main pipeline: raw OCR file → LLM cleaning → validated clean text.

    ``stats`` counts processed/failed files and the accumulated estimated
    cost; ``training_pairs`` collects (raw, cleaned) samples for fine-tuning.
    """

    def __init__(self):
        """Create the LLM client and reset the counters."""
        self.llm = LLMClient()
        self.stats = {
            'processed': 0,
            'failed': 0,
            'total_cost': 0
        }
        self.training_pairs = []

    def process_file(self, file_path: Path) -> Dict:
        """Clean one OCR file, validate the result and save it.

        The cleaned text is saved even when validation fails (with a
        ``WARNING_`` filename prefix) so it can be reviewed; only files that
        do not fail validation contribute a training pair.

        Args:
            file_path: Path of the raw OCR ``.txt`` file (read as UTF-8).

        Returns:
            On success: ``{'success': True, 'cleaned_path': str,
            'tokens': int, 'cost': float, 'validation': dict}`` where tokens
            and cost cover every chunk of the file.
            On failure: ``{'success': False, 'error': str}``.
        """
        print(f"\n📄 Processing: {file_path.name}")

        try:
            # Read the raw OCR file
            raw_text = file_path.read_text(encoding='utf-8')

            # Nothing to clean: do not pay for a call on an empty prompt
            if not raw_text.strip():
                raise ValueError("Empty OCR file (nothing to clean)")

            cleaned_text, tokens, cost = self._clean_text(raw_text)

            # === VALIDATION ===
            print(f"   🔍 Validating...")
            validation = QualityValidator.enhanced_validate(
                raw_text,
                cleaned_text,
                file_path.name
            )
            self._report_validation(validation)

            # Save the result (kept for review even when validation fails)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            failed_validation = validation['status'] == 'FAIL'

            # Files that failed validation get a WARNING_ prefix
            prefix = "WARNING_" if failed_validation else ""
            clean_filename = f"{prefix}{file_path.stem}_clean_{timestamp}.txt"

            clean_path = Path(Config.CLEANED_DIR) / clean_filename
            clean_path.write_text(cleaned_text, encoding='utf-8')

            # Keep a training pair only when validation did not fail
            if not failed_validation:
                self.training_pairs.append({
                    'input': raw_text[:1000],
                    'output': cleaned_text[:1000],
                    'source': file_path.name,
                    'timestamp': timestamp,
                    'validation_score': validation['score']
                })

            # Update stats
            self.stats['processed'] += 1
            self.stats['total_cost'] += cost
            if failed_validation:
                self.stats['validation_failed'] = self.stats.get('validation_failed', 0) + 1
            elif validation['status'] == 'WARNING':
                self.stats['validation_warning'] = self.stats.get('validation_warning', 0) + 1

            print(f"   ✅ Saved: {clean_filename}")
            print(f"   💰 Cost: ${cost:.4f}")

            return {
                'success': True,
                'cleaned_path': str(clean_path),
                'tokens': tokens,
                'cost': cost,
                'validation': validation
            }

        except Exception as e:
            print(f"   ❌ Error: {e}")
            self.stats['failed'] += 1
            return {'success': False, 'error': str(e)}

    def _clean_text(self, raw_text: str) -> Tuple[str, int, float]:
        """Clean ``raw_text`` via the LLM, chunking it when it is long.

        Returns:
            ``(cleaned_text, tokens_used, cost)`` summed over all API calls.
        """
        # Small file: send it in one call
        if len(raw_text) <= 3000:
            result = self.llm.clean_ocr_text(raw_text)
            return result['cleaned_text'], result.get('tokens_used', 0), result.get('cost', 0)

        # Large file: clean chunk by chunk and add up the usage of every call
        chunks = self._split_text(raw_text)
        cleaned_chunks = []
        tokens = 0
        cost = 0

        for number, chunk in enumerate(chunks, 1):
            print(f"   Chunk {number}/{len(chunks)}...")
            result = self.llm.clean_ocr_text(chunk, number)
            cleaned_chunks.append(result['cleaned_text'])
            tokens += result.get('tokens_used', 0)
            cost += result.get('cost', 0)
            time.sleep(1)  # Rate limiting

        return '\n\n'.join(cleaned_chunks), tokens, cost

    def _report_validation(self, validation: Dict) -> None:
        """Print the validation outcome and save a report for FAIL/WARNING."""
        status = validation['status']

        if status == 'FAIL':
            print(f"   ❌ VALIDATION FAILED:")
            for issue in validation['issues']:
                print(f"      {issue}")
        elif status == 'WARNING':
            print(f"   ⚠️ VALIDATION WARNING:")
            # Only the first entry of each list is shown to keep output short
            if validation['warnings']:
                print(f"      {validation['warnings'][0]}")
            if validation['suspicious']:
                print(f"      {validation['suspicious'][0]}")
        else:
            print(f"   ✅ Validation passed (score: {validation['score']:.2f})")

        if status in ['FAIL', 'WARNING']:
            val_report_path = QualityValidator.save_validation_report(validation)
            print(f"   📊 Validation report: {val_report_path.name}")

    def _split_text(self, text: str, max_chars: int = 2500) -> List[str]:
        """Split long text into chunks at blank-line paragraph boundaries.

        Paragraphs are packed greedily until adding the next one would push
        the chunk above ``max_chars`` (separators are not counted). A single
        paragraph longer than ``max_chars`` is kept whole, so a chunk can
        exceed the limit.
        """
        paragraphs = text.split('\n\n')

        chunks = []
        current_chunk = []
        current_length = 0

        for para in paragraphs:
            para_length = len(para)

            if current_length + para_length > max_chars and current_chunk:
                # Chunk is full: store it and start a new one
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = [para]
                current_length = para_length
            else:
                current_chunk.append(para)
                current_length += para_length

        # Last chunk
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        return chunks

    def process_batch(self, file_pattern: str = "*.txt", limit: int = None):
        """Process every matching file in ``Config.RAW_OCR_DIR``.

        Args:
            file_pattern: Glob pattern of the files to process.
            limit: Maximum number of files (None = all).

        Returns:
            The list of per-file result dicts from ``process_file``.
        """
        # Sorted so the order (and what `limit` selects) is reproducible
        raw_dir = Path(Config.RAW_OCR_DIR)
        files = sorted(raw_dir.glob(file_pattern))

        if limit:
            files = files[:limit]

        print(f"\n🚀 Processing {len(files)} files...")
        print("=" * 50)

        results = []
        for file_path in tqdm(files, desc="Processing"):
            result = self.process_file(file_path)
            results.append(result)

            # Rate limiting
            time.sleep(0.5)

        self._print_summary(results)
        self._save_training_pairs()

        return results

    def _print_summary(self, results: List[Dict]):
        """Print the batch totals together with a validation summary."""
        successful = [r for r in results if r.get('success')]
        total_tokens = sum(r.get('tokens', 0) for r in successful)
        total_cost = sum(r.get('cost', 0) for r in successful)

        # Count results per validation status
        validation_stats = {
            'PASS': 0,
            'WARNING': 0,
            'FAIL': 0
        }

        for r in successful:
            if 'validation' in r:
                status = r['validation'].get('status', 'UNKNOWN')
                validation_stats[status] = validation_stats.get(status, 0) + 1

        print("\n" + "=" * 50)
        print("📊 PROCESSING SUMMARY")
        print("=" * 50)
        print(f"✅ Success: {len(successful)}/{len(results)}")
        print(f"❌ Failed: {len(results) - len(successful)}")
        print(f"🔤 Total tokens: {total_tokens:,}")
        print(f"💰 Total cost: ${total_cost:.4f} (~{total_cost*35:.2f} บาท)")

        # Validation summary
        print(f"\n📋 Validation Summary:")
        print(f"   ✅ Passed: {validation_stats.get('PASS', 0)}")
        print(f"   ⚠️ Warnings: {validation_stats.get('WARNING', 0)}")
        print(f"   ❌ Failed: {validation_stats.get('FAIL', 0)}")

        # Point the user at files that need a manual look
        if validation_stats.get('WARNING', 0) > 0:
            print(f"\n💡 มี {validation_stats['WARNING']} ไฟล์ที่ควรตรวจสอบ")
            print(f"   ดู validation reports ใน: {Config.LOGS_DIR}")

        if validation_stats.get('FAIL', 0) > 0:
            print(f"\n⚠️ มี {validation_stats['FAIL']} ไฟล์ที่ validation ไม่ผ่าน")
            print(f"   ไฟล์เหล่านี้มี prefix 'WARNING_' ใน cleaned folder")

        print(f"\n📁 Cleaned files saved to: {Config.CLEANED_DIR}")

        # API usage summary (cumulative for this client, not just this batch)
        usage = self.llm.get_usage_summary()
        print(f"\n📈 API Usage:")
        print(f"   Model: {Config.MODEL}")
        print(f"   Tokens: {usage['total_tokens']:,}")
        print(f"   Cost: ${usage['total_cost_usd']:.4f} (~{usage['total_cost_thb']:.2f} บาท)")

    def _save_training_pairs(self):
        """Write all collected training pairs to today's JSONL file.

        The file is named by date and opened in write mode, so it is
        rewritten with the full ``training_pairs`` list on every call.
        """
        if not self.training_pairs:
            return

        # Save as JSONL
        pairs_file = Path(Config.TRAINING_PAIRS_DIR) / f"pairs_{datetime.now().strftime('%Y%m%d')}.jsonl"

        with open(pairs_file, 'w', encoding='utf-8') as f:
            for pair in self.training_pairs:
                f.write(json.dumps(pair, ensure_ascii=False) + '\n')

        print(f"💾 Training pairs saved: {pairs_file}")

print("✅ OCR Processor ready")

In [None]:
# ============================================
# 📌 Block 5: Enhanced Quality Validator
# ============================================
class QualityValidator:
    """Heuristic quality checks comparing raw OCR text with its cleaned version."""

    @staticmethod
    def validate(raw_text: str, cleaned_text: str) -> Dict:
        """Run the basic length and quote-count checks.

        Returns:
            A dict with ``valid`` (no issues found), ``score`` (1.0 minus 0.2
            per issue and 0.05 per warning, clamped to [0, 1]), ``issues``,
            ``warnings``, ``length_ratio`` and ``length_change_percent``.
        """
        issues = []
        notes = []  # returned under the 'warnings' key

        # 1. Length check (both values are 0 when the raw text is empty)
        if raw_text:
            len_ratio = len(cleaned_text) / len(raw_text)
            len_change = (len(cleaned_text) - len(raw_text)) / len(raw_text) * 100
        else:
            len_ratio = 0
            len_change = 0

        if len_ratio < 0.8:
            issues.append(f"⚠️ ข้อความสั้นลง {abs(len_change):.1f}% (อาจมีการลบเนื้อหา)")
        elif len_ratio > 1.2:
            issues.append(f"⚠️ ข้อความยาวขึ้น {len_change:.1f}% (อาจมีการเพิ่มเนื้อหา)")
        elif len_ratio < 0.9:
            notes.append(f"📝 ข้อความสั้นลง {abs(len_change):.1f}%")
        elif len_ratio > 1.1:
            notes.append(f"📝 ข้อความยาวขึ้น {len_change:.1f}%")

        # 2. Quoted-span check (ASCII double quotes only)
        raw_quotes = len(re.findall(r'"[^"]*"', raw_text))
        clean_quotes = len(re.findall(r'"[^"]*"', cleaned_text))
        quote_diff = abs(raw_quotes - clean_quotes)

        if quote_diff > 3:
            issues.append(f"⚠️ จำนวน quotes ต่างกันมาก ({raw_quotes} → {clean_quotes})")
        elif quote_diff > 1:
            notes.append(f"📝 จำนวน quotes ต่างกัน ({raw_quotes} → {clean_quotes})")

        # 3. Score
        score = 1.0
        score -= len(issues) * 0.2
        score -= len(notes) * 0.05
        score = max(0, min(1, score))

        return {
            'valid': len(issues) == 0,
            'score': score,
            'issues': issues,
            'warnings': notes,
            'length_ratio': len_ratio,
            'length_change_percent': len_change
        }

    @staticmethod
    def enhanced_validate(raw_text: str, cleaned_text: str, filename: str = "") -> Dict:
        """Run the detailed validation and build a report.

        On top of ``validate`` this compares the whitespace-separated tokens
        of both texts, flags suspicious additions, checks a few patterns that
        often change, and attaches a short diff sample.

        Returns:
            A report dict with ``filename``, ``status`` ('PASS', 'WARNING' or
            'FAIL'), ``score``, ``stats``, ``issues``, ``warnings``,
            ``suspicious`` (at most 5 entries), ``diff_sample`` and
            ``timestamp``.
        """
        issues = []
        notes = []  # returned under the 'warnings' key
        suspicious_changes = []

        # 1. Basic validation
        basic_result = QualityValidator.validate(raw_text, cleaned_text)
        issues.extend(basic_result['issues'])
        notes.extend(basic_result['warnings'])

        # 2. Tokens that changed
        raw_words = set(raw_text.split())
        clean_words = set(cleaned_text.split())

        added_words = clean_words - raw_words
        removed_words = raw_words - clean_words

        # 3. Suspicious additions (sorted so the report is deterministic)
        for word in sorted(added_words):
            # Tokens longer than 15 characters are suspicious
            if len(word) > 15:
                suspicious_changes.append(f"➕ เพิ่มคำยาว: '{word}'")
            # English words that are unlikely in a Thai novel; the token must
            # contain a letter so that numbers/punctuation are not reported
            elif (word.isascii() and len(word) > 3
                  and any(ch.isalpha() for ch in word)
                  and word.lower() not in ('okay', 'yes', 'no')):
                suspicious_changes.append(f"➕ เพิ่มภาษาอังกฤษ: '{word}'")

        # Many tokens disappeared
        if len(removed_words) > 20:
            suspicious_changes.append(f"➖ คำหายไป {len(removed_words)} คำ")

        # 4. Patterns that often change
        notes.extend(QualityValidator._check_common_patterns(raw_text, cleaned_text))

        # 5. Diff sample
        diff_sample = QualityValidator._get_diff_sample(raw_text, cleaned_text)

        # 6. Report
        if issues:
            status = 'FAIL'
        elif notes or suspicious_changes:
            status = 'WARNING'
        else:
            status = 'PASS'

        # Counted outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12
        raw_quote_chars = raw_text.count('"')
        clean_quote_chars = cleaned_text.count('"')

        report = {
            'filename': filename,
            'status': status,
            'score': basic_result['score'],
            'stats': {
                'length_change': f"{basic_result['length_change_percent']:+.1f}%",
                'words_added': len(added_words),
                'words_removed': len(removed_words),
                'quotes_change': f"{raw_quote_chars} → {clean_quote_chars}"
            },
            'issues': issues,
            'warnings': notes,
            'suspicious': suspicious_changes[:5],  # first 5 only
            'diff_sample': diff_sample,
            'timestamp': datetime.now().isoformat()
        }

        return report

    @staticmethod
    def _check_common_patterns(raw_text: str, cleaned_text: str) -> List[str]:
        """Return warnings for changes in 'ๆ' spacing and in number counts."""
        notes = []

        # Mai yamok ('ๆ'): occurrences surrounded by spaces vs. all others
        raw_yamok_space = raw_text.count(' ๆ ')
        clean_yamok_space = cleaned_text.count(' ๆ ')
        raw_yamok_no_space = raw_text.count('ๆ') - raw_yamok_space
        clean_yamok_no_space = cleaned_text.count('ๆ') - clean_yamok_space

        if raw_yamok_space != clean_yamok_space or raw_yamok_no_space != clean_yamok_no_space:
            notes.append(f"📝 รูปแบบ 'ๆ' เปลี่ยน (space: {raw_yamok_space}→{clean_yamok_space}, no-space: {raw_yamok_no_space}→{clean_yamok_no_space})")

        # Digit runs
        raw_numbers = len(re.findall(r'\d+', raw_text))
        clean_numbers = len(re.findall(r'\d+', cleaned_text))
        if abs(raw_numbers - clean_numbers) > 2:
            notes.append(f"📝 จำนวนตัวเลขเปลี่ยน ({raw_numbers} → {clean_numbers})")

        return notes

    @staticmethod
    def _get_diff_sample(raw_text: str, cleaned_text: str, max_lines: int = 3) -> List[str]:
        """Return up to ``max_lines`` changed lines from the first 500 characters.

        Added lines are prefixed with ✅ and removed lines with ❌; each line is
        cut to 100 characters.
        """
        import difflib

        # Only the first 500 characters of each text are compared
        raw_lines = raw_text[:500].split('\n')
        clean_lines = cleaned_text[:500].split('\n')

        diff = difflib.unified_diff(
            raw_lines,
            clean_lines,
            lineterm='',
            n=0
        )

        changes = []
        for line in diff:
            # '+++' / '---' are the diff header lines, not content
            if line.startswith('+') and not line.startswith('+++'):
                changes.append(f"✅ {line[1:][:100]}")
            elif line.startswith('-') and not line.startswith('---'):
                changes.append(f"❌ {line[1:][:100]}")

        return changes[:max_lines]

    @staticmethod
    def save_validation_report(report: Dict, output_dir: str = None) -> Path:
        """Write ``report`` as JSON and return the path of the written file.

        Args:
            report: Report dict as produced by ``enhanced_validate``.
            output_dir: Target folder; defaults to ``Config.LOGS_DIR``.
        """
        if output_dir is None:
            output_dir = Config.LOGS_DIR

        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # File name: validation_<source name without .txt>_<timestamp>.json
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = report.get('filename', 'unknown').replace('.txt', '')
        report_file = Path(output_dir) / f"validation_{filename}_{timestamp}.json"

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)

        return report_file

print("✅ Enhanced Quality Validator ready")

In [None]:
# ============================================
# 📌 Block 6: Main Menu
# ============================================
def _menu_single_file(processor):
    """Menu [1]: list the raw OCR files and process the one the user picks."""
    print("\n📄 Single File Processing")
    print("-" * 40)

    # Sorted so the numbering is stable between runs
    files = sorted(Path(Config.RAW_OCR_DIR).glob("*.txt"))

    if not files:
        print("❌ ไม่พบไฟล์ใน raw_ocr/")
        return

    print("พบไฟล์:")
    for i, f in enumerate(files[:10], 1):
        print(f"  [{i}] {f.name}")

    file_idx = input("\nเลือกไฟล์ (number): ").strip()

    try:
        index = int(file_idx)
    except ValueError:
        index = 0

    # Reject 0 and negatives explicitly: files[-1] would silently pick a file
    if 1 <= index <= len(files):
        processor.process_file(files[index - 1])
    else:
        print("❌ Invalid selection")


def _menu_batch(processor):
    """Menu [2]: ask for an optional file limit and run a batch."""
    print("\n📦 Batch Processing")
    print("-" * 40)

    limit_text = input("จำนวนไฟล์ที่จะ process (Enter = ทั้งหมด): ").strip()
    limit = None
    if limit_text:
        try:
            limit = int(limit_text)
        except ValueError:
            limit = 0
        if limit <= 0:
            print("❌ Invalid number")
            return

    confirm = input(f"\n⚠️ จะ process {limit or 'ทั้งหมด'} ไฟล์ ต้องการดำเนินการ? (y/n): ")

    if confirm.lower() == 'y':
        processor.process_batch(limit=limit)
    else:
        print("❌ Cancelled")


def _menu_usage(processor):
    """Menu [3]: print the cumulative API usage and estimated cost."""
    print("\n💰 API Usage & Cost")
    print("-" * 40)

    usage = processor.llm.get_usage_summary()
    print(f"Model: {Config.MODEL}")
    print(f"Total tokens: {usage['total_tokens']:,}")
    print(f"Total cost: ${usage['total_cost_usd']:.4f}")
    print(f"Total cost (THB): ~{usage['total_cost_thb']:.2f} บาท")
    print(f"Est. pages: ~{usage['pages_processed']}")


def _menu_test_sample(processor):
    """Menu [4]: send a built-in sample through the LLM and show the result."""
    print("\n🔧 Test with Sample")
    print("-" * 40)

    sample = """หอประชุมเป็นอาคารที่มีหน้าตาคล้ายบ้านชั้นเดียวทั่วไป
แต่มีขนาดใหญ่กว่าเล็กน้อย ชายหญิงในชุดไว้ทุกข์หลายคน
เดินขวักไขว่ไปมาท่ามกลางความวุ่นวาย"""

    print("Sample text:")
    print(sample)
    print("\nProcessing...")

    try:
        result = processor.llm.clean_ocr_text(sample)
    except Exception as e:
        # An API failure should not terminate the interactive menu
        print(f"❌ Error: {e}")
        return

    print("\nCleaned text:")
    print(result['cleaned_text'])
    print(f"\nTokens: {result['tokens_used']}")
    print(f"Cost: ${result['cost']:.4f}")


def _menu_statistics(processor):
    """Menu [5]: print file counts and this session's success rate."""
    print("\n📊 Statistics")
    print("-" * 40)

    raw_files = len(list(Path(Config.RAW_OCR_DIR).glob("*.txt")))
    clean_files = len(list(Path(Config.CLEANED_DIR).glob("*.txt")))

    print(f"Raw OCR files: {raw_files}")
    print(f"Cleaned files: {clean_files}")
    print(f"Success rate: {processor.stats['processed']}/{processor.stats['processed'] + processor.stats['failed']}")


def _menu_settings(processor):
    """Menu [6]: show the current settings and optionally switch the model."""
    print("\n⚙️ Settings")
    print("-" * 40)
    print(f"Current model: {Config.MODEL}")
    print(f"Temperature: {Config.TEMPERATURE}")
    print(f"Max tokens: {Config.MAX_TOKENS}")

    change = input("\nChange model? (y/n): ")
    if change.lower() != 'y':
        return

    print("\nAvailable models:")
    print("[1] gpt-4o-mini (cheapest)")
    print("[2] gpt-3.5-turbo")
    print("[3] claude-3-haiku")

    models = {'1': 'gpt-4o-mini', '2': 'gpt-3.5-turbo', '3': 'claude-3-haiku'}
    new_model = models.get(input("Select: ").strip())
    if new_model is None:
        print("❌ Invalid choice")
        return

    previous_model = Config.MODEL
    Config.MODEL = new_model
    try:
        new_client = LLMClient()  # Reinitialize for the new model
    except Exception as e:
        # e.g. missing API key/SDK for the new provider: keep the old client
        Config.MODEL = previous_model
        print(f"❌ Could not switch model: {e}")
        return

    # Carry the usage over so the cost report still covers the whole session
    new_client.total_tokens += processor.llm.total_tokens
    new_client.total_cost += processor.llm.total_cost
    processor.llm = new_client
    print(f"✅ Model changed to: {Config.MODEL}")


def main_menu():
    """Run the interactive text menu until the user chooses Exit."""

    processor = OCRProcessor()

    handlers = {
        '1': _menu_single_file,
        '2': _menu_batch,
        '3': _menu_usage,
        '4': _menu_test_sample,
        '5': _menu_statistics,
        '6': _menu_settings,
    }

    while True:
        print("""
╔════════════════════════════════════════════╗
║     OCR PROCESSING WITH API v1.0            ║
║          Automated Thai Novel OCR           ║
╚════════════════════════════════════════════╝

[1] 🚀 Process ไฟล์เดียว
[2] 📦 Process หลายไฟล์ (Batch)
[3] 💰 Check API usage & cost
[4] 🔧 Test with sample text
[5] 📊 View statistics
[6] ⚙️ Settings
[7] ❌ Exit

        """)

        choice = input("Select (1-7): ").strip()

        if choice == '7':
            print("\n👋 Goodbye!")
            break

        handler = handlers.get(choice)
        if handler is None:
            print("❌ Invalid choice")
            continue

        handler(processor)
        input("\nPress Enter to continue...")

print("✅ Main menu ready")

In [None]:
# ============================================
# 📌 Block 7: Quick Start Functions
# ============================================

def quick_setup():
    """Collect an API key if none is available, then test the connection.

    A key counts as available when it is set in ``Config`` or in the
    ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` environment variables. When the
    user enters a key, ``Config.MODEL`` is switched to that provider's model.

    Returns:
        True when a test request succeeded, False otherwise.
    """
    import getpass  # local import: only this function prompts for secrets

    print("\n🔧 Quick Setup")
    print("=" * 50)

    # Check API key (Config first, environment variables as the fallback)
    has_openai_key = bool(Config.OPENAI_API_KEY or os.getenv('OPENAI_API_KEY'))
    has_anthropic_key = bool(Config.ANTHROPIC_API_KEY or os.getenv('ANTHROPIC_API_KEY'))

    if not has_openai_key and not has_anthropic_key:
        print("\n⚠️ ไม่พบ API key!")
        print("\nวิธีใส่ API key:")
        print("1. แก้ใน Config class ด้านบน")
        print("2. หรือ set environment variable:")

        provider = input("\nใช้ [1] OpenAI หรือ [2] Anthropic? : ").strip()

        # getpass keeps the secret out of the echoed terminal/notebook output
        if provider == '1':
            key = getpass.getpass("Enter OpenAI API key: ").strip()
            Config.OPENAI_API_KEY = key
            Config.MODEL = 'gpt-4o-mini'
        else:
            key = getpass.getpass("Enter Anthropic API key: ").strip()
            Config.ANTHROPIC_API_KEY = key
            Config.MODEL = 'claude-3-haiku'

    # Test connection with a tiny request
    print("\n🔍 Testing API connection...")
    try:
        client = LLMClient()
        result = client.clean_ocr_text("ทดสอบ API")
        print("✅ API connection successful!")
        print(f"   Model: {Config.MODEL}")
        print(f"   Test cost: ${result['cost']:.4f}")
        return True
    except Exception as e:
        print(f"❌ API test failed: {e}")
        return False


def process_single_file_quick(filename: str):
    """Process a single file from the raw OCR folder and print the outcome.

    Args:
        filename: Name of a file inside ``Config.RAW_OCR_DIR``.
    """
    processor = OCRProcessor()
    target = Path(Config.RAW_OCR_DIR, filename)

    # Guard: nothing to do when the file is missing
    if not target.exists():
        print(f"❌ File not found: {filename}")
        return

    outcome = processor.process_file(target)

    # Guard: report the failure and stop
    if not outcome['success']:
        print(f"\n❌ Failed: {outcome.get('error')}")
        return

    cost_usd = outcome['cost']
    print(f"\n✅ Success!")
    print(f"   Output: {outcome['cleaned_path']}")
    print(f"   Cost: ${cost_usd:.4f} (~{cost_usd*35:.2f} บาท)")

In [None]:
# ============================================
# 📌 Block 8: Main Execution
# ============================================

if __name__ == "__main__":
    print("""
    ================================================================================
                      OCR PROCESSING WITH API v1.0
                         Automated Thai Novel OCR
    ================================================================================

    🎯 Features:
       - Automated OCR cleaning with GPT/Claude
       - Cost tracking & optimization
       - Quality validation
       - Training pairs collection

    💰 Estimated cost:
       - GPT-4o-mini: ~0.01 บาท/หน้า
       - 100 หน้า = ~1 บาท
       - 1,000 หน้า = ~10 บาท

    """)

    # Run the quick setup only when no API key is available at all, either in
    # Config or in the environment variables that LLMClient falls back to.
    has_api_key = any((
        Config.OPENAI_API_KEY,
        Config.ANTHROPIC_API_KEY,
        os.getenv('OPENAI_API_KEY'),
        os.getenv('ANTHROPIC_API_KEY'),
    ))
    if not has_api_key:
        print("📝 ต้อง setup API key ก่อน")
        if quick_setup():
            print("\n✅ Setup complete! Ready to use")
        else:
            print("\n❌ Setup failed. Please check API key")
            # SystemExit instead of the interactive `exit` helper, which is
            # injected by site/IPython rather than being part of the language
            raise SystemExit(1)

    # Run main menu
    print("\n🚀 Starting main menu...")
    main_menu()

    print("\n🎉 Thank you for using OCR Processor!")