<a href="https://colab.research.google.com/github/tanatet8/Colab_Script/blob/main/ThaiNovel_OCR_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# New cell: mount Google Drive, then create the pipeline's working folders.
from google.colab import drive
drive.mount('/content/drive')

# Create the working folders inside Drive (must run after the mount above,
# because BASE lives under the mount point).
import os
BASE = '/content/drive/MyDrive/OCR'  # <- change this to your own Drive folder

# exist_ok=True makes re-running this cell harmless when the folders already exist.
for folder in ['raw_ocr', 'batches', 'cleaned_gpt', 'cleaned_claude', 'final_corpus', 'reports', 'training_pairs']:
    os.makedirs(f'{BASE}/{folder}', exist_ok=True)

print("‚úÖ Folders ready in Drive!")

In [None]:
# ============================================
# Block 1: Setup & Import
# ============================================
import os
import re
import json
import difflib
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import warnings
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Probe for pyperclip: it is optional. CLIPBOARD_AVAILABLE records whether the
# import succeeded so later code can fall back to writing the prompt to a file.
try:
    import pyperclip
    CLIPBOARD_AVAILABLE = True
except ImportError:
    CLIPBOARD_AVAILABLE = False
    print("‚ö†Ô∏è pyperclip not installed - ‡∏à‡∏∞‡πÉ‡∏ä‡πâ‡πÑ‡∏ü‡∏•‡πå‡πÅ‡∏ó‡∏ô clipboard")

print("‚úÖ Libraries loaded")

In [None]:
# ============================================
# Block 2: Enhanced Configuration
# ============================================
class Config:
    """Configuration for the OCR post-processing pipeline, tuned for Thai novels.

    All values are plain class attributes and are read directly as
    ``Config.NAME`` by the other classes in this file.

    NOTE(review): the non-ASCII literals below look mis-encoded in this copy
    of the file (UTF-8 bytes decoded with a legacy 8-bit codec). Confirm the
    file's encoding before relying on these replacements and patterns.
    """

    # Edit the paths here.
    BASE = '/content/drive/MyDrive/OCR'  # <- main working folder in Drive

    # Folder names used by the pipeline.
    # NOTE(review): EnhancedBatchPreparer wraps RAW_OCR_DIR / BATCHES_DIR in
    # Path() as-is, so they resolve relative to the current working directory
    # rather than under BASE -- confirm that is intended.
    RAW_OCR_DIR = 'raw_ocr'
    BATCHES_DIR = 'batches'
    CLEANED_GPT_DIR = 'cleaned_gpt'
    CLEANED_CLAUDE_DIR = 'cleaned_claude'
    FINAL_DIR = 'final_corpus'
    REPORTS_DIR = 'reports'
    TRAINING_PAIRS_DIR = 'training_pairs'

    # Processing parameters
    MAX_PAGES_PER_BATCH = 20  # default page limit used by create_batch
    MIN_LINE_LENGTH = 3  # lines shorter than this are treated as probably broken

    # Literal OCR replacements for Thai novels (old -> new).
    # pre_clean_text applies them one after another with str.replace, in
    # insertion order, so the order of the entries matters.
    OCR_REPLACEMENTS = {
        # Common OCR errors
        '‡πÄ‡πÄ': '‡πÅ',
        '‡πç‡∏≤': '‡∏≥',
        '‡πç ‡∏≤': '‡∏≥',
        '  ': ' ',
        '   ': ' ',
        '\t': ' ',

        # Punctuation fixes
        ' ‡πÜ ': '‡πÜ ',
        '‡πÜ ': '‡πÜ',
        ' ‡πÜ': '‡πÜ',
        ' "': '"',
        '" ': '"',
        ' ,': ',',
        ' .': '.',

        # Common Thai novel terms
        '‡∏û‡∏ß‡∏Å‡πÄ‡∏Ç‡πÖ': '‡∏û‡∏ß‡∏Å‡πÄ‡∏Ç‡∏≤',
        '‡∏ó‡πç‡∏≤': '‡∏ó‡∏≥',
        '‡∏à‡πÖ‡∏Å': '‡∏à‡∏≤‡∏Å',
        '‡∏î‡∏π‡πà': '‡∏î‡∏π',
    }

    # Regex patterns for lines that are probably OCR errors.
    # is_incomplete_line tests each one with re.match against the stripped line.
    SUSPICIOUS_PATTERNS = [
        r'^[‡∏Å-‡∏Æ]$',  # a single character from this class (a lone Thai letter)
        r'^[a-zA-Z]$',  # a single English letter
        r'^.{1,2}$',  # a very short line (1-2 characters)
        r'^\d+$',  # digits only
    ]

print("‚úÖ Enhanced Config loaded")

In [None]:
# ============================================
# Block 3: Novel Text Analyzer
# Analyzes the characteristics of Thai novel text
# ============================================
class NovelTextAnalyzer:
    """‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡πÅ‡∏•‡∏∞‡πÅ‡∏Å‡πâ‡∏õ‡∏±‡∏ç‡∏´‡∏≤ OCR ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡πÑ‡∏ó‡∏¢"""

    @staticmethod
    def is_dialogue(text: str) -> bool:
        """‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà"""
        dialogue_patterns = [
            r'^".*"',  # ‡∏Ç‡∏∂‡πâ‡∏ô‡∏ï‡πâ‡∏ô‡πÅ‡∏•‡∏∞‡∏•‡∏á‡∏ó‡πâ‡∏≤‡∏¢‡∏î‡πâ‡∏ß‡∏¢ "
            r'".*"$',  # ‡∏°‡∏µ quote
            r'".*".*‡∏Å‡∏•‡πà‡∏≤‡∏ß',  # ‡∏°‡∏µ‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ ‡∏Å‡∏•‡πà‡∏≤‡∏ß
            r'".*".*‡∏û‡∏π‡∏î',  # ‡∏°‡∏µ‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ ‡∏û‡∏π‡∏î
            r'".*".*‡∏ï‡∏≠‡∏ö',  # ‡∏°‡∏µ‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ ‡∏ï‡∏≠‡∏ö
            r'".*".*‡∏ñ‡∏≤‡∏°',  # ‡∏°‡∏µ‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ ‡∏ñ‡∏≤‡∏°
            r'".*".*‡∏£‡πâ‡∏≠‡∏á',  # ‡∏°‡∏µ‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ ‡∏£‡πâ‡∏≠‡∏á
            r'".*".*‡∏ö‡πà‡∏ô',  # ‡∏°‡∏µ‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ ‡∏ö‡πà‡∏ô
        ]

        for pattern in dialogue_patterns:
            if re.search(pattern, text):
                return True
        return False

    @staticmethod
    def is_incomplete_line(text: str) -> bool:
        """Heuristically decide whether ``text`` is a broken OCR fragment.

        Returns True when any of the following holds:

        * ``text`` is shorter than ``Config.MIN_LINE_LENGTH`` characters;
        * the stripped text matches one of ``Config.SUSPICIOUS_PATTERNS``
          (tested with ``re.match``, i.e. anchored at the start);
        * ``text`` contains none of the characters in ``thai_vowels``.

        NOTE(review): because of the last rule, any line without a character
        from ``thai_vowels`` (for example a Latin-only line) is always
        reported as incomplete -- confirm that this is intended.
        """
        if len(text) < Config.MIN_LINE_LENGTH:
            return True

        stripped = text.strip()
        if any(re.match(pattern, stripped) for pattern in Config.SUSPICIOUS_PATTERNS):
            return True

        # A line that shares no character with this set is treated as broken.
        thai_vowels = '‡∏∞‡∏≤‡∏¥‡∏µ‡∏∂‡∏∑‡∏∏‡∏π‡πÄ‡πÅ‡πÇ‡πÉ‡πÑ‡πá‡πà‡πâ‡πä‡πã‡∏≥'
        return set(thai_vowels).isdisjoint(text)

    @staticmethod
    def should_merge_lines(prev_line: str, curr_line: str) -> bool:
        """‡∏ï‡∏±‡∏î‡∏™‡∏¥‡∏ô‡πÉ‡∏à‡∏ß‡πà‡∏≤‡∏Ñ‡∏ß‡∏£‡∏£‡∏ß‡∏° 2 ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà"""
        # ‡∏ñ‡πâ‡∏≤‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏à‡∏ö‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ
        if prev_line and not prev_line[-1] in '.!? ':
            # ‡πÅ‡∏•‡∏∞‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤‡πÉ‡∏´‡∏°‡πà
            if not NovelTextAnalyzer.is_dialogue(curr_line):
                # ‡πÅ‡∏•‡∏∞‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà paragraph ‡πÉ‡∏´‡∏°‡πà (‡πÑ‡∏°‡πà‡∏Ç‡∏∂‡πâ‡∏ô‡∏ï‡πâ‡∏ô‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏≤‡∏£‡πÄ‡∏ß‡πâ‡∏ô‡∏ß‡∏£‡∏£‡∏Ñ)
                if not curr_line.startswith(('  ', '\t')):
                    return True
        return False

    @staticmethod
    def fix_broken_words(text: str) -> str:
        """Re-join lines that OCR split in the middle of a word or sentence.

        The text is processed line by line (each line stripped). A line that
        ``is_incomplete_line`` flags as a fragment is glued, without a
        separator, onto:

        1. the previous output line, if that line does not already end in
           ``.``, ``!``, ``?`` or ``"``; otherwise
        2. the following input line, if that line is not dialogue (the
           following line is consumed together with the fragment).

        If neither applies the fragment is kept as its own line. Blank lines
        are dropped from the output.

        Blank lines are skipped *before* the fragment check. Previously an
        empty line counted as a fragment, which let the line after it be
        appended without being checked itself, and let two consecutive blank
        lines produce an empty output line.

        Args:
            text: Raw multi-line OCR text.

        Returns:
            The repaired text, lines joined with ``\\n``.
        """
        lines = [raw.strip() for raw in text.split('\n')]
        sentence_enders = ('.', '!', '?', '"')
        fixed_lines = []

        i = 0
        while i < len(lines):
            curr_line = lines[i]

            # Blank lines carry no text to merge; skip them outright so they
            # are never mistaken for a broken fragment.
            if not curr_line:
                i += 1
                continue

            if NovelTextAnalyzer.is_incomplete_line(curr_line):
                # Prefer attaching the fragment to an unfinished previous line.
                if fixed_lines and not fixed_lines[-1].endswith(sentence_enders):
                    fixed_lines[-1] += curr_line
                    i += 1
                    continue

                # Otherwise treat it as the start of the next line, unless
                # that line is dialogue and must stay on its own.
                if i + 1 < len(lines):
                    next_line = lines[i + 1]
                    if not NovelTextAnalyzer.is_dialogue(next_line):
                        fixed_lines.append(curr_line + next_line)
                        i += 2  # the next line has been consumed as well
                        continue

            # Regular line, or a fragment with nowhere to go: keep as is.
            fixed_lines.append(curr_line)
            i += 1

        return '\n'.join(fixed_lines)

print("‚úÖ NovelTextAnalyzer ready")

In [None]:
# ============================================
# Block 4: Enhanced BatchPreparer
# ============================================
class EnhancedBatchPreparer:
    """Enhanced batch preparer ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡πÑ‡∏ó‡∏¢"""

    def __init__(self, input_folder=None, output_folder=None):
        """Set up the input/output folders and the text analyzer.

        Args:
            input_folder: Folder holding the raw OCR ``.txt`` pages. A falsy
                value selects ``Config.RAW_OCR_DIR``.
            output_folder: Folder that receives batch and prompt files. A
                falsy value selects ``Config.BATCHES_DIR``.

        Both folders are created if missing (their parents are not).

        NOTE(review): the defaults are relative names, so they resolve
        against the current working directory, not ``Config.BASE`` --
        confirm this matches where the Drive folders are expected.
        """
        source_dir = input_folder or Config.RAW_OCR_DIR
        target_dir = output_folder or Config.BATCHES_DIR

        self.input_folder = Path(source_dir)
        self.output_folder = Path(target_dir)
        for directory in (self.input_folder, self.output_folder):
            directory.mkdir(exist_ok=True)

        self.analyzer = NovelTextAnalyzer()

    def pre_clean_text(self, text: str) -> str:
        """Run the rule-based clean-up pipeline over one page of OCR text.

        Stages, in order:

        1. apply every ``Config.OCR_REPLACEMENTS`` entry with ``str.replace``
           (sequentially, in the dict's insertion order);
        2. re-join broken fragments via ``self.analyzer.fix_broken_words``;
        3. regroup lines into paragraphs via ``_smart_paragraph_split``;
        4. collapse runs of 3+ newlines to a blank line and runs of 2+
           spaces to one space.

        Returns:
            The cleaned text with leading/trailing whitespace stripped.
        """
        cleaned = text

        # Stage 1: literal replacements
        for needle, replacement in Config.OCR_REPLACEMENTS.items():
            cleaned = cleaned.replace(needle, replacement)

        # Stage 2 and 3: structural repairs
        cleaned = self.analyzer.fix_broken_words(cleaned)
        cleaned = self._smart_paragraph_split(cleaned)

        # Stage 4: whitespace normalisation (newlines first, then spaces)
        without_gaps = re.sub(r'\n{3,}', '\n\n', cleaned)
        single_spaced = re.sub(r' {2,}', ' ', without_gaps)

        return single_spaced.strip()

    def _smart_paragraph_split(self, text: str) -> str:
        """‡πÅ‡∏ö‡πà‡∏á paragraph ‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ä‡∏≤‡∏ç‡∏â‡∏•‡∏≤‡∏î"""
        lines = text.split('\n')
        paragraphs = []
        current_para = []

        for i, line in enumerate(lines):
            line = line.strip()

            if not line:
                # ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏ß‡πà‡∏≤‡∏á = ‡∏à‡∏ö paragraph
                if current_para:
                    paragraphs.append(' '.join(current_para))
                    current_para = []
                continue

            # ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤‡πÉ‡∏´‡∏°‡πà‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà
            if self.analyzer.is_dialogue(line):
                # ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ paragraph ‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤ ‡πÉ‡∏´‡πâ‡∏à‡∏ö‡∏°‡∏±‡∏ô‡∏Å‡πà‡∏≠‡∏ô
                if current_para and not self.analyzer.is_dialogue(current_para[-1]):
                    paragraphs.append(' '.join(current_para))
                    current_para = [line]
                else:
                    current_para.append(line)
            else:
                # ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤
                if i > 0 and current_para:
                    # ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤‡∏Ñ‡∏ß‡∏£‡∏£‡∏ß‡∏°‡∏Å‡∏±‡∏ö‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡πÑ‡∏´‡∏°
                    if self.analyzer.should_merge_lines(current_para[-1], line):
                        current_para.append(line)
                    else:
                        # ‡πÄ‡∏£‡∏¥‡πà‡∏° paragraph ‡πÉ‡∏´‡∏°‡πà
                        paragraphs.append(' '.join(current_para))
                        current_para = [line]
                else:
                    current_para.append(line)

        # ‡πÄ‡∏û‡∏¥‡πà‡∏° paragraph ‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢
        if current_para:
            paragraphs.append(' '.join(current_para))

        return '\n'.join(paragraphs)

    def create_enhanced_prompt(self, batch_text: str) -> str:
        """Build the LLM correction prompt for one batch.

        ``batch_text`` is embedded verbatim into a fixed instruction
        template: a numbered list of correction rules, a worked example, the
        batch itself, and a closing request to return the full text together
        with its markers (the template names ``[PAGE_XXX]`` and
        ``[END_PAGE_XXX]``).

        Args:
            batch_text: The batch to be corrected, as built by ``create_batch``.

        Returns:
            The complete prompt string.
        """
        # The template is an f-string; {batch_text} is its only placeholder.
        prompt = f"""‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° OCR ‡∏à‡∏≤‡∏Å‡∏ô‡∏¥‡∏¢‡∏≤‡∏¢‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏ô‡∏µ‡πâ

‡∏Å‡∏é‡∏Å‡∏≤‡∏£‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç:
1. ‡πÅ‡∏Å‡πâ‡∏Ñ‡∏≥‡∏ú‡∏¥‡∏î typo ‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∞‡∏Å‡∏î‡∏ú‡∏¥‡∏î
2. ‡πÅ‡∏Å‡πâ‡∏Ñ‡∏≥‡∏ó‡∏µ‡πà‡∏Ç‡∏≤‡∏î‡∏´‡∏≤‡∏¢/‡πÅ‡∏ï‡∏Å‡∏´‡∏±‡∏Å (‡πÄ‡∏ä‡πà‡∏ô "‡∏à‡∏∞‡∏≠‡∏¢‡πà‡∏≤‡∏á" "‡∏õ‡∏£‡∏∞‡∏ï‡∏π" "‡∏≤‡∏ô‡∏ä‡∏≥" ‡∏ó‡∏µ‡πà‡∏Ñ‡∏ß‡∏£‡πÄ‡∏õ‡πá‡∏ô‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏Å‡∏±‡∏ô)
3. ‡∏•‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß‡πÜ ‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏°‡∏≤‡∏¢ (‡πÄ‡∏ä‡πà‡∏ô ‡∏Å, ‡∏õ, ‡∏á, T)
4. ‡∏£‡∏±‡∏Å‡∏©‡∏≤‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤ (‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡πÉ‡∏ô‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏´‡∏°‡∏≤‡∏¢ "...")
5. ‡∏à‡∏±‡∏î paragraph ‡πÉ‡∏´‡πâ‡πÄ‡∏´‡∏°‡∏≤‡∏∞‡∏™‡∏° - ‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤‡πÅ‡∏¢‡∏Å‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î, ‡∏ö‡∏£‡∏£‡∏¢‡∏≤‡∏¢‡∏£‡∏ß‡∏°‡∏Å‡∏±‡∏ô‡πÄ‡∏õ‡πá‡∏ô paragraph
6. ‡∏Ñ‡∏á‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö markers [PAGE_XXX] ‡πÅ‡∏•‡∏∞ [END_PAGE_XXX] ‡πÑ‡∏ß‡πâ‡∏ó‡∏∏‡∏Å‡∏ï‡∏±‡∏ß
7. ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡πÉ‡∏ô‡∏ï‡πâ‡∏ô‡∏â‡∏ö‡∏±‡∏ö

‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Å‡∏≤‡∏£‡πÅ‡∏Å‡πâ:
‚ùå OCR ‡∏ú‡∏¥‡∏î:
"‡∏à‡∏∞‡∏≠‡∏¢‡πà‡∏≤‡∏á
‡∏õ‡∏£‡∏∞‡∏ï‡∏π
‡∏≤‡∏ô‡∏ä‡∏≥
‡∏Å
‡πÑ‡∏´‡∏ô"

‚úÖ ‡πÅ‡∏Å‡πâ‡πÄ‡∏õ‡πá‡∏ô:
"[‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ‡∏ó‡∏µ‡πà‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå‡∏ï‡∏≤‡∏°‡∏ö‡∏£‡∏¥‡∏ö‡∏ó]"

‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡πÅ‡∏Å‡πâ:

{batch_text}

‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡πÅ‡∏•‡πâ‡∏ß‡∏Ñ‡∏∑‡∏ô‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡∏û‡∏£‡πâ‡∏≠‡∏° markers"""

        return prompt

    def create_batch(self, max_pages: int = None) -> Tuple[str, int]:
        """‡∏™‡∏£‡πâ‡∏≤‡∏á batch ‡∏û‡∏£‡πâ‡∏≠‡∏° pre-cleaning ‡∏Ç‡∏±‡πâ‡∏ô‡∏™‡∏π‡∏á"""
        max_pages = max_pages or Config.MAX_PAGES_PER_BATCH
        files = sorted(self.input_folder.glob("*.txt"))[:max_pages]

        if not files:
            print("‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏ô folder raw_ocr/")
            return "", 0

        batch_parts = ["[START_BATCH]"]
        stats = {'total_lines': 0, 'suspicious_lines': 0, 'merged_lines': 0}

        for i, file_path in enumerate(files, 1):
            try:
                text = file_path.read_text(encoding='utf-8')

                # ‡∏ô‡∏±‡∏ö‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥‡∏Å‡πà‡∏≠‡∏ô clean
                original_lines = len(text.split('\n'))

                # Clean text
                cleaned_text = self.pre_clean_text(text)

                # ‡∏ô‡∏±‡∏ö‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥‡∏´‡∏•‡∏±‡∏á clean
                cleaned_lines = len(cleaned_text.split('\n'))
                stats['total_lines'] += original_lines
                stats['merged_lines'] += (original_lines - cleaned_lines)

                # ‡πÄ‡∏û‡∏¥‡πà‡∏° markers
                page_marker = f"[PAGE_{i:03d}]"
                end_marker = f"[END_PAGE_{i:03d}]"
                batch_parts.append(f"\n{page_marker}\n{cleaned_text}\n{end_marker}")

            except Exception as e:
                print(f"‚ö†Ô∏è Error reading {file_path.name}: {e}")
                continue

        batch_parts.append("\n[END_BATCH]")
        batch_text = ''.join(batch_parts)

        # ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å batch
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        batch_file = self.output_folder / f"batch_{timestamp}.txt"
        batch_file.write_text(batch_text, encoding='utf-8')

        print(f"‚úÖ ‡∏™‡∏£‡πâ‡∏≤‡∏á batch ‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à: {batch_file.name}")
        print(f"   üìÑ ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô: {len(files)} ‡∏´‡∏ô‡πâ‡∏≤")
        print(f"   üìä ‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥:")
        print(f"      - ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: {stats['total_lines']}")
        print(f"      - ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏ó‡∏µ‡πà‡∏£‡∏ß‡∏°: {stats['merged_lines']}")
        print(f"   üíæ ‡∏Ç‡∏ô‡∏≤‡∏î: ~{len(batch_text.split())} ‡∏Ñ‡∏≥")

        return batch_text, len(files)

    def prepare_and_copy(self, max_pages: int = None):
        """Create a batch, wrap it in the LLM prompt and hand it to the user.

        The prompt is copied to the clipboard when ``pyperclip`` was
        importable (``CLIPBOARD_AVAILABLE``); if it was not, or if copying
        raises, the prompt is written to a file through
        ``_save_prompt_to_file``. A rough size estimate and the follow-up
        steps are then printed.

        Does nothing further when ``create_batch`` reports zero pages.

        Args:
            max_pages: Forwarded unchanged to ``create_batch``.
        """
        batch_text, page_count = self.create_batch(max_pages)
        if page_count == 0:
            return

        prompt = self.create_enhanced_prompt(batch_text)

        # Clipboard first; fall back to a file when it is missing or fails.
        needs_file = not CLIPBOARD_AVAILABLE
        if CLIPBOARD_AVAILABLE:
            try:
                pyperclip.copy(prompt)
                print("\n‚úÖ Copied to clipboard!")
            except Exception as e:
                print(f"‚ö†Ô∏è Cannot copy: {e}")
                needs_file = True
        if needs_file:
            self._save_prompt_to_file(prompt)

        # Rough estimate: one token per two characters of prompt.
        estimated_tokens = len(prompt) // 2
        print(f"üìä ‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì {estimated_tokens:,} tokens")

        next_steps = (
            "\nüìù ‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ï‡πà‡∏≠‡πÑ‡∏õ:",
            "   1. ‡πÄ‡∏õ‡∏¥‡∏î ChatGPT/Claude",
            "   2. Paste prompt",
            "   3. ‡∏£‡∏≠‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå",
            "   4. Copy ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå",
            "   5. Run parse_results",
        )
        for step in next_steps:
            print(step)

    def _save_prompt_to_file(self, prompt: str):
        prompt_file = self.output_folder / "latest_prompt.txt"
        prompt_file.write_text(prompt, encoding='utf-8')
        print(f"üíæ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å prompt ‡πÑ‡∏ß‡πâ‡∏ó‡∏µ‡πà: {prompt_file}")

print("‚úÖ EnhancedBatchPreparer ready")

In [None]:
# ============================================
# Block 5: Quality Validator
# Checks text quality after the LLM correction pass
# ============================================
class QualityValidator:
    """‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û‡∏Ç‡∏≠‡∏á‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏µ‡πà‡∏ú‡πà‡∏≤‡∏ô LLM ‡πÅ‡∏•‡πâ‡∏ß"""

    @staticmethod
    def validate_text(original: str, cleaned: str) -> Dict:
        """‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û‡∏Å‡∏≤‡∏£‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç"""
        issues = []

        # 1. ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß
        len_ratio = len(cleaned) / len(original) if len(original) > 0 else 0
        if len_ratio < 0.5:
            issues.append("‚ö†Ô∏è ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡πâ‡∏ô‡∏•‡∏á‡∏°‡∏≤‡∏Å (‡∏≠‡∏≤‡∏à‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡∏•‡∏ö‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤)")
        elif len_ratio > 1.5:
            issues.append("‚ö†Ô∏è ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß‡∏Ç‡∏∂‡πâ‡∏ô‡∏°‡∏≤‡∏Å (‡∏≠‡∏≤‡∏à‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤)")

        # 2. ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤
        orig_quotes = len(re.findall(r'"[^"]*"', original))
        clean_quotes = len(re.findall(r'"[^"]*"', cleaned))
        if abs(orig_quotes - clean_quotes) > 2:
            issues.append(f"‚ö†Ô∏è ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏ö‡∏ó‡∏™‡∏ô‡∏ó‡∏ô‡∏≤‡∏ï‡πà‡∏≤‡∏á‡∏Å‡∏±‡∏ô‡∏°‡∏≤‡∏Å ({orig_quotes} -> {clean_quotes})")

        # 3. ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ä‡∏∑‡πà‡∏≠‡∏ï‡∏±‡∏ß‡∏•‡∏∞‡∏Ñ‡∏£ (‡∏ñ‡πâ‡∏≤‡∏û‡∏ö‡πÉ‡∏ô‡∏ï‡πâ‡∏ô‡∏â‡∏ö‡∏±‡∏ö)
        character_names = re.findall(r'(‡πÇ‡∏Ñ‡πÄ‡∏Æ|‡πÇ‡∏ä‡∏ï‡∏∞|‡∏≠‡∏±‡∏ï‡∏™‡∏∂‡∏¢‡∏∞)', original)
        for name in set(character_names):
            orig_count = original.count(name)
            clean_count = cleaned.count(name)
            if clean_count < orig_count * 0.8:
                issues.append(f"‚ö†Ô∏è ‡∏ä‡∏∑‡πà‡∏≠ '{name}' ‡∏´‡∏≤‡∏¢‡πÑ‡∏õ ({orig_count} -> {clean_count})")

        # 4. ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö paragraph structure
        orig_paragraphs = len([p for p in original.split('\n\n') if p.strip()])
        clean_paragraphs = len([p for p in cleaned.split('\n') if p.strip()])

        return {
            'valid': len(issues) == 0,
            'issues': issues,
            'stats': {
                'length_ratio': len_ratio,
                'dialogue_count': clean_quotes,
                'paragraph_count': clean_paragraphs,
            }
        }

    @staticmethod
    def generate_quality_report(validations: List[Dict]) -> str:
        """‡∏™‡∏£‡πâ‡∏≤‡∏á quality report"""
        report = "üìä Quality Validation Report\n"
        report += "=" * 50 + "\n\n"

        total = len(validations)
        valid = sum(1 for v in validations if v['valid'])

        report += f"‚úÖ Valid: {valid}/{total} ({valid/total*100:.1f}%)\n"
        report += f"‚ö†Ô∏è Issues found: {total - valid}\n\n"

        if total - valid > 0:
            report += "Issues detail:\n"
            for i, val in enumerate(validations):
                if not val['valid']:
                    report += f"\nPage {i+1}:\n"
                    for issue in val['issues']:
                        report += f"  {issue}\n"

        return report

print("‚úÖ QualityValidator ready")