<a href="https://colab.research.google.com/github/tanatet8/Colab_Script/blob/main/ThaiNovel_OCR_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# New cell: mount Google Drive, then create the pipeline folders
from google.colab import drive
drive.mount('/content/drive')

# Create the working folders inside Drive
import os
BASE = '/content/drive/MyDrive/OCR'  # ← your own Drive folder name

for folder in (
    'raw_ocr',
    'batches',
    'cleaned_gpt',
    'cleaned_claude',
    'final_corpus',
    'reports',
    'training_pairs',
):
    # exist_ok=True makes re-running this cell harmless
    os.makedirs(f'{BASE}/{folder}', exist_ok=True)

print("✅ Folders ready in Drive!")

Mounted at /content/drive
✅ Folders ready in Drive!


In [2]:
# ============================================
# 📌 Block 1: Setup & Import
# ============================================
import os
import re
import json
import difflib
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Probe for the optional pyperclip package and record the result
try:
    import pyperclip
except ImportError:
    # The printed notice tells the user files will be used instead
    CLIPBOARD_AVAILABLE = False
    print("⚠️ pyperclip not installed - จะใช้ไฟล์แทน clipboard")
else:
    CLIPBOARD_AVAILABLE = True

print("✅ Libraries loaded")

✅ Libraries loaded


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# ============================================
# 📌 Block 1: Setup & Import
# ============================================
import os
import re
import json
import difflib
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Probe for the optional pyperclip package and record the result
try:
    import pyperclip
except ImportError:
    # The printed notice tells the user files will be used instead
    CLIPBOARD_AVAILABLE = False
    print("⚠️ pyperclip not installed - จะใช้ไฟล์แทน clipboard")
else:
    CLIPBOARD_AVAILABLE = True

print("✅ Libraries loaded")

# ============================================
# 📌 Block 2: Enhanced Configuration
# ============================================
class Config:
    """Central configuration for the Thai-novel OCR clean-up pipeline.

    Holds the Drive folder layout, the batching limits, and the lookup
    tables consulted while pre-cleaning raw OCR text.
    """

    # Paths - everything lives under one folder on the mounted Drive
    BASE = '/content/drive/MyDrive/OCR'

    RAW_OCR_DIR = f'{BASE}/raw_ocr'
    BATCHES_DIR = f'{BASE}/batches'
    CLEANED_GPT_DIR = f'{BASE}/cleaned_gpt'
    CLEANED_CLAUDE_DIR = f'{BASE}/cleaned_claude'
    FINAL_DIR = f'{BASE}/final_corpus'
    REPORTS_DIR = f'{BASE}/reports'
    TRAINING_PAIRS_DIR = f'{BASE}/training_pairs'

    # Processing parameters
    MAX_PAGES_PER_BATCH = 20  # default number of page files per batch
    MIN_LINE_LENGTH = 3       # shorter lines are treated as OCR fragments

    # Literal substring replacements, applied one after another in dict
    # order (see EnhancedBatchPreparer.pre_clean_text), so ORDER MATTERS.
    OCR_REPLACEMENTS = {
        # Common OCR errors: split vowel glyphs recombined
        'เเ': 'แ',
        'ํา': 'ำ',
        'ํ า': 'ำ',
        # Whitespace: the three-space rule must run BEFORE the two-space
        # rule, otherwise the two-space rule rewrites every triple first
        # and the three-space rule can never match.
        '   ': ' ',
        '  ': ' ',
        '\t': ' ',

        # Punctuation fixes: drop stray spaces around ๆ, quotes, , and .
        ' ๆ ': 'ๆ ',
        'ๆ ': 'ๆ',
        ' ๆ': 'ๆ',
        ' "': '"',
        '" ': '"',
        ' ,': ',',
        ' .': '.',

        # Common Thai novel terms that OCR misreads
        'พวกเขๅ': 'พวกเขา',
        'ทํา': 'ทำ',
        'จๅก': 'จาก',
        'ดู่': 'ดู',
    }

    # Regexes matched against a stripped line; any match marks the line as
    # a suspicious fragment (see NovelTextAnalyzer.is_incomplete_line).
    SUSPICIOUS_PATTERNS = [
        r'^[ก-ฮ]$',      # a single Thai consonant
        r'^[a-zA-Z]$',   # a single Latin letter
        r'^.{1,2}$',     # any line of one or two characters
        r'^\d+$',        # digits only
    ]

# Status message printed once this cell has defined the Config class.
print("✅ Config loaded")

In [None]:
# ============================================
# 📌 Block 3: Novel Text Analyzer
# ============================================
class NovelTextAnalyzer:
    """Heuristics for detecting and repairing OCR line-break damage in Thai novels.

    Every method is static; the class only serves as a namespace.
    """

    # A line counts as dialogue when it starts or ends with a straight-quoted
    # span, or when a quoted span is followed by one of the listed Thai
    # speech verbs (say, speak, answer, ask, cry out, grumble).
    _DIALOGUE_PATTERNS = tuple(
        re.compile(p) for p in (
            r'^".*"',
            r'".*"$',
            r'".*".*กล่าว',
            r'".*".*พูด',
            r'".*".*ตอบ',
            r'".*".*ถาม',
            r'".*".*ร้อง',
            r'".*".*บ่น',
        )
    )

    # Thai vowel signs and tone marks; a line containing none of them is
    # treated as an OCR fragment.
    _THAI_VOWELS = 'ะาิีึืุูเแโใไ็่้๊๋ำ'

    # A previous line ending in one of these is considered finished, so a
    # following fragment is not glued onto it.
    _SENTENCE_ENDINGS = ('.', '!', '?', '"')

    @staticmethod
    def is_dialogue(text: str) -> bool:
        """Return True if *text* matches any of the dialogue patterns."""
        return any(p.search(text) for p in NovelTextAnalyzer._DIALOGUE_PATTERNS)

    @staticmethod
    def is_incomplete_line(text: str) -> bool:
        """Return True if *text* looks like a broken-off OCR fragment.

        A line is a fragment when it is shorter than
        ``Config.MIN_LINE_LENGTH``, when its stripped form matches one of
        ``Config.SUSPICIOUS_PATTERNS``, or when it contains no Thai vowel
        sign or tone mark at all.
        """
        if len(text) < Config.MIN_LINE_LENGTH:
            return True
        stripped = text.strip()
        if any(re.match(p, stripped) for p in Config.SUSPICIOUS_PATTERNS):
            return True
        return not any(v in text for v in NovelTextAnalyzer._THAI_VOWELS)

    @staticmethod
    def should_merge_lines(prev_line: str, curr_line: str) -> bool:
        """Return True if *curr_line* should continue *prev_line*'s paragraph.

        Lines are merged when the previous line is non-empty and does not end
        in ``.``, ``!``, ``?`` or a space, and the current line is neither
        dialogue nor indented (two spaces or a tab).
        """
        if not prev_line or prev_line[-1] in '.!? ':
            return False
        if NovelTextAnalyzer.is_dialogue(curr_line):
            return False
        return not curr_line.startswith(('  ', '\t'))

    @staticmethod
    def fix_broken_words(text: str) -> str:
        """Re-attach fragments that OCR split onto their own lines.

        Each line is stripped. A fragment (see ``is_incomplete_line``) is
        appended to the previous kept line unless that line already ends a
        sentence; otherwise it is prefixed to the following line unless that
        line is dialogue. Blank lines are paragraph breaks: they are never
        treated as fragments, runs of them collapse to a single empty line,
        and leading/trailing ones are dropped, so a later paragraph pass can
        still see where paragraphs end.

        Args:
            text: Raw page text with ``\\n`` line separators.

        Returns:
            The repaired text, lines joined with ``\\n``.
        """
        lines = [raw.strip() for raw in text.split('\n')]
        fixed_lines: List[str] = []
        i = 0
        while i < len(lines):
            curr_line = lines[i]

            if not curr_line:
                # Paragraph break: keep exactly one, and never at the start.
                if fixed_lines and fixed_lines[-1]:
                    fixed_lines.append('')
                i += 1
                continue

            if NovelTextAnalyzer.is_incomplete_line(curr_line):
                # Try gluing backwards first; an empty `prev` means we are at
                # the start of the text or right after a paragraph break.
                prev = fixed_lines[-1] if fixed_lines else ''
                if prev and not prev.endswith(NovelTextAnalyzer._SENTENCE_ENDINGS):
                    fixed_lines[-1] = prev + curr_line
                    i += 1
                    continue
                # Otherwise glue forwards onto the next non-blank,
                # non-dialogue line and skip over it.
                next_line = lines[i + 1] if i + 1 < len(lines) else ''
                if next_line and not NovelTextAnalyzer.is_dialogue(next_line):
                    fixed_lines.append(curr_line + next_line)
                    i += 2
                    continue

            fixed_lines.append(curr_line)
            i += 1

        # Drop a trailing paragraph break left by a final blank line.
        while fixed_lines and not fixed_lines[-1]:
            fixed_lines.pop()
        return '\n'.join(fixed_lines)

# ============================================
# 📌 Block 4: Enhanced BatchPreparer
# ============================================
class EnhancedBatchPreparer:
    """Combine raw OCR page files into one marker-delimited batch for an LLM.

    Each page is pre-cleaned and wrapped in ``[PAGE_NNN]`` /
    ``[END_PAGE_NNN]`` markers; the batch is saved to the output folder and
    can be embedded in a Thai-language correction prompt.
    """

    def __init__(self, input_folder=None, output_folder=None):
        """Resolve and create the input/output folders.

        Args:
            input_folder: Folder holding the raw ``*.txt`` pages; defaults
                to ``Config.RAW_OCR_DIR``.
            output_folder: Folder receiving batch and prompt files; defaults
                to ``Config.BATCHES_DIR``.
        """
        self.input_folder = Path(input_folder or Config.RAW_OCR_DIR)
        self.output_folder = Path(output_folder or Config.BATCHES_DIR)
        # parents=True so a missing base folder does not raise FileNotFoundError
        self.input_folder.mkdir(parents=True, exist_ok=True)
        self.output_folder.mkdir(parents=True, exist_ok=True)
        self.analyzer = NovelTextAnalyzer()

    def pre_clean_text(self, text: str) -> str:
        """Apply the rule-based clean-up to one page of raw OCR text.

        Runs the ``Config.OCR_REPLACEMENTS`` substitutions in order, repairs
        broken lines, regroups lines into paragraphs, then collapses runs of
        blank lines and spaces.
        """
        for old, new in Config.OCR_REPLACEMENTS.items():
            text = text.replace(old, new)
        text = self.analyzer.fix_broken_words(text)
        text = self._smart_paragraph_split(text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()

    def _smart_paragraph_split(self, text: str) -> str:
        """Regroup lines into paragraphs, one paragraph per output line.

        A blank line always ends the current paragraph. A dialogue line
        starts a new paragraph unless the previous line was dialogue too; a
        narration line starts a new paragraph unless
        ``should_merge_lines`` says it continues the previous line. Lines
        inside a paragraph are joined with single spaces.
        """
        paragraphs: List[str] = []
        current_para: List[str] = []

        def flush() -> None:
            # Emit the paragraph collected so far, if any.
            if current_para:
                paragraphs.append(' '.join(current_para))
                current_para.clear()

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                flush()
                continue
            if current_para:
                last = current_para[-1]
                if self.analyzer.is_dialogue(line):
                    starts_new = not self.analyzer.is_dialogue(last)
                else:
                    starts_new = not self.analyzer.should_merge_lines(last, line)
                if starts_new:
                    flush()
            current_para.append(line)

        flush()
        return '\n'.join(paragraphs)

    def create_enhanced_prompt(self, batch_text: str) -> str:
        """Wrap *batch_text* in the Thai correction instructions for the LLM.

        The instructions ask the model to fix typos and broken words, remove
        meaningless single characters, preserve dialogue and paragraphs, keep
        every page marker, and add no new content.
        """
        prompt = f"""กรุณาแก้ไขข้อความ OCR จากนิยายภาษาไทยต่อไปนี้

กฎการแก้ไข:
1. แก้คำผิด typo และการสะกดผิด
2. แก้คำที่ขาดหาย/แตกหัก
3. ลบตัวอักษรเดี่ยวๆ ที่ไม่มีความหมาย
4. รักษารูปแบบบทสนทนา (คำพูดในเครื่องหมาย "...")
5. จัด paragraph ให้เหมาะสม
6. คงรูปแบบ markers [PAGE_XXX] และ [END_PAGE_XXX] ไว้ทุกตัว
7. ห้ามเพิ่มเนื้อหาที่ไม่มีในต้นฉบับ

ข้อความที่ต้องแก้:

{batch_text}

กรุณาแก้ไขแล้วคืนข้อความทั้งหมดพร้อม markers"""
        return prompt

    def create_batch(self, max_pages: Optional[int] = None) -> Tuple[str, int]:
        """Build one batch from the first *max_pages* files and save it.

        Files are taken in sorted name order. A file that cannot be read is
        reported and skipped; its page number is left unused so the markers
        still correspond to the file's position in the sorted list.

        Args:
            max_pages: Maximum number of page files to include; falls back to
                ``Config.MAX_PAGES_PER_BATCH`` when ``None`` or 0.

        Returns:
            ``(batch_text, page_count)`` where ``page_count`` is the number of
            pages actually included. ``("", 0)`` when there are no input
            files or none of them could be read; no batch file is written
            in that case.
        """
        max_pages = max_pages or Config.MAX_PAGES_PER_BATCH
        files = sorted(self.input_folder.glob("*.txt"))[:max_pages]

        if not files:
            print("❌ ไม่พบไฟล์ใน folder raw_ocr/")
            return "", 0

        batch_parts = ["[START_BATCH]"]
        pages_added = 0

        for i, file_path in enumerate(files, 1):
            # Only the read is guarded: that is the step expected to fail
            # (missing permissions, non-UTF-8 bytes).
            try:
                text = file_path.read_text(encoding='utf-8')
            except (OSError, UnicodeError) as e:
                print(f"⚠️ Error reading {file_path.name}: {e}")
                continue

            cleaned_text = self.pre_clean_text(text)
            page_marker = f"[PAGE_{i:03d}]"
            end_marker = f"[END_PAGE_{i:03d}]"
            batch_parts.append(f"\n{page_marker}\n{cleaned_text}\n{end_marker}")
            pages_added += 1

        if pages_added == 0:
            # Every file failed: do not write or return a marker-only batch.
            print("❌ No readable pages - batch not created")
            return "", 0

        batch_parts.append("\n[END_BATCH]")
        batch_text = ''.join(batch_parts)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        batch_file = self.output_folder / f"batch_{timestamp}.txt"
        batch_file.write_text(batch_text, encoding='utf-8')

        print(f"✅ สร้าง batch สำเร็จ: {batch_file.name}")
        print(f"   📄 จำนวน: {pages_added} หน้า")
        print(f"   💾 ขนาด: ~{len(batch_text.split())} คำ")

        return batch_text, pages_added

    def prepare_and_copy(self, max_pages: Optional[int] = None):
        """Create a batch, save the full prompt, and print the next steps.

        Despite the name nothing is placed on the clipboard: the prompt is
        written to ``latest_prompt.txt`` in the output folder for the user to
        copy by hand. Does nothing when no batch could be created.
        """
        batch_text, page_count = self.create_batch(max_pages)
        if page_count == 0:
            return

        prompt = self.create_enhanced_prompt(batch_text)
        # Rough size estimate: two characters per token.
        estimated_tokens = len(prompt) // 2

        prompt_file = self.output_folder / "latest_prompt.txt"
        prompt_file.write_text(prompt, encoding='utf-8')
        print(f"💾 บันทึก prompt ไว้ที่: {prompt_file}")
        print(f"📊 ประมาณ {estimated_tokens:,} tokens")
        print(f"\n📝 ขั้นตอนต่อไป:")
        print("   1. เปิดไฟล์ prompt ใน Drive")
        print("   2. Copy ไปใส่ ChatGPT/Claude")
        print("   3. Copy ผลลัพธ์กลับมา")
        print("   4. Run menu option 2")

# Status message printed once this cell has defined the analyzer and batch preparer classes.
print("✅ Batch Preparer ready")

In [None]:
# ============================================
# 📌 Block 5: Quality Validator
# ============================================
class QualityValidator:
    """Sanity checks comparing an LLM-cleaned text against its OCR original."""

    @staticmethod
    def validate_text(original: str, cleaned: str) -> Dict:
        """Flag suspicious differences between *original* and *cleaned*.

        Two checks are made: the cleaned text must be between 0.5x and 1.5x
        the original's length, and the number of straight-quoted spans may
        differ by at most two.

        Args:
            original: Text before LLM correction.
            cleaned: Text returned by the LLM.

        Returns:
            A dict with ``valid`` (True when no issue was found), ``issues``
            (list of warning strings) and ``stats`` (``length_ratio`` and
            ``dialogue_count`` of the cleaned text). For an empty original
            the ratio is 1.0 if the cleaned text is empty too, otherwise
            ``float('inf')``.
        """
        issues = []

        if original:
            len_ratio = len(cleaned) / len(original)
        else:
            # No baseline to divide by: empty -> empty is unchanged, while
            # empty -> something means text appeared, not disappeared.
            len_ratio = float('inf') if cleaned else 1.0

        if len_ratio < 0.5:
            issues.append("⚠️ ข้อความสั้นลงมาก")
        elif len_ratio > 1.5:
            issues.append("⚠️ ข้อความยาวขึ้นมาก")

        quoted_span = r'"[^"]*"'
        orig_quotes = len(re.findall(quoted_span, original))
        clean_quotes = len(re.findall(quoted_span, cleaned))
        if abs(orig_quotes - clean_quotes) > 2:
            issues.append(f"⚠️ จำนวนบทสนทนาต่างกัน ({orig_quotes} -> {clean_quotes})")

        return {
            'valid': not issues,
            'issues': issues,
            'stats': {
                'length_ratio': len_ratio,
                'dialogue_count': clean_quotes,
            }
        }

# ============================================
# 📌 Block 6: Enhanced Result Parser
# ============================================
class EnhancedResultParser:
    """Split an LLM's corrected batch back into one file per page."""

    # Page numbers are written with ``:03d``, which is a MINIMUM width, so
    # pages >= 1000 have four or more digits; ``\d{3,}`` accepts them. The
    # back-reference ties each end marker to its own start marker.
    _PAGE_BLOCK = re.compile(r'\[PAGE_(\d{3,})\](.*?)\[END_PAGE_\1\]', re.DOTALL)

    def __init__(self, output_folder=None, report_folder=None, llm_type=None):
        """Choose the output folder for the LLM that produced the result.

        Args:
            output_folder: Folder for the cleaned pages; defaults to
                ``Config.CLEANED_CLAUDE_DIR`` when the LLM type is ``'2'``,
                else ``Config.CLEANED_GPT_DIR``.
            report_folder: Folder for reports; defaults to
                ``Config.REPORTS_DIR``.
            llm_type: ``'1'`` for GPT or ``'2'`` for Claude. When ``None``
                (the default) the user is asked interactively.
        """
        if llm_type is None:
            llm_type = input("ผลลัพธ์จาก [1] GPT หรือ [2] Claude? (1/2): ")
        llm_type = str(llm_type).strip()

        if llm_type == '2':
            self.output_folder = Path(output_folder or Config.CLEANED_CLAUDE_DIR)
        else:
            self.output_folder = Path(output_folder or Config.CLEANED_GPT_DIR)

        self.report_folder = Path(report_folder or Config.REPORTS_DIR)
        # parents=True so a missing base folder does not raise FileNotFoundError
        self.output_folder.mkdir(parents=True, exist_ok=True)
        self.report_folder.mkdir(parents=True, exist_ok=True)
        self.validator = QualityValidator()

    def extract_pages(self, llm_output: str) -> Dict[int, str]:
        """Return ``{page_number: stripped_content}`` for every marked page.

        Only spans enclosed by a matching ``[PAGE_N]`` / ``[END_PAGE_N]``
        pair are returned; if a page number occurs twice the later span wins.
        """
        return {
            int(page_num): content.strip()
            for page_num, content in self._PAGE_BLOCK.findall(llm_output)
        }

    def save_pages(self, pages: Dict[int, str]) -> int:
        """Write each page to ``page_NNN_clean_<timestamp>.txt``.

        A page that cannot be written is reported and skipped.

        Returns:
            The number of files written successfully.
        """
        saved = 0
        # One timestamp for the whole call so the files of a run group together.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        for page_num, content in pages.items():
            filename = f"page_{page_num:03d}_clean_{timestamp}.txt"
            filepath = self.output_folder / filename
            try:
                filepath.write_text(content, encoding='utf-8')
            except (OSError, UnicodeError) as e:
                print(f"⚠️ Error saving {filename}: {e}")
            else:
                saved += 1
        return saved

    def parse_from_clipboard(self):
        """Read the LLM result from ``llm_output.txt`` and save its pages.

        Despite the name, the text is read from a file named
        ``llm_output.txt`` in the current working directory, not from the
        clipboard. Prints a message and returns when the file is missing or
        contains no page markers.
        """
        print("📝 กรุณา paste ผลลัพธ์ในไฟล์ 'llm_output.txt'")
        try:
            llm_output = Path("llm_output.txt").read_text(encoding='utf-8')
        except FileNotFoundError:
            print("❌ ไม่พบไฟล์ llm_output.txt")
            return

        print(f"📋 รับข้อความ {len(llm_output)} characters")
        pages = self.extract_pages(llm_output)

        if not pages:
            print("❌ ไม่พบ page markers")
            return

        print(f"✅ พบ {len(pages)} หน้า")
        saved = self.save_pages(pages)
        print(f"💾 บันทึก {saved} ไฟล์ไปที่ {self.output_folder}/")

# Status message printed once this cell has defined the validator and parser classes.
print("✅ Parser ready")

In [None]:
# ============================================
# 📌 Block 7: Main Menu
# ============================================
def main_menu():
    """Show the interactive menu repeatedly until the user selects Exit (6).

    Options 1 and 2 run the batch preparer and the result parser, option 4
    prints the number of raw OCR files, and options 3 and 5 are placeholders.
    After every action except Exit the user is asked to press Enter.
    """
    banner = """
╔════════════════════════════════════════════╗
║   ENHANCED OCR SCRIPTS - THAI NOVEL v2.0   ║
╚════════════════════════════════════════════╝

[1] 📦 Prepare Batch - รวมไฟล์พร้อม smart cleaning
[2] 📋 Parse Results - แยกผลพร้อม quality check
[3] 🔍 Compare Versions - เปรียบเทียบ GPT vs Claude
[4] 📊 Show Statistics - ดูสถิติ corpus
[5] 🔧 Test Analyzer - ทดสอบ text analyzer
[6] ❌ Exit

        """

    def run_batch_preparer():
        print("\n🚀 Running Enhanced Batch Preparer...")
        print("-" * 40)
        EnhancedBatchPreparer().prepare_and_copy()

    def run_result_parser():
        print("\n🚀 Running Enhanced Result Parser...")
        print("-" * 40)
        EnhancedResultParser().parse_from_clipboard()

    def compare_versions():
        print("🔍 Compare feature - Coming soon!")

    def show_statistics():
        print("\n📊 Statistics:")
        raw = sum(1 for _ in Path(Config.RAW_OCR_DIR).glob("*.txt"))
        print(f"Raw OCR files: {raw}")

    def test_analyzer():
        print("🔧 Test Analyzer - Coming soon!")

    # Menu key -> action; Exit is handled separately because it ends the loop
    actions = {
        '1': run_batch_preparer,
        '2': run_result_parser,
        '3': compare_versions,
        '4': show_statistics,
        '5': test_analyzer,
    }

    while True:
        print(banner)
        selection = input("Select (1-6): ").strip()

        if selection == '6':
            print("\n👋 Goodbye!")
            return

        action = actions.get(selection)
        if action is None:
            print("❌ Invalid choice")
            continue

        action()
        input("\nPress Enter to continue...")

# Status messages printed when this cell runs; they tell the user to call main_menu() to start.
print("✅ Main menu ready!")
print("\n🎯 Run: main_menu() to start")