<a href="https://colab.research.google.com/github/tanatet8/Colab_Script/blob/main/Add%20redundancy%20checker%20for%2046-type%20dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# REDUNDANCY CHECKER FIXED VERSION
# Fixes tier values such as '13B' and other errors
# ============================================

# ============================================
# 📌 Block 1: Setup & Mount
# Install libraries + mount Google Drive
# ============================================
# Mount Google Drive so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Notebook shell magic: quietly install the third-party packages used below.
!pip install -q pandas numpy scikit-learn matplotlib seaborn openpyxl tqdm

import pandas as pd
import numpy as np
from pathlib import Path
import re
import json
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

print("✅ Libraries loaded")

In [None]:
# ============================================
# 📌 Block 2: Configuration
# Set the main parameters such as paths, batch size and thresholds
# ============================================
class Config:
    """Settings for the redundancy checker: paths, batching and thresholds."""

    # Paths - edit these to point at the dataset and at where results are saved
    DATASET_DIR = '/content/drive/MyDrive/Dataset_Curation'
    OUTPUT_DIR = '/content/drive/MyDrive/Dataset_Curation/redundancy_reports'

    # Processing
    BATCH_SIZE = 500  # number of prompts processed per batch
    MAX_PROMPTS = None  # None = use everything, or set a number to cap the load

    # Redundancy Thresholds by Type & Tier
    THRESHOLDS = {
        # Type: [Tier1-2, Tier3-4, Tier5-6]
        'causal_reasoning': [0.75, 0.60, 0.40],
        'symbolic_reasoning': [0.70, 0.55, 0.35],
        'meta_reasoning': [0.60, 0.45, 0.30],
        'moral_ambiguity_tradeoff': [0.50, 0.35, 0.25],
        'philosophical_logic': [0.45, 0.30, 0.20],
        # Default for any other type
        'default': [0.65, 0.50, 0.35]
    }

print("✅ Config loaded")

In [None]:
# ============================================
# 📌 Block 3: Data Extraction
# Load the markdown files and extract metadata and prompts
# ============================================
class DataExtractor:
    """Load prompt batch markdown files and turn them into prompt records."""

    # Tier used whenever the metadata does not carry a usable 1-6 value.
    _DEFAULT_TIER = '2'

    @staticmethod
    def parse_block(block_text):
        """Extract the fields of a single prompt block.

        Args:
            block_text: Markdown text that follows one ``## Prompt N`` heading.

        Returns:
            dict with one entry per ``key: value`` line of the
            ``### Metadata`` section, plus ``prompt_th`` / ``prompt_en`` /
            ``prompt_zh`` and ``reasoning`` for each of those sections that
            is present. Missing sections simply produce no key.
        """
        data = {}

        # Metadata section: every "key: value" line becomes a dict entry.
        meta_match = re.search(r'###\s*Metadata\s*\n(.*?)(?=\n###|\n##|$)',
                               block_text, re.DOTALL)
        if meta_match:
            for line in meta_match.group(1).split('\n'):
                key, sep, value = line.partition(':')
                if sep:
                    data[key.strip()] = value.strip()

        # Prompts (TH, EN, ZH)
        for lang in ['TH', 'EN', 'ZH']:
            pattern = rf'###?\s*Prompt\s*\({lang}\)\s*\n(.*?)(?=\n###|\n##|$)'
            match = re.search(pattern, block_text, re.DOTALL)
            if match:
                data[f'prompt_{lang.lower()}'] = match.group(1).strip()

        # Reasoning: stops at the next "##"/"###" heading, exactly like the
        # sections above, so a trailing "## ..." section is not swallowed.
        reason_match = re.search(r'###\s*Reasoning\s*\n(.*?)(?=\n###|\n##|$)',
                                 block_text, re.DOTALL)
        if reason_match:
            data['reasoning'] = reason_match.group(1).strip()

        return data

    @staticmethod
    def _normalize_tier(tier_val):
        """Return ``(tier, model_size)`` for a raw metadata tier value.

        ``tier`` is always one of the strings '1'..'6'. ``model_size`` is a
        model-size label when the raw value looked like one (contains a 'B',
        or is a number above 6), otherwise ``None``.
        """
        default = DataExtractor._DEFAULT_TIER
        text = str(tier_val)

        # Values such as '13B' are model sizes that ended up in the tier field.
        if 'B' in text.upper():
            return default, tier_val
        if not text.isdigit():
            return default, None

        try:
            tier_int = int(text)
        except ValueError:
            # str.isdigit() accepts characters (e.g. superscripts) int() rejects.
            return default, None

        if tier_int > 6:
            return default, f"{tier_int}B"
        if tier_int < 1:
            return default, None
        return str(tier_int), None

    @staticmethod
    def load_all_files(dataset_dir, max_prompts=None):
        """Load every ``*_batch_*.md`` file found directly in ``dataset_dir``.

        Each file is split on ``## Prompt N`` headings and every block is
        parsed with :meth:`parse_block`. Records additionally get ``file``
        (file name) and ``prompt_id`` (file stem + position in the file), and
        a ``tier`` value, when present, is normalized to '1'..'6'.

        Args:
            dataset_dir: Directory containing the batch markdown files.
            max_prompts: Stop after this many prompts; a falsy value
                (``None`` or 0) loads everything.

        Returns:
            pandas.DataFrame with one row per prompt (no columns when no
            prompt was found).
        """
        all_prompts = []
        md_files = sorted(Path(dataset_dir).glob('*_batch_*.md'))

        print(f"📂 พบ {len(md_files)} batch files")

        for file_path in tqdm(md_files, desc="Loading files"):
            content = file_path.read_text(encoding='utf-8')

            # Split on "## Prompt N"; the text before the first heading is dropped.
            blocks = re.split(r'##\s*Prompt\s+\d+', content)[1:]

            for i, block in enumerate(blocks):
                if max_prompts and len(all_prompts) >= max_prompts:
                    break

                prompt_data = DataExtractor.parse_block(block)
                prompt_data['file'] = file_path.name
                prompt_data['prompt_id'] = f"{file_path.stem}_p{i+1:03d}"

                # Clean tier value (handles the '13B' case).
                if 'tier' in prompt_data:
                    tier, model_size = DataExtractor._normalize_tier(prompt_data['tier'])
                    prompt_data['tier'] = tier
                    if model_size is not None:
                        prompt_data['model_size'] = model_size

                all_prompts.append(prompt_data)

            if max_prompts and len(all_prompts) >= max_prompts:
                break

        return pd.DataFrame(all_prompts)

print("✅ Data Extractor ready")

In [None]:
# ============================================
# 📌 Block 4: Redundancy Analysis - FIXED
# Analyze prompt redundancy with TF-IDF + cosine similarity
# ============================================
class RedundancyAnalyzer:
    """Detect near-duplicate prompts with TF-IDF cosine similarity.

    The analyzer keeps a reference to the DataFrame it is given and, when a
    ``tier`` column exists, rewrites that column in place with integer tiers
    in the range 1-6.
    """

    # Tier assumed whenever a value is missing or cannot be interpreted.
    _DEFAULT_TIER = 2

    def __init__(self, df, config):
        """Store the data and config and normalize the ``tier`` column.

        Args:
            df: DataFrame of prompt records (modified in place, see class doc).
            config: Object exposing ``THRESHOLDS`` and ``BATCH_SIZE``.
        """
        self.df = df
        self.config = config
        self.similarity_matrices = {}  # not populated by any method of this class

        # Clean tier column
        if 'tier' in self.df.columns:
            self.df['tier'] = self.df['tier'].apply(self.clean_tier_value)

    def clean_tier_value(self, tier_val):
        """Return ``tier_val`` as an int in 1..6, or the default tier (2).

        The default is used for missing values, values containing a 'B'
        (model sizes such as '13B'), values without any digits, and numbers
        outside 1..6. Otherwise the first run of digits is used.
        """
        if pd.isna(tier_val):
            return self._DEFAULT_TIER

        tier_str = str(tier_val)

        # Handle '13B' or similar cases
        if 'B' in tier_str.upper():
            return self._DEFAULT_TIER

        digits = re.search(r'\d+', tier_str)
        if digits is None:
            return self._DEFAULT_TIER

        tier_num = int(digits.group())
        return tier_num if 1 <= tier_num <= 6 else self._DEFAULT_TIER

    def get_threshold(self, reasoning_type, tier):
        """Look up the similarity threshold for a reasoning type and tier.

        Tiers 1-2, 3-4 and 5-6 map to the first, second and third entry of
        the type's threshold list. Tiers that cannot be converted to int use
        the default tier; tiers below 1 or above 6 are clamped to the first
        or last entry. Unknown types use ``THRESHOLDS['default']``.
        """
        try:
            tier = int(tier)
        except (TypeError, ValueError, OverflowError):
            tier = self._DEFAULT_TIER

        # Clamp on both sides: a tier below 1 would otherwise give a negative
        # index and silently select an entry from the end of the list.
        tier_idx = min(max((tier - 1) // 2, 0), 2)

        table = self.config.THRESHOLDS
        thresholds = table.get(reasoning_type, table['default'])
        return thresholds[tier_idx]

    def calculate_similarity_batch(self, df_batch, text_col='prompt_en'):
        """Compute pairwise similarity inside one batch and list redundant pairs.

        Args:
            df_batch: Slice of the prompt DataFrame.
            text_col: Column holding the text to compare. Falls back to
                ``prompt_th`` when the column is missing or entirely empty.

        Returns:
            ``(sim_matrix, pairs)``. ``sim_matrix`` is the cosine-similarity
            matrix of the rows that have text; ``pairs`` is a list of dicts
            for every pair whose similarity reaches the threshold of the
            first prompt's type at the higher of the two tiers. ``idx1`` and
            ``idx2`` are positions within the filtered batch.
            ``(None, [])`` is returned when no text column is available,
            fewer than two rows have text, or vectorization fails.
        """
        # Use prompt_th as fallback if prompt_en doesn't exist
        if text_col not in df_batch.columns or df_batch[text_col].isna().all():
            text_col = 'prompt_th'
        if text_col not in df_batch.columns:
            return None, []

        valid_df = df_batch[df_batch[text_col].notna()].reset_index(drop=True)
        n_rows = len(valid_df)
        if n_rows < 2:
            return None, []

        # TF-IDF
        try:
            vectorizer = TfidfVectorizer(
                max_features=500,
                ngram_range=(1, 3),
                min_df=1,  # Changed from 2 to 1 for small datasets
                max_df=0.95
            )
            tfidf_matrix = vectorizer.fit_transform(valid_df[text_col])
            sim_matrix = cosine_similarity(tfidf_matrix)
        except Exception as e:
            # Best effort: report the problem and skip this batch.
            print(f"⚠️ Error in TF-IDF: {e}")
            return None, []

        # Pull the per-row values out once instead of calling iloc per pair.
        records = valid_df.to_dict('records')
        tiers = [int(rec.get('tier', 2)) for rec in records]
        types = [rec.get('reasoning_type', 'default') for rec in records]

        # Collect the pairs that reach their threshold.
        pairs = []
        for i in range(n_rows - 1):
            for j in range(i + 1, n_rows):
                threshold = self.get_threshold(types[i], max(tiers[i], tiers[j]))
                score = sim_matrix[i][j]

                if score >= threshold:
                    pairs.append({
                        'idx1': i,
                        'idx2': j,
                        'prompt1_id': records[i]['prompt_id'],
                        'prompt2_id': records[j]['prompt_id'],
                        'similarity': score,
                        'threshold_used': threshold,
                        'type1': types[i],
                        'type2': types[j],
                        'tier1': tiers[i],
                        'tier2': tiers[j],
                        'prompt1': records[i][text_col][:100],
                        'prompt2': records[j][text_col][:100]
                    })
        return sim_matrix, pairs

    def analyze_all(self):
        """Run the similarity check over the whole dataset, batch by batch.

        Rows are processed in consecutive slices of ``config.BATCH_SIZE``;
        prompts are only compared with other prompts of the same slice.

        Returns:
            List of redundant-pair dicts from all batches.
        """
        all_pairs = []
        batch_size = self.config.BATCH_SIZE
        n_batches = (len(self.df) + batch_size - 1) // batch_size

        for batch_idx in tqdm(range(n_batches), desc="Analyzing batches"):
            start = batch_idx * batch_size
            end = min(start + batch_size, len(self.df))
            df_batch = self.df.iloc[start:end]
            _, pairs = self.calculate_similarity_batch(df_batch)
            all_pairs.extend(pairs)

        return all_pairs

    def get_distribution_stats(self):
        """Compute value-count statistics for the categorical columns.

        Returns:
            dict keyed by column name (only columns present in the data) with
            ``distribution``, ``unique``, ``max``, ``min``, ``std`` and
            ``imbalance_ratio`` (largest count divided by smallest count).
        """
        stats = {}
        categories = ['reasoning_type', 'sub_type', 'domain_context', 'difficulty', 'tier']

        for cat in categories:
            if cat not in self.df.columns:
                continue
            value_counts = self.df[cat].value_counts()
            largest = value_counts.max()
            smallest = value_counts.min()
            stats[cat] = {
                'distribution': value_counts.to_dict(),
                'unique': len(value_counts),
                'max': largest,
                'min': smallest,
                'std': value_counts.std(),
                'imbalance_ratio': largest / smallest if smallest > 0 else float('inf')
            }
        return stats

print("✅ Redundancy Analyzer ready")

In [None]:
# ============================================
# 📌 Block 5: Fix Suggestions
# Generate fix suggestions for redundant prompt pairs
# ============================================
class RedundancyFixer:
    """Rule-based advice on how to de-duplicate similar prompt pairs."""

    @staticmethod
    def suggest_fixes(similar_pairs):
        """Attach a list of applicable fixes to every redundant pair.

        Rules, applied in this order:
          * similarity above 0.9            -> ``domain_shift`` (high)
          * both prompts on the same tier   -> ``complexity_change`` (medium)
          * similarity in (0.7, 0.9]        -> ``angle_change`` (medium)

        Args:
            similar_pairs: Iterable of pair dicts with at least
                ``similarity``, ``tier1`` and ``tier2``.

        Returns:
            List of ``{'pair': <pair>, 'fixes': [<fix>, ...]}`` dicts in the
            input order; ``fixes`` may be empty.
        """

        def build_options(entry):
            options = []
            score = entry['similarity']

            # Near-identical prompts need a completely different setting.
            if score > 0.9:
                options.append({
                    'method': 'domain_shift',
                    'priority': 'high',
                    'description': 'Change domain context completely'
                })

            # Same tier on both sides: vary the difficulty instead.
            if entry['tier1'] == entry['tier2']:
                options.append({
                    'method': 'complexity_change',
                    'priority': 'medium',
                    'description': f'Adjust complexity (current tier: {entry["tier1"]})'
                })

            # Moderately similar prompts can be told apart by reframing.
            if 0.7 < score <= 0.9:
                options.append({
                    'method': 'angle_change',
                    'priority': 'medium',
                    'description': 'Change questioning angle or framing'
                })

            return options

        return [
            {'pair': entry, 'fixes': build_options(entry)}
            for entry in similar_pairs
        ]

print("✅ Fixer ready")

In [None]:
# ============================================
# 📌 Block 6: Report Generation
# Build the full Excel report + find coverage gaps
# ============================================
class ReportGenerator:
    """Write the Excel report and compute coverage gaps."""

    @staticmethod
    def create_excel_report(df, similar_pairs, stats, suggestions, output_path):
        """Write the analysis results to an Excel workbook.

        Sheets: ``Overview``, ``Similar_Pairs`` (only when pairs exist, sorted
        by similarity), ``Distribution``, ``Fix_Suggestions`` (fixes for the
        100 most similar pairs, only when any exist) and ``Gaps`` (only when
        any exist).

        Args:
            df: DataFrame of prompt records.
            similar_pairs: Redundant-pair dicts; each has ``similarity``.
            stats: Per-category statistics; each value has ``distribution``.
            suggestions: ``{'pair': ..., 'fixes': [...]}`` dicts.
            output_path: Destination ``.xlsx`` path.
        """

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            # Sheet 1: Overview
            overview_data = {
                'Metric': ['Total Prompts', 'Redundant Pairs', 'Avg Similarity',
                          'Types Count', 'Tiers Range'],
                'Value': [
                    len(df),
                    len(similar_pairs),
                    np.mean([p['similarity'] for p in similar_pairs]) if similar_pairs else 0,
                    df['reasoning_type'].nunique() if 'reasoning_type' in df.columns else 0,
                    f"{df['tier'].min()}-{df['tier'].max()}" if 'tier' in df.columns else 'N/A'
                ]
            }
            pd.DataFrame(overview_data).to_excel(writer, sheet_name='Overview', index=False)

            # Sheet 2: Similar Pairs
            if similar_pairs:
                df_pairs = pd.DataFrame(similar_pairs)
                df_pairs = df_pairs.sort_values('similarity', ascending=False)
                df_pairs.to_excel(writer, sheet_name='Similar_Pairs', index=False)

            # Sheet 3: Distribution
            dist_data = []
            for cat, cat_stats in stats.items():
                for value, count in cat_stats['distribution'].items():
                    dist_data.append({
                        'Category': cat,
                        'Value': value,
                        'Count': count,
                        'Percentage': count / len(df) * 100
                    })
            pd.DataFrame(dist_data).to_excel(writer, sheet_name='Distribution', index=False)

            # Sheet 4: Fix Suggestions
            # Rank by similarity first so the cut-off really keeps the top 100
            # rather than the first 100 pairs that happened to be discovered.
            ranked = sorted(suggestions,
                            key=lambda sug: sug['pair']['similarity'],
                            reverse=True)
            fix_data = []
            for sug in ranked[:100]:  # Top 100
                pair = sug['pair']
                for fix in sug['fixes']:
                    fix_data.append({
                        'Prompt1_ID': pair['prompt1_id'],
                        'Prompt2_ID': pair['prompt2_id'],
                        'Similarity': pair['similarity'],
                        'Fix_Method': fix['method'],
                        'Priority': fix['priority'],
                        'Description': fix['description']
                    })
            if fix_data:
                pd.DataFrame(fix_data).to_excel(writer, sheet_name='Fix_Suggestions', index=False)

            # Sheet 5: Gaps Analysis
            gaps = ReportGenerator.find_gaps(df)
            if gaps:
                pd.DataFrame(gaps).to_excel(writer, sheet_name='Gaps', index=False)

        print(f"✅ Report saved to: {output_path}")

    @staticmethod
    def find_gaps(df, target=10, difficulties=('easy', 'medium', 'hard')):
        """Find (reasoning type, difficulty) combinations with too few prompts.

        Args:
            df: DataFrame of prompt records.
            target: Minimum number of prompts wanted per combination.
            difficulties: Difficulty labels to check for every type.

        Returns:
            List of dicts (``Type``, ``Difficulty``, ``Current_Count``,
            ``Target``, ``Gap``) for every combination below ``target``,
            ordered by first appearance of the type. Empty when the
            ``reasoning_type`` or ``difficulty`` column is missing. Rows
            without a reasoning type are ignored.
        """
        if 'reasoning_type' not in df.columns or 'difficulty' not in df.columns:
            return []

        # One pass over the data instead of one filtered scan per combination.
        counts = Counter(zip(df['reasoning_type'], df['difficulty']))

        gaps = []
        # A missing type never equals anything, so it can only produce bogus
        # zero-count rows; leave it out.
        for t in df['reasoning_type'].dropna().unique():
            for d in difficulties:
                count = counts[(t, d)]
                if count < target:
                    gaps.append({
                        'Type': t,
                        'Difficulty': d,
                        'Current_Count': count,
                        'Target': target,
                        'Gap': target - count
                    })

        return gaps

print("✅ Report Generator ready")

In [None]:
# ============================================
# 📌 Block 7: Visualization
# Plot summary charts of the type, tier, difficulty and domain distributions and of the similarity scores
# ============================================
def create_visualizations(df, similar_pairs, stats,
                          output_path='/content/redundancy_analysis.png'):
    """Draw a 2x3 grid of summary charts, save it as a PNG and display it.

    Panels (each drawn only when its data is available): top 15 reasoning
    types, tier distribution, similarity histogram, difficulty pie, top 10
    domains, and the 10 types with the most redundant pairs (counted by the
    first prompt's type).

    Args:
        df: DataFrame of prompt records.
        similar_pairs: Redundant-pair dicts with ``similarity`` and ``type1``.
        stats: Accepted for interface compatibility; not used here.
        output_path: Where the PNG is written.
    """

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Type distribution
    if 'reasoning_type' in df.columns:
        type_counts = df['reasoning_type'].value_counts().head(15)
        type_counts.plot(kind='bar', ax=axes[0,0], color='skyblue')
        axes[0,0].set_title('Top 15 Reasoning Types')
        axes[0,0].set_xlabel('Type')
        axes[0,0].set_ylabel('Count')
        axes[0,0].tick_params(axis='x', rotation=45)

    # 2. Tier distribution
    if 'tier' in df.columns:
        tier_counts = df['tier'].value_counts().sort_index()
        tier_counts.plot(kind='bar', ax=axes[0,1], color='lightgreen')
        axes[0,1].set_title('Tier Distribution')
        axes[0,1].set_xlabel('Tier')
        axes[0,1].set_ylabel('Count')

    # 3. Similarity distribution
    if similar_pairs:
        similarities = [p['similarity'] for p in similar_pairs]
        axes[0,2].hist(similarities, bins=30, color='coral', edgecolor='black')
        axes[0,2].set_title('Similarity Score Distribution')
        axes[0,2].set_xlabel('Similarity')
        axes[0,2].set_ylabel('Frequency')
        # Fixed reference line at 0.7; the thresholds actually applied to
        # each pair are stored in the pair's 'threshold_used' field.
        axes[0,2].axvline(x=0.7, color='r', linestyle='--', label='Threshold')
        axes[0,2].legend()

    # 4. Difficulty distribution
    if 'difficulty' in df.columns:
        diff_counts = df['difficulty'].value_counts()
        diff_counts.plot(kind='pie', ax=axes[1,0], autopct='%1.1f%%')
        axes[1,0].set_title('Difficulty Distribution')

    # 5. Domain distribution
    if 'domain_context' in df.columns:
        domain_counts = df['domain_context'].value_counts().head(10)
        domain_counts.plot(kind='barh', ax=axes[1,1], color='plum')
        axes[1,1].set_title('Top 10 Domains')
        axes[1,1].set_xlabel('Count')

    # 6. Redundancy by Type
    if similar_pairs and 'reasoning_type' in df.columns:
        redundancy_by_type = defaultdict(int)
        for pair in similar_pairs:
            redundancy_by_type[pair['type1']] += 1

        if redundancy_by_type:
            top_redundant = dict(sorted(redundancy_by_type.items(),
                                       key=lambda x: x[1], reverse=True)[:10])
            axes[1,2].bar(range(len(top_redundant)), list(top_redundant.values()),
                         color='salmon')
            axes[1,2].set_xticks(range(len(top_redundant)))
            axes[1,2].set_xticklabels(list(top_redundant.keys()), rotation=45, ha='right')
            axes[1,2].set_title('Top 10 Types with Redundancy')
            axes[1,2].set_ylabel('Redundant Pairs')

    plt.tight_layout()

    # Save figure
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    print(f"📊 Visualization saved to: {output_path}")
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

print("✅ Visualization ready")

In [None]:
# ============================================
# 📌 Block 8: Main Pipeline
# Run the whole pipeline: load, analyze, statistics, suggestions, report, charts, summary
# ============================================
def main():
    """Run the full analysis pipeline.

    Loads the dataset configured in ``Config``, searches for redundant
    prompt pairs, computes distribution statistics, generates fix
    suggestions, writes the Excel report and the charts, and prints a
    summary.

    Returns:
        ``(df, similar_pairs, stats)``. When no prompts are found the
        pipeline stops early and returns ``(df, [], {})``.
    """

    print("="*60)
    print("🔍 REDUNDANCY CHECKER - 46 TYPE DATASET")
    print("="*60)

    # Create output directory
    Path(Config.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    # Load data
    print("\n📊 Loading data...")
    df = DataExtractor.load_all_files(Config.DATASET_DIR, Config.MAX_PROMPTS)
    print(f"✅ Loaded {len(df)} prompts")

    # An empty result has no columns at all, so every step below would fail
    # with an unhelpful KeyError; stop with a clear message instead.
    if df.empty:
        print(f"⚠️ No prompts found in {Config.DATASET_DIR} - nothing to analyze")
        return df, [], {}

    # Basic info
    print("\n📋 Dataset Info:")
    print(f"  Files: {df['file'].nunique()}")
    print(f"  Types: {df['reasoning_type'].nunique() if 'reasoning_type' in df.columns else 'N/A'}")

    if 'tier' in df.columns:
        unique_tiers = df['tier'].unique().tolist()
        print(f"  Tiers: {sorted([t for t in unique_tiers if str(t).isdigit()])}")

    # Check for model_size column
    if 'model_size' in df.columns:
        print(f"  Model sizes: {df['model_size'].dropna().unique().tolist()}")

    # Analyze redundancy
    print("\n🔍 Analyzing redundancy...")
    analyzer = RedundancyAnalyzer(df, Config)
    similar_pairs = analyzer.analyze_all()
    print(f"✅ Found {len(similar_pairs)} redundant pairs")

    # Get statistics
    print("\n📈 Calculating statistics...")
    stats = analyzer.get_distribution_stats()

    # Generate suggestions
    print("\n💡 Generating fix suggestions...")
    suggestions = RedundancyFixer.suggest_fixes(similar_pairs)

    # Create report
    print("\n📝 Creating Excel report...")
    report_path = Path(Config.OUTPUT_DIR) / 'redundancy_report.xlsx'
    ReportGenerator.create_excel_report(df, similar_pairs, stats, suggestions, report_path)

    # Visualizations
    print("\n📊 Creating visualizations...")
    create_visualizations(df, similar_pairs, stats)

    # Summary
    print("\n" + "="*60)
    print("✅ ANALYSIS COMPLETE!")
    print("="*60)
    print(f"\n📊 Summary:")
    print(f"  Total Prompts: {len(df)}")
    print(f"  Redundant Pairs: {len(similar_pairs)}")

    if len(df) > 1:
        # The denominator counts every possible pair in the dataset.
        max_pairs = len(df) * (len(df) - 1) / 2
        redundancy_rate = len(similar_pairs) / max_pairs * 100
        print(f"  Redundancy Rate: {redundancy_rate:.2f}%")

    if stats.get('reasoning_type'):
        print(f"  Type Imbalance: {stats['reasoning_type']['imbalance_ratio']:.2f}x")

    print(f"\n📁 Files Generated:")
    print(f"  - {report_path}")
    print(f"  - /content/redundancy_analysis.png")

    return df, similar_pairs, stats

print("✅ All functions ready")

In [None]:
# ============================================
# 📌 Block 9: Run
# Entry point for running the main program in Colab
# ============================================
# Run the pipeline and keep its three results as module-level names.
if __name__ == "__main__":
    df, pairs, stats = main()
    print("\n🎉 Done! Check the output files in your Drive.")