<a href="https://colab.research.google.com/github/tanatet8/Colab_Script/blob/main/Add%20redundancy%20checker%20for%2046-type%20dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# 📌 Block 1: Setup & Mount
# Install libraries + mount Google Drive
# ============================================
from google.colab import drive
drive.mount('/content/drive')

!pip install -q pandas numpy scikit-learn matplotlib seaborn openpyxl tqdm

import pandas as pd
import numpy as np
from pathlib import Path
import re
import json
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# ============================================
# 📌 Block 2: Configuration
# Set the main parameters, e.g. paths, batch size, thresholds
# ============================================
class Config:
    """Central settings for the redundancy checker: paths, batching, thresholds."""

    # --- Locations -------------------------------------------------------
    # Edit these to point at the dataset folder and at the folder where
    # the reports should be saved.
    DATASET_DIR = '/content/drive/MyDrive/Dataset_Curation'
    OUTPUT_DIR = '/content/drive/MyDrive/Dataset_Curation/redundancy_reports'

    # --- Batching --------------------------------------------------------
    # Number of prompts processed per batch.
    BATCH_SIZE = 500
    # None = use every prompt; set an integer to cap how many are loaded.
    MAX_PROMPTS = None

    # --- Redundancy thresholds by reasoning type and tier ------------------
    # Each list is ordered as [tiers 1-2, tiers 3-4, tiers 5-6].
    THRESHOLDS = dict(
        causal_reasoning=[0.75, 0.60, 0.40],
        symbolic_reasoning=[0.70, 0.55, 0.35],
        meta_reasoning=[0.60, 0.45, 0.30],
        moral_ambiguity_tradeoff=[0.50, 0.35, 0.25],
        philosophical_logic=[0.45, 0.30, 0.20],
        # Fallback used for every other reasoning type.
        default=[0.65, 0.50, 0.35],
    )

In [None]:
# ============================================
# 📌 Block 3: Data Extraction
# Load the markdown files, extract the metadata and the prompts
# ============================================
class DataExtractor:
    """Turns markdown batch files into one record per prompt."""

    @staticmethod
    def parse_block(block_text):
        """Parse the text of a single prompt block into a flat dict.

        Three kinds of ``###`` sections are recognised:

        * ``### Metadata`` - every line containing ``:`` is split at the
          first colon into a stripped key and a stripped value.
        * ``Prompt (TH)`` / ``Prompt (EN)`` / ``Prompt (ZH)`` - stored under
          ``prompt_th`` / ``prompt_en`` / ``prompt_zh``.
        * ``### Reasoning`` - stored under ``reasoning``.

        Sections that are absent simply leave no key in the result.
        """
        record = {}

        # Metadata: "key: value" lines up to the next heading.
        metadata = re.search(r'###\s*Metadata\s*\n(.*?)(?=\n###|\n##|$)',
                             block_text, re.DOTALL)
        if metadata is not None:
            for raw_line in metadata.group(1).split('\n'):
                if ':' not in raw_line:
                    continue
                name, _, content = raw_line.partition(':')
                record[name.strip()] = content.strip()

        # One optional prompt section per language.
        for lang in ('TH', 'EN', 'ZH'):
            found = re.search(
                rf'###?\s*Prompt\s*\({lang}\)\s*\n(.*?)(?=\n###|\n##|$)',
                block_text,
                re.DOTALL,
            )
            if found is not None:
                record[f'prompt_{lang.lower()}'] = found.group(1).strip()

        # Reasoning text runs until the next "###" heading or the end.
        reasoning = re.search(r'###\s*Reasoning\s*\n(.*?)(?=\n###|$)',
                              block_text, re.DOTALL)
        if reasoning is not None:
            record['reasoning'] = reasoning.group(1).strip()

        return record

    @staticmethod
    def load_all_files(dataset_dir, max_prompts=None):
        """Load every ``*_batch_*.md`` file in ``dataset_dir`` into a DataFrame.

        Files are read in sorted order and split on ``## Prompt N`` headings;
        each piece is parsed with ``parse_block`` and tagged with the source
        file name (``file``) and an id of the form ``<file stem>_pNNN``
        (``prompt_id``). Loading stops once ``max_prompts`` records have been
        collected; a falsy ``max_prompts`` means no limit.
        """
        records = []
        md_files = sorted(Path(dataset_dir).glob('*_batch_*.md'))

        print(f"üìÇ ‡∏û‡∏ö {len(md_files)} batch files")

        def limit_reached():
            return bool(max_prompts) and len(records) >= max_prompts

        for file_path in tqdm(md_files, desc="Loading files"):
            content = file_path.read_text(encoding='utf-8')

            # Text before the first "## Prompt N" heading is not a prompt.
            sections = re.split(r'##\s*Prompt\s+\d+', content)[1:]

            for number, section in enumerate(sections, start=1):
                if limit_reached():
                    break

                record = DataExtractor.parse_block(section)
                record['file'] = file_path.name
                record['prompt_id'] = f"{file_path.stem}_p{number:03d}"
                records.append(record)

            if limit_reached():
                break

        return pd.DataFrame(records)

In [None]:
# ============================================
# 📌 Block 4: Redundancy Analysis
# Analyze prompt redundancy with TF-IDF + cosine similarity
# ============================================
class RedundancyAnalyzer:
    """Detects near-duplicate prompts and summarises label distributions.

    Similarity is TF-IDF cosine similarity computed independently inside
    each batch of ``config.BATCH_SIZE`` rows, so two prompts that land in
    different batches are never compared with each other.
    """

    # Tier assumed when a row has no usable tier value.
    DEFAULT_TIER = 2

    def __init__(self, df, config):
        """Store the prompt DataFrame and the configuration object.

        ``config`` must expose ``THRESHOLDS`` (with a ``'default'`` entry)
        and ``BATCH_SIZE``.
        """
        self.df = df
        self.config = config
        # Not populated by any method of this class.
        self.similarity_matrices = {}

    @staticmethod
    def _coerce_tier(value, default=2):
        """Return ``value`` as an int tier, or ``default`` if it is unusable.

        Metadata values arrive as strings and missing cells as NaN/None;
        ``int()`` alone would raise on the latter two.
        """
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return default

    def get_threshold(self, reasoning_type, tier):
        """Return the similarity threshold for a reasoning type and tier.

        Tiers map to threshold slots as 1-2 -> 0, 3-4 -> 1, 5+ -> 2. Tiers
        below 1 are clamped to slot 0 so they cannot wrap around to the last
        (most lenient) slot through negative indexing. Unknown reasoning
        types use the ``'default'`` thresholds.
        """
        tier_idx = max(0, min((tier - 1) // 2, 2))
        table = self.config.THRESHOLDS
        thresholds = table.get(reasoning_type, table['default'])
        return thresholds[tier_idx]

    def calculate_similarity_batch(self, df_batch, text_col='prompt_en'):
        """Compute pairwise similarity for one batch and list redundant pairs.

        Args:
            df_batch: DataFrame slice holding the prompts to compare.
            text_col: Column containing the text to vectorise.

        Returns:
            ``(sim_matrix, pairs)``. ``sim_matrix`` is the cosine similarity
            matrix over the rows that have text, or ``None`` when the batch
            cannot be analysed (text column missing, fewer than two texts,
            or the vectoriser rejects the batch). ``pairs`` is a list of
            dicts, one per pair whose similarity reaches the threshold.
            ``idx1``/``idx2`` are positions inside this batch after rows
            without text were dropped, not positions in the full DataFrame.

        The threshold for a pair is looked up with the first prompt's
        reasoning type and the higher of the two tiers.
        """
        if text_col not in df_batch.columns:
            return None, []

        valid_df = df_batch[df_batch[text_col].notna()].reset_index(drop=True)
        n_rows = len(valid_df)
        if n_rows < 2:
            return None, []

        # TF-IDF
        vectorizer = TfidfVectorizer(
            max_features=500,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.95
        )
        try:
            tfidf_matrix = vectorizer.fit_transform(valid_df[text_col])
        except ValueError as exc:
            # Raised for tiny batches (min_df=2 cannot be met together with
            # max_df=0.95 on two documents) or when pruning leaves no terms.
            print(f"Skipping batch of {n_rows} prompts: {exc}")
            return None, []
        sim_matrix = cosine_similarity(tfidf_matrix)

        def column_or(name, fallback):
            """Column values as a list, or ``fallback`` repeated if absent."""
            if name in valid_df.columns:
                return valid_df[name].tolist()
            return [fallback] * n_rows

        # Pull per-row values out once instead of materialising two rows
        # per compared pair inside the quadratic loop below.
        texts = valid_df[text_col].tolist()
        prompt_ids = column_or('prompt_id', None)
        types = column_or('reasoning_type', 'default')
        tiers = [self._coerce_tier(value, self.DEFAULT_TIER)
                 for value in column_or('tier', self.DEFAULT_TIER)]

        # Collect the pairs that reach their threshold.
        pairs = []
        for i in range(n_rows - 1):
            type_i = types[i]
            tier_i = tiers[i]
            for j in range(i + 1, n_rows):
                tier_j = tiers[j]
                threshold = self.get_threshold(type_i, max(tier_i, tier_j))
                score = sim_matrix[i][j]

                if score >= threshold:
                    pairs.append({
                        'idx1': i,
                        'idx2': j,
                        'prompt1_id': prompt_ids[i],
                        'prompt2_id': prompt_ids[j],
                        'similarity': score,
                        'threshold_used': threshold,
                        'type1': type_i,
                        'type2': types[j],
                        'tier1': tier_i,
                        'tier2': tier_j,
                        'prompt1': texts[i][:100],
                        'prompt2': texts[j][:100]
                    })
        return sim_matrix, pairs

    def analyze_all(self):
        """Run the batch comparison over the whole DataFrame.

        Returns the concatenated list of redundant pairs from every batch.
        Only prompts inside the same batch are compared.
        """
        all_pairs = []
        batch_size = self.config.BATCH_SIZE

        for start in tqdm(range(0, len(self.df), batch_size),
                          desc="Analyzing batches"):
            df_batch = self.df.iloc[start:start + batch_size]
            _, pairs = self.calculate_similarity_batch(df_batch)
            all_pairs.extend(pairs)

        return all_pairs

    def get_distribution_stats(self):
        """Summarise how prompts are spread over each categorical column.

        For every column among ``reasoning_type``, ``sub_type``,
        ``domain_context``, ``difficulty`` and ``tier`` that exists, returns
        the value counts plus the number of distinct values, the largest and
        smallest count, their standard deviation, and the max/min ratio
        (``inf`` when the smallest count is not positive).
        """
        stats = {}
        categories = ['reasoning_type', 'sub_type', 'domain_context', 'difficulty', 'tier']

        for cat in categories:
            if cat not in self.df.columns:
                continue
            value_counts = self.df[cat].value_counts()
            largest = value_counts.max()
            smallest = value_counts.min()
            stats[cat] = {
                'distribution': value_counts.to_dict(),
                'unique': len(value_counts),
                'max': largest,
                'min': smallest,
                'std': value_counts.std(),
                'imbalance_ratio': largest / smallest if smallest > 0 else float('inf')
            }
        return stats

In [None]:
# ============================================
# 📌 Block 5: Fix Suggestions
# Generate fix suggestions for duplicated prompt pairs (change the domain / complexity / question angle)
# ============================================
class RedundancyFixer:
    """Proposes ways to differentiate prompt pairs flagged as redundant."""

    @staticmethod
    def suggest_fixes(similar_pairs):
        """Attach a list of suggested fixes to every redundant pair.

        Rules, evaluated independently and in this order:

        * similarity above 0.9           -> ``domain_shift`` (high priority)
        * both prompts on the same tier  -> ``complexity_change`` (medium)
        * similarity in (0.7, 0.9]       -> ``angle_change`` (medium)

        Returns one ``{'pair': pair, 'fixes': [...]}`` dict per input pair,
        in input order; ``fixes`` is empty when no rule applies.
        """

        def options_for(pair):
            score = pair['similarity']
            options = []

            if score > 0.9:
                options.append({
                    'method': 'domain_shift',
                    'priority': 'high',
                    'description': 'Change domain context completely'
                })

            if pair['tier1'] == pair['tier2']:
                options.append({
                    'method': 'complexity_change',
                    'priority': 'medium',
                    'description': f'Adjust complexity (current tier: {pair["tier1"]})'
                })

            if 0.7 < score <= 0.9:
                options.append({
                    'method': 'angle_change',
                    'priority': 'medium',
                    'description': 'Change questioning angle or framing'
                })

            return options

        return [{'pair': pair, 'fixes': options_for(pair)}
                for pair in similar_pairs]

In [None]:
# ============================================
# 📌 Block 6: Report Generation
# Build the complete Excel report + find coverage gaps
# ============================================
class ReportGenerator:
    """Writes the Excel workbook and computes coverage gaps."""

    # How many suggestion entries are exported to the Fix_Suggestions sheet.
    MAX_SUGGESTIONS = 100

    @staticmethod
    def _tier_range(df):
        """Return the tier span as ``"<min>-<max>"``, or ``'N/A'``.

        Tiers parsed from metadata are strings, so they are compared as
        numbers when every non-missing value is numeric (otherwise "10"
        would sort before "2"). Non-numeric tiers fall back to a string
        comparison. Missing values are ignored.
        """
        if 'tier' not in df.columns:
            return 'N/A'
        labels = df['tier'].dropna()
        if labels.empty:
            return 'N/A'
        numeric = pd.to_numeric(labels, errors='coerce')
        if numeric.notna().all():
            return f"{float(numeric.min()):g}-{float(numeric.max()):g}"
        as_text = labels.astype(str)
        return f"{as_text.min()}-{as_text.max()}"

    @staticmethod
    def create_excel_report(df, similar_pairs, stats, suggestions, output_path):
        """Write the analysis workbook to ``output_path``.

        Sheets written: ``Overview``, ``Similar_Pairs`` (only when there are
        pairs, sorted by similarity), ``Distribution``, ``Fix_Suggestions``
        and ``Gaps``.

        Args:
            df: DataFrame of all prompts.
            similar_pairs: List of pair dicts carrying a ``similarity`` key.
            stats: Mapping of column name to a dict with a ``distribution``
                entry (value -> count).
            suggestions: List of ``{'pair': ..., 'fixes': [...]}`` dicts.
            output_path: Destination ``.xlsx`` path.
        """

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            # Sheet 1: Overview
            overview_data = {
                'Metric': ['Total Prompts', 'Redundant Pairs', 'Avg Similarity',
                          'Types Count', 'Tiers Range'],
                'Value': [
                    len(df),
                    len(similar_pairs),
                    np.mean([p['similarity'] for p in similar_pairs]) if similar_pairs else 0,
                    df['reasoning_type'].nunique() if 'reasoning_type' in df.columns else 0,
                    ReportGenerator._tier_range(df)
                ]
            }
            pd.DataFrame(overview_data).to_excel(writer, sheet_name='Overview', index=False)

            # Sheet 2: Similar Pairs
            if similar_pairs:
                df_pairs = pd.DataFrame(similar_pairs)
                df_pairs = df_pairs.sort_values('similarity', ascending=False)
                df_pairs.to_excel(writer, sheet_name='Similar_Pairs', index=False)

            # Sheet 3: Distribution
            dist_data = []
            for cat, cat_stats in stats.items():
                for value, count in cat_stats['distribution'].items():
                    dist_data.append({
                        'Category': cat,
                        'Value': value,
                        'Count': count,
                        'Percentage': count / len(df) * 100
                    })
            pd.DataFrame(dist_data).to_excel(writer, sheet_name='Distribution', index=False)

            # Sheet 4: Fix Suggestions
            # Rank by similarity first so the cut-off really keeps the most
            # similar pairs rather than the first ones that were discovered.
            ranked = sorted(suggestions,
                            key=lambda sug: sug['pair']['similarity'],
                            reverse=True)
            fix_data = []
            for sug in ranked[:ReportGenerator.MAX_SUGGESTIONS]:
                pair = sug['pair']
                for fix in sug['fixes']:
                    fix_data.append({
                        'Prompt1_ID': pair['prompt1_id'],
                        'Prompt2_ID': pair['prompt2_id'],
                        'Similarity': pair['similarity'],
                        'Fix_Method': fix['method'],
                        'Priority': fix['priority'],
                        'Description': fix['description']
                    })
            pd.DataFrame(fix_data).to_excel(writer, sheet_name='Fix_Suggestions', index=False)

            # Sheet 5: Gaps Analysis
            gaps = ReportGenerator.find_gaps(df)
            pd.DataFrame(gaps).to_excel(writer, sheet_name='Gaps', index=False)

        print(f"‚úÖ Report saved to: {output_path}")

    @staticmethod
    def find_gaps(df, target=10):
        """List (reasoning type, difficulty) combinations with too few prompts.

        Args:
            df: DataFrame of prompts; needs ``reasoning_type`` and
                ``difficulty`` columns, otherwise no gaps are reported.
            target: Minimum number of prompts wanted per combination.

        Returns:
            A list of dicts (``Type``, ``Difficulty``, ``Current_Count``,
            ``Target``, ``Gap``) for every combination below ``target``.
            Difficulties checked are ``easy``, ``medium`` and ``hard``.
            Rows without a reasoning type are ignored: they can never match
            an equality test and would otherwise show up as a phantom type
            with a count of zero.
        """
        gaps = []

        if 'reasoning_type' not in df.columns or 'difficulty' not in df.columns:
            return gaps

        difficulties = ['easy', 'medium', 'hard']

        for t in df['reasoning_type'].dropna().unique():
            of_type = df['reasoning_type'] == t
            for d in difficulties:
                count = int((of_type & (df['difficulty'] == d)).sum())
                if count < target:
                    gaps.append({
                        'Type': t,
                        'Difficulty': d,
                        'Current_Count': count,
                        'Target': target,
                        'Gap': target - count
                    })

        return gaps

In [None]:
# ============================================
# 📌 Block 7: Visualization
# Plot summary charts of the type, tier, difficulty and domain distributions and of the similarity distribution
# ============================================
def create_visualizations(df, similar_pairs, stats):
    """Draw a 2x3 dashboard, save it as a PNG and display it.

    Panels (each drawn only when its data is available): reasoning-type
    counts, tier counts, histogram of pair similarities, difficulty pie,
    domain counts, and redundant-pair counts per reasoning type. ``stats``
    is accepted but not read by this function.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    (ax_types, ax_tiers, ax_scores), (ax_difficulty, ax_domains, ax_redundant) = axes

    has_types = 'reasoning_type' in df.columns

    # Panel 1: the fifteen most frequent reasoning types.
    if has_types:
        df['reasoning_type'].value_counts().head(15).plot(
            kind='bar', ax=ax_types, color='skyblue')
        ax_types.set_title('Top 15 Reasoning Types')
        ax_types.set_xlabel('Type')
        ax_types.set_ylabel('Count')
        ax_types.tick_params(axis='x', rotation=45)

    # Panel 2: prompts per tier, ordered by tier label.
    if 'tier' in df.columns:
        df['tier'].value_counts().sort_index().plot(
            kind='bar', ax=ax_tiers, color='lightgreen')
        ax_tiers.set_title('Tier Distribution')
        ax_tiers.set_xlabel('Tier')
        ax_tiers.set_ylabel('Count')

    # Panel 3: histogram of the similarity scores of the flagged pairs,
    # with a fixed reference line drawn at 0.7.
    if similar_pairs:
        scores = [pair['similarity'] for pair in similar_pairs]
        ax_scores.hist(scores, bins=30, color='coral', edgecolor='black')
        ax_scores.set_title('Similarity Score Distribution')
        ax_scores.set_xlabel('Similarity')
        ax_scores.set_ylabel('Frequency')
        ax_scores.axvline(x=0.7, color='r', linestyle='--', label='Threshold')
        ax_scores.legend()

    # Panel 4: share of each difficulty level.
    if 'difficulty' in df.columns:
        df['difficulty'].value_counts().plot(
            kind='pie', ax=ax_difficulty, autopct='%1.1f%%')
        ax_difficulty.set_title('Difficulty Distribution')

    # Panel 5: the ten most frequent domains.
    if 'domain_context' in df.columns:
        df['domain_context'].value_counts().head(10).plot(
            kind='barh', ax=ax_domains, color='plum')
        ax_domains.set_title('Top 10 Domains')
        ax_domains.set_xlabel('Count')

    # Panel 6: flagged pairs tallied by the first prompt's reasoning type.
    if similar_pairs and has_types:
        tally = {}
        for pair in similar_pairs:
            tally[pair['type1']] = tally.get(pair['type1'], 0) + 1

        ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)[:10]
        labels = [name for name, _ in ranked]
        heights = [total for _, total in ranked]
        positions = range(len(ranked))

        ax_redundant.bar(positions, heights, color='salmon')
        ax_redundant.set_xticks(positions)
        ax_redundant.set_xticklabels(labels, rotation=45, ha='right')
        ax_redundant.set_title('Top 10 Types with Redundancy')
        ax_redundant.set_ylabel('Redundant Pairs')

    plt.tight_layout()
    plt.savefig('/content/redundancy_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# ============================================
# 📌 Block 8: Main Pipeline
# Run the whole process: load, analyze, statistics, suggestions, report, charts, summary
# ============================================
def main():
    """Run the full pipeline: load, analyze, report, plot, summarise.

    Returns:
        ``(df, similar_pairs, stats)`` - the loaded prompts, the redundant
        pairs that were found, and the distribution statistics. When no
        prompts could be loaded the function stops early and returns the
        empty DataFrame with an empty list and an empty dict.
    """

    print("="*60)
    print("üîç REDUNDANCY CHECKER - 46 TYPE DATASET")
    print("="*60)

    # Create output directory
    Path(Config.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    # Load data
    print("\nüìä Loading data...")
    df = DataExtractor.load_all_files(Config.DATASET_DIR, Config.MAX_PROMPTS)
    print(f"‚úÖ Loaded {len(df)} prompts")

    # An empty load yields a DataFrame without columns, so every column
    # access below would raise; stop here with a clear message instead.
    if df.empty:
        print(f"No prompts were loaded from {Config.DATASET_DIR}; nothing to analyze.")
        return df, [], {}

    # Basic info
    print("\nüìã Dataset Info:")
    print(f"  Files: {df['file'].nunique()}")
    print(f"  Types: {df['reasoning_type'].nunique() if 'reasoning_type' in df.columns else 'N/A'}")
    print(f"  Tiers: {df['tier'].unique().tolist() if 'tier' in df.columns else 'N/A'}")

    # Analyze redundancy
    print("\nüîç Analyzing redundancy...")
    analyzer = RedundancyAnalyzer(df, Config)
    similar_pairs = analyzer.analyze_all()
    print(f"‚úÖ Found {len(similar_pairs)} redundant pairs")

    # Get statistics
    print("\nüìà Calculating statistics...")
    stats = analyzer.get_distribution_stats()

    # Generate suggestions
    print("\nüí° Generating fix suggestions...")
    suggestions = RedundancyFixer.suggest_fixes(similar_pairs)

    # Create report
    print("\nüìù Creating Excel report...")
    report_path = Path(Config.OUTPUT_DIR) / 'redundancy_report.xlsx'
    ReportGenerator.create_excel_report(df, similar_pairs, stats, suggestions, report_path)

    # Visualizations
    print("\nüìä Creating visualizations...")
    create_visualizations(df, similar_pairs, stats)

    # Summary
    # The rate is measured against every possible pair of prompts; a single
    # prompt has no pairs, so guard the division.
    possible_pairs = len(df) * (len(df) - 1) / 2
    redundancy_rate = len(similar_pairs) / possible_pairs * 100 if possible_pairs else 0.0

    print("\n" + "="*60)
    print("‚úÖ ANALYSIS COMPLETE!")
    print("="*60)
    print(f"\nüìä Summary:")
    print(f"  Total Prompts: {len(df)}")
    print(f"  Redundant Pairs: {len(similar_pairs)}")
    print(f"  Redundancy Rate: {redundancy_rate:.2f}%")

    if stats.get('reasoning_type'):
        print(f"  Type Imbalance: {stats['reasoning_type']['imbalance_ratio']:.2f}x")

    print(f"\nüìÅ Files Generated:")
    print(f"  - {report_path}")
    print(f"  - /content/redundancy_analysis.png")

    return df, similar_pairs, stats

In [None]:
# ============================================
# 📌 Block 9: Run
# Entry point for running the main program in Colab
# ============================================
# Run the pipeline and keep its three results (prompt DataFrame, redundant
# pairs, distribution stats) as module-level names for later inspection.
if __name__ == "__main__":
    df, pairs, stats = main()