In [3]:
import pandas as pd
import numpy as np
import os
from collections import Counter

# Configuration: file locations. UPDATE THESE PATHS before running.
INPUT_CSV_PATH = "/data/users2/nblair7/analysis_results/eval_results_new.csv"  # TODO: evaluation results CSV to read
OUTPUT_CSV_PATH = "/data/users2/nblair7/analysis_results/low_score_cases.csv"   # TODO: where the extracted low-score rows are written
ANALYSIS_REPORT_PATH = "/data/users2/nblair7/analysis_results/low_score_analysis_report.txt"  # TODO: where the text report is written

# Configuration: a score counts as "low" when it is <= THRESHOLD.
THRESHOLD = 2  # Inclusive upper bound: any score at or below 2 is extracted

def extract_low_scores(input_csv_path, output_csv_path, threshold=2):
    """
    Extract participants with scores <= threshold and create analysis CSV

    One output row is produced per (participant, metric) pair whose score is
    at or below the threshold. Rows are ordered by input row, then by metric.

    Parameters:
    - input_csv_path: Path to the input CSV file with evaluation results
    - output_csv_path: Path to save the low score cases
    - threshold: Score threshold (default 2, includes scores 1 and 2)

    Returns:
    - DataFrame with columns participant_id, metric, score, explanation, or
      None when the input cannot be loaded, lacks a 'participant_id' column,
      or contains no low scores (no CSV is written in those cases).
    """

    print(f"=== LOW SCORE EXTRACTOR (Threshold: ≤{threshold}) ===")
    print(f"Reading from: {input_csv_path}")
    print(f"Output will be saved to: {output_csv_path}")

    # Load the data; only the read itself is guarded.
    try:
        df = pd.read_csv(input_csv_path)
    except Exception as e:
        print(f"Error loading input file: {e}")
        return None

    print(f"Loaded {len(df)} participants from input file")
    print(f"Columns found: {list(df.columns)}")

    # Without participant ids no case can be attributed, so fail with a
    # message instead of a KeyError from inside the scan loop.
    if 'participant_id' not in df.columns:
        print("Error: required column 'participant_id' not found in input file")
        return None

    # Define the scoring metrics
    metrics = ['coherence', 'completeness', 'specificity', 'accuracy']

    print("\nScanning for low scores...")

    # Column presence does not depend on the row, so resolve it once up
    # front (and warn exactly once per missing metric).
    metric_columns = {}
    for metric in metrics:
        explanation_col = f"{metric}_explanation"
        if metric in df.columns and explanation_col in df.columns:
            # Coerce so that stray non-numeric cells become NaN and are
            # skipped rather than raising on the comparison below.
            scores = pd.to_numeric(df[metric], errors='coerce').tolist()
            metric_columns[metric] = (scores, df[explanation_col].tolist())
        else:
            print(f"  Warning: Column {metric} or {explanation_col} not found")

    # Process each participant, checking each available metric
    low_score_cases = []
    for position, participant_id in enumerate(df['participant_id'].tolist()):
        for metric, (scores, explanations) in metric_columns.items():
            score = scores[position]
            if pd.notna(score) and score <= threshold:
                low_score_cases.append({
                    'participant_id': participant_id,
                    'metric': metric,
                    'score': int(score),
                    'explanation': explanations[position]
                })
                print(f"  Found: Participant {participant_id}, {metric} = {score}")

    if not low_score_cases:
        print(f"No participants found with scores ≤ {threshold}")
        return None

    # Convert to DataFrame and save, creating the target directory if needed
    low_scores_df = pd.DataFrame(low_score_cases)
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    low_scores_df.to_csv(output_csv_path, index=False)

    _print_low_score_summary(low_scores_df, threshold)

    print(f"\nLow score cases saved to: {output_csv_path}")
    print(f"CSV columns: participant_id, metric, score, explanation")

    return low_scores_df


def _print_low_score_summary(low_scores_df, threshold):
    """
    Print summary statistics for a non-empty low-score DataFrame

    Parameters:
    - low_scores_df: DataFrame with participant_id, metric and score columns
    - threshold: Score threshold, used only for the printed label
    """
    total_cases = len(low_scores_df)

    print(f"\n=== SUMMARY STATISTICS ===")
    print(f"Total participants with low scores (≤{threshold}): {low_scores_df['participant_id'].nunique()}")
    print(f"Total low score instances: {total_cases}")

    # Breakdown by metric
    print(f"\nBreakdown by metric:")
    for metric, count in low_scores_df['metric'].value_counts().items():
        percentage = (count / total_cases) * 100
        print(f"  {metric}: {count} cases ({percentage:.1f}%)")

    # Breakdown by score
    print(f"\nBreakdown by score:")
    for score, count in low_scores_df['score'].value_counts().sort_index().items():
        percentage = (count / total_cases) * 100
        print(f"  Score {score}: {count} cases ({percentage:.1f}%)")

    # Participants with multiple low scores
    participant_low_counts = low_scores_df.groupby('participant_id').size()
    multi_low_participants = participant_low_counts[participant_low_counts > 1]

    print(f"\nParticipants with multiple low scores: {len(multi_low_participants)}")

    if len(multi_low_participants) > 0:
        print("Most problematic participants:")
        top_problematic = participant_low_counts.sort_values(ascending=False).head(10)
        for pid, count in top_problematic.items():
            affected_metrics = low_scores_df[low_scores_df['participant_id'] == pid]['metric'].tolist()
            print(f"  Participant {pid}: {count} low scores ({', '.join(affected_metrics)})")

def analyze_explanations_by_metric(low_scores_df):
    """
    Analyze explanations grouped by metric to find common patterns

    Parameters:
    - low_scores_df: DataFrame with participant_id, metric, score and
      explanation columns (may be None or empty)

    Returns:
    - dict keyed by metric name; each value holds total_cases,
      score_distribution, avg_score, keyword_frequencies (keyword -> number
      of explanations containing it, case-insensitive substring match),
      sample_explanations (up to the first 3 cases, text cut to 300
      characters) and participants. Empty dict when there is no data.
    """

    if low_scores_df is None or len(low_scores_df) == 0:
        print("No low score data to analyze")
        return {}

    print(f"\n=== ANALYZING EXPLANATIONS BY METRIC ===")

    analysis_results = {}

    # Define keywords to search for in each metric's explanations
    metric_keywords = {
        'coherence': [
            'contradiction', 'inconsistent', 'logical', 'conflicting', 'unclear',
            'contradicts', 'confusing', 'inconsistency', 'incoherent'
        ],
        'completeness': [
            'missing', 'omitted', 'lacks', 'incomplete', 'PHQ-8', 'symptoms',
            'duration', 'frequency', 'missed', 'absent', 'overlooks', 'fails to'
        ],
        'specificity': [
            'vague', 'generic', 'general', 'specific', 'seems', 'appears',
            'unclear', 'broad', 'superficial', 'lacks detail'
        ],
        'accuracy': [
            'incorrect', 'wrong', 'DSM-5', 'PHQ-8', 'inaccurate', 'misaligned',
            'error', 'mistaken', 'false', 'erroneous'
        ]
    }

    for metric in low_scores_df['metric'].unique():
        metric_data = low_scores_df[low_scores_df['metric'] == metric]
        total_cases = len(metric_data)

        # Normalise once: a missing (NaN/None) explanation becomes empty text
        # and anything else is stringified, so both keyword matching and
        # sample truncation can safely treat every entry as a string.
        explanations = [
            "" if pd.isna(text) else str(text)
            for text in metric_data['explanation'].tolist()
        ]
        lowered_explanations = [text.lower() for text in explanations]

        print(f"\n--- {metric.upper()} ANALYSIS ({total_cases} cases) ---")

        # Count, per keyword, how many explanations mention it
        keyword_counts = {}
        for keyword in metric_keywords.get(metric, []):
            needle = keyword.lower()
            count = sum(needle in text for text in lowered_explanations)
            if count > 0:
                keyword_counts[keyword] = count

        # Score distribution for this metric
        score_dist = metric_data['score'].value_counts().sort_index()
        avg_score = metric_data['score'].mean()

        # Sample explanations: the first (up to) 3 cases, truncated for display
        sample_explanations = [
            {
                'participant_id': participant_id,
                'score': score,
                'explanation': text[:300] + "..." if len(text) > 300 else text
            }
            for participant_id, score, text in zip(
                metric_data['participant_id'].iloc[:3],
                metric_data['score'].iloc[:3],
                explanations,
            )
        ]

        # Store results
        analysis_results[metric] = {
            'total_cases': total_cases,
            'score_distribution': dict(score_dist),
            'avg_score': avg_score,
            'keyword_frequencies': keyword_counts,
            'sample_explanations': sample_explanations,
            'participants': metric_data['participant_id'].tolist()
        }

        # Print summary for this metric
        print(f"  Total cases: {total_cases}")
        print(f"  Average score: {avg_score:.2f}")
        print(f"  Score distribution: {dict(score_dist)}")

        if keyword_counts:
            # Sort keywords by frequency
            sorted_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)
            print(f"  Most common issue indicators:")
            for keyword, count in sorted_keywords[:5]:
                print(f"    '{keyword}': appears in {count}/{total_cases} explanations ({count/total_cases*100:.1f}%)")
        else:
            print(f"  No common keywords found")

    return analysis_results

def create_detailed_report(low_scores_df, analysis_results, report_path, threshold=None):
    """
    Create a comprehensive text report with all findings

    The report text is assembled completely before the file is opened, so a
    failure while formatting does not leave a partially written report.

    Parameters:
    - low_scores_df: DataFrame with participant_id, metric and score columns;
      nothing is written when it is None
    - analysis_results: per-metric dict as returned by
      analyze_explanations_by_metric (None is treated as empty)
    - report_path: Path of the UTF-8 text file to write
    - threshold: Threshold shown in the report header; defaults to the
      module-level THRESHOLD when omitted

    Errors while building or writing the report are printed, not raised.
    """

    if low_scores_df is None:
        print("No data available to create report")
        return

    print(f"\nCreating detailed analysis report...")

    try:
        if threshold is None:
            threshold = THRESHOLD

        report_text = _build_report_text(low_scores_df, analysis_results or {}, threshold)

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report_text)

        print(f"Detailed analysis report saved to: {report_path}")

    except Exception as e:
        print(f"Error creating report: {e}")


def _build_report_text(low_scores_df, analysis_results, threshold):
    """Return the full report as a single string (no file I/O)."""
    parts = []
    add = parts.append
    total_cases = len(low_scores_df)

    add("LOW SCORE ANALYSIS REPORT\n")
    add("=" * 80 + "\n\n")
    add(f"Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    add(f"Threshold: Scores ≤ {threshold}\n\n")

    # EXECUTIVE SUMMARY
    add("EXECUTIVE SUMMARY\n")
    add("-" * 40 + "\n")
    add(f"Total participants with low scores: {low_scores_df['participant_id'].nunique()}\n")
    add(f"Total low score instances: {total_cases}\n")
    add(f"Metrics analyzed: {', '.join(low_scores_df['metric'].unique())}\n\n")

    # OVERALL STATISTICS
    add("OVERALL STATISTICS\n")
    add("-" * 40 + "\n")

    # Metric breakdown
    add("Issues by metric:\n")
    for metric, count in low_scores_df['metric'].value_counts().items():
        percentage = (count / total_cases) * 100
        add(f"  {metric.capitalize()}: {count} cases ({percentage:.1f}%)\n")
    add("\n")

    # Score distribution
    add("Score distribution:\n")
    for score, count in low_scores_df['score'].value_counts().sort_index().items():
        percentage = (count / total_cases) * 100
        add(f"  Score {score}: {count} cases ({percentage:.1f}%)\n")
    add("\n")

    # DETAILED METRIC ANALYSIS
    add("=" * 80 + "\n")
    add("DETAILED ANALYSIS BY METRIC\n")
    add("=" * 80 + "\n\n")

    for metric, results in analysis_results.items():
        add(f"{metric.upper()} - {results['total_cases']} LOW SCORE CASES\n")
        add("-" * 60 + "\n")
        add(f"Average score: {results['avg_score']:.2f}\n")
        add(f"Score distribution: {results['score_distribution']}\n")
        add(f"Affected participants: {len(set(results['participants']))}\n\n")

        if results['keyword_frequencies']:
            add("Common problem indicators (keyword analysis):\n")
            sorted_keywords = sorted(results['keyword_frequencies'].items(),
                                     key=lambda x: x[1], reverse=True)
            for keyword, count in sorted_keywords:
                percentage = (count / results['total_cases']) * 100
                add(f"  '{keyword}': {count} cases ({percentage:.1f}%)\n")
            add("\n")
        else:
            add("No common problem indicators identified.\n\n")

        add("SAMPLE EXPLANATIONS:\n")
        add("." * 40 + "\n")
        for i, sample in enumerate(results['sample_explanations'], 1):
            add(f"{i}. Participant {sample['participant_id']} (Score: {sample['score']})\n")
            add(f"   {sample['explanation']}\n\n")

        add("\n")

    # MOST PROBLEMATIC PARTICIPANTS
    add("=" * 80 + "\n")
    add("PARTICIPANTS WITH MULTIPLE ISSUES\n")
    add("=" * 80 + "\n\n")

    participant_counts = low_scores_df.groupby('participant_id').size().sort_values(ascending=False)
    multi_problem = participant_counts[participant_counts > 1]

    if len(multi_problem) > 0:
        add(f"Found {len(multi_problem)} participants with multiple low scores:\n\n")
        for pid, count in multi_problem.items():
            participant_data = low_scores_df[low_scores_df['participant_id'] == pid]
            add(f"Participant {pid}: {count} low scores\n")
            for _, row in participant_data.iterrows():
                add(f"  - {row['metric']}: Score {row['score']}\n")
            add("\n")
    else:
        add("No participants have multiple low scores.\n")
        add("Each low score case is from a different participant.\n")

    return "".join(parts)

def main():
    """
    Run the full pipeline: extract low scores, analyze them, write the report

    Reads its paths and threshold from the module-level configuration
    constants and prints progress and a final summary to stdout.
    """
    banner = "=" * 80

    print(banner)
    print("LOW SCORE CASES ANALYZER")
    print(banner)
    print(f"Input file: {INPUT_CSV_PATH}")
    print(f"Output CSV: {OUTPUT_CSV_PATH}")
    print(f"Report file: {ANALYSIS_REPORT_PATH}")
    print(f"Score threshold: ≤ {THRESHOLD}")
    print()

    # Guard: nothing can be done without the input file
    if not os.path.exists(INPUT_CSV_PATH):
        print(f"ERROR: Input file not found: {INPUT_CSV_PATH}")
        print("Please update the INPUT_CSV_PATH variable with the correct path.")
        return

    # Step 1: pull out every (participant, metric) pair at or below the threshold
    print("Step 1: Extracting low score cases...")
    cases = extract_low_scores(INPUT_CSV_PATH, OUTPUT_CSV_PATH, THRESHOLD)

    # Guard: extraction failed or found nothing
    if cases is None or len(cases) == 0:
        print(f"\nNo low score cases found with threshold ≤ {THRESHOLD}")
        print("Analysis complete - no issues detected.")
        return

    # Step 2: look for recurring keywords in the explanations
    print("\nStep 2: Analyzing explanation patterns...")
    per_metric = analyze_explanations_by_metric(cases)

    # Step 3: write the text report
    print("\nStep 3: Creating comprehensive report...")
    create_detailed_report(cases, per_metric, ANALYSIS_REPORT_PATH)

    # Final summary
    case_total = len(cases)
    print("\n" + banner)
    print("ANALYSIS COMPLETE")
    print(banner)
    print("Files created:")
    print(f"1. Low score cases CSV: {OUTPUT_CSV_PATH}")
    print(f"   - Contains {case_total} low score cases")
    print(f"   - Columns: participant_id, metric, score, explanation")
    print(f"2. Analysis report: {ANALYSIS_REPORT_PATH}")
    print(f"   - Comprehensive analysis with patterns and samples")
    print()
    print("Key findings:")
    print(f"- {cases['participant_id'].nunique()} participants have scores ≤ {THRESHOLD}")
    print(f"- {case_total} total low score instances")

    # value_counts() is sorted by descending count, so the first entry is the worst metric
    by_metric = cases['metric'].value_counts()
    worst_metric = by_metric.index[0]
    worst_count = by_metric[worst_metric]
    print(f"- Most problematic metric: {worst_metric} ({worst_count} cases)")

# Run the pipeline only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()

LOW SCORE CASES ANALYZER
Input file: /data/users2/nblair7/analysis_results/eval_results_new.csv
Output CSV: /data/users2/nblair7/analysis_results/low_score_cases.csv
Report file: /data/users2/nblair7/analysis_results/low_score_analysis_report.txt
Score threshold: ≤ 2

Step 1: Extracting low score cases...
=== LOW SCORE EXTRACTOR (Threshold: ≤2) ===
Reading from: /data/users2/nblair7/analysis_results/eval_results_new.csv
Output will be saved to: /data/users2/nblair7/analysis_results/low_score_cases.csv
Loaded 142 participants from input file
Columns found: ['participant_id', 'coherence', 'coherence_explanation', 'completeness', 'completeness_explanation', 'specificity', 'specificity_explanation', 'accuracy', 'accuracy_explanation']

Scanning for low scores...
  Found: Participant 326, completeness = 2
  Found: Participant 326, specificity = 2
  Found: Participant 330, specificity = 2
  Found: Participant 336, accuracy = 2
  Found: Participant 340, completeness = 2
  Found: Participant 3