In [None]:
# COMPREHENSIVE EVALUATION OF ASL CLASSIFICATION RESULTS
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from collections import Counter
from datetime import datetime

def evaluate_asl_predictions(results_df):
    """
    Comprehensive evaluation of ASL classification results.

    Prints overall accuracy, per-letter accuracy (A-Z), the best/worst
    letters, the most common misclassifications and a classification
    report, then returns the computed metrics.

    Rows whose label or prediction is "?" or missing are counted as failed
    predictions and excluded from every accuracy figure.

    Args:
        results_df: DataFrame with 'label' (true) and 'prediction' columns

    Returns:
        Dictionary with keys 'overall_accuracy', 'per_class_accuracy',
        'failed_predictions', 'total_samples', 'valid_samples',
        'y_true_clean' and 'y_pred_clean'. When no row is valid (including
        an empty DataFrame), 'overall_accuracy' is 0.0 and
        'per_class_accuracy' is empty.
    """
    y_true = results_df['label']
    y_pred = results_df['prediction']

    # Remove any failed predictions ("?" or missing) for cleaner analysis.
    # Series.notna() is used so the function does not rely on a module-level
    # pandas alias being defined before this code runs.
    valid_mask = (y_pred != "?") & (y_true != "?") & y_pred.notna() & y_true.notna()
    y_true_clean = y_true[valid_mask]
    y_pred_clean = y_pred[valid_mask]

    total_samples = len(results_df)
    valid_samples = len(y_true_clean)
    failed_predictions = int((~valid_mask).sum())

    # Guard the percentage maths: an empty DataFrame would divide by zero.
    if total_samples:
        valid_pct = valid_samples / total_samples * 100
        failed_pct = failed_predictions / total_samples * 100
    else:
        valid_pct = failed_pct = 0.0

    print("="*60)
    print("🔍 ASL CLASSIFICATION EVALUATION RESULTS")
    print("="*60)

    # 1. OVERALL ACCURACY
    print("\n📊 OVERALL METRICS:")
    print(f"   Total Images: {total_samples}")
    print(f"   Valid Predictions: {valid_samples} ({valid_pct:.1f}%)")
    print(f"   Failed Predictions: {failed_predictions} ({failed_pct:.1f}%)")

    if valid_samples == 0:
        # Accuracy is undefined without valid rows; report 0.0 and skip the
        # remaining sections, which would have nothing to show.
        print("   ⚠️ No valid predictions to evaluate!")
        return {
            'overall_accuracy': 0.0,
            'per_class_accuracy': {},
            'failed_predictions': failed_predictions,
            'total_samples': total_samples,
            'valid_samples': valid_samples,
            'y_true_clean': y_true_clean,
            'y_pred_clean': y_pred_clean
        }

    overall_accuracy = accuracy_score(y_true_clean, y_pred_clean)
    print(f"   Overall Accuracy: {overall_accuracy:.3f} ({overall_accuracy*100:.1f}%)")

    # Row-wise correctness, computed once and reused by the sections below.
    is_correct = y_true_clean == y_pred_clean

    # 2. PER-CLASS ACCURACY (only letters A-Z that occur as true labels)
    print("\n📈 PER-LETTER ACCURACY:")
    per_class_accuracy = {}
    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        letter_mask = y_true_clean == letter
        sample_count = letter_mask.sum()
        if sample_count > 0:
            letter_accuracy = is_correct[letter_mask].mean()
            per_class_accuracy[letter] = letter_accuracy
            print(f"   {letter}: {letter_accuracy:.3f} ({letter_accuracy*100:.1f}%) - {sample_count} samples")

    # 3. BEST AND WORST PERFORMING LETTERS
    if per_class_accuracy:
        sorted_accuracy = sorted(per_class_accuracy.items(), key=lambda x: x[1], reverse=True)
        print("\n🏆 BEST PERFORMING LETTERS:")
        for letter, acc in sorted_accuracy[:5]:
            print(f"   {letter}: {acc:.3f} ({acc*100:.1f}%)")

        print("\n⚠️  WORST PERFORMING LETTERS:")
        for letter, acc in sorted_accuracy[-5:]:
            print(f"   {letter}: {acc:.3f} ({acc*100:.1f}%)")

    # 4. CONFUSION ANALYSIS
    print("\n🔄 MOST COMMON MISCLASSIFICATIONS:")
    wrong = ~is_correct
    # Pair up (true, predicted) for the wrong rows only; zip keeps row order,
    # so ties in most_common() resolve the same way as a row-by-row scan.
    confusion_pairs = Counter(zip(y_true_clean[wrong], y_pred_clean[wrong]))

    for (true_letter, pred_letter), count in confusion_pairs.most_common(10):
        print(f"   {true_letter} → {pred_letter}: {count} times")

    # 5. DETAILED CLASSIFICATION REPORT
    print("\n📋 DETAILED CLASSIFICATION REPORT:")
    # Best effort: a failure here must not discard the metrics computed above.
    try:
        report = classification_report(y_true_clean, y_pred_clean, zero_division=0)
        print(report)
    except Exception as e:
        print(f"Could not generate classification report: {e}")

    return {
        'overall_accuracy': overall_accuracy,
        'per_class_accuracy': per_class_accuracy,
        'failed_predictions': failed_predictions,
        'total_samples': total_samples,
        'valid_samples': valid_samples,
        'y_true_clean': y_true_clean,
        'y_pred_clean': y_pred_clean
    }

def plot_evaluation_results(results_df, save_plots=True):
    """
    Draw a 2x2 figure summarising the evaluation results.

    Panels: confusion matrix, per-letter accuracy bars, sample counts per
    true letter, and a text box with summary statistics. Rows whose label
    or prediction is "?" or missing are ignored; if nothing remains, a
    warning is printed and the function returns without plotting.

    Args:
        results_df: DataFrame with 'label' (true) and 'prediction' columns
        save_plots: when true, the figure is also written to a timestamped
            PNG file in the current working directory
    """
    labels_col = results_df['label']
    preds_col = results_df['prediction']

    # Keep only rows where both the label and the prediction are usable
    usable = (preds_col != "?") & (labels_col != "?") & preds_col.notna() & labels_col.notna()
    actual = labels_col[usable]
    predicted = preds_col[usable]

    if len(actual) == 0:
        print("⚠️ No valid predictions to plot!")
        return

    # One figure, four panels; name each axis once instead of indexing repeatedly
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    ax_matrix, ax_accuracy = axes[0, 0], axes[0, 1]
    ax_counts, ax_summary = axes[1, 0], axes[1, 1]
    fig.suptitle('ASL Classification Evaluation Results', fontsize=16, fontweight='bold')

    # Panel 1: confusion matrix over every class seen as a label or a prediction
    class_labels = sorted(set(actual) | set(predicted))
    matrix = confusion_matrix(actual, predicted, labels=class_labels)

    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax_matrix)
    ax_matrix.set_title('Confusion Matrix')
    ax_matrix.set_xlabel('Predicted')
    ax_matrix.set_ylabel('Actual')

    # Panel 2: accuracy per class, restricted to classes that occur as true labels
    hits = actual == predicted
    accuracy_by_class = {}
    for symbol in class_labels:
        rows_for_symbol = actual == symbol
        if rows_for_symbol.any():
            accuracy_by_class[symbol] = hits[rows_for_symbol].mean()

    if accuracy_by_class:
        names = list(accuracy_by_class)
        scores = list(accuracy_by_class.values())

        bars = ax_accuracy.bar(names, scores, color='skyblue', alpha=0.7)
        ax_accuracy.set_title('Per-Letter Accuracy')
        ax_accuracy.set_xlabel('Letters')
        ax_accuracy.set_ylabel('Accuracy')
        ax_accuracy.set_ylim(0, 1)
        ax_accuracy.tick_params(axis='x', rotation=45)

        # Annotate every non-zero bar with its value
        for bar, score in zip(bars, scores):
            height = bar.get_height()
            if height > 0:
                ax_accuracy.text(bar.get_x() + bar.get_width()/2, height + 0.01,
                                 f'{score:.2f}', ha='center', va='bottom', fontsize=8)

    # Panel 3: how many valid samples each true letter contributes
    samples_per_letter = actual.value_counts().sort_index()
    ax_counts.bar(samples_per_letter.index, samples_per_letter.values,
                  color='lightgreen', alpha=0.7)
    ax_counts.set_title('Sample Distribution by Letter')
    ax_counts.set_xlabel('Letters')
    ax_counts.set_ylabel('Number of Samples')
    ax_counts.tick_params(axis='x', rotation=45)

    # Panel 4: headline numbers rendered as a text box
    accuracy = accuracy_score(actual, predicted)
    n_failed = len(results_df) - len(actual)

    summary_lines = [
        "Overall Statistics:",
        "    ",
        f"Total Images: {len(results_df)}",
        f"Valid Predictions: {len(actual)}",
        f"Failed Predictions: {n_failed}",
        f"Overall Accuracy: {accuracy:.3f}",
        "",
        f"Letters Evaluated: {len(class_labels)}",
        f"Unique True Labels: {len(set(actual))}",
        f"Unique Predictions: {len(set(predicted))}",
    ]
    summary_text = "\n".join(summary_lines)

    ax_summary.text(0.1, 0.5, summary_text, transform=ax_summary.transAxes,
                    fontsize=12, verticalalignment='center',
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue", alpha=0.5))
    ax_summary.set_xlim(0, 1)
    ax_summary.set_ylim(0, 1)
    ax_summary.axis('off')
    ax_summary.set_title('Summary Statistics')

    plt.tight_layout()

    if save_plots:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_name = f'asl_evaluation_{stamp}.png'
        plt.savefig(output_name, dpi=300, bbox_inches='tight')
        print(f"📊 Plots saved as: {output_name}")

    plt.show()

def create_detailed_report(results_df, save_report=True):
    """
    Evaluate the results and optionally write a plain-text report.

    The metrics come from evaluate_asl_predictions(results_df). When
    save_report is true, a summary plus the per-class accuracies is written
    to a timestamped 'asl_evaluation_report_<timestamp>.txt' file in the
    current working directory.

    Args:
        results_df: DataFrame with 'label' (true) and 'prediction' columns
        save_report: whether to write the report file

    Returns:
        The metrics dictionary returned by evaluate_asl_predictions
    """
    metrics = evaluate_asl_predictions(results_df)

    if not save_report:
        return metrics

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filename = f'asl_evaluation_report_{stamp}.txt'

    with open(report_filename, 'w') as out:
        # Assemble the whole report first, then emit it in one call
        lines = [
            "ASL CLASSIFICATION EVALUATION REPORT\n",
            "="*50 + "\n\n",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n",
            "SUMMARY METRICS:\n",
            f"- Total Images: {metrics['total_samples']}\n",
            f"- Valid Predictions: {metrics['valid_samples']}\n",
            f"- Failed Predictions: {metrics['failed_predictions']}\n",
            f"- Overall Accuracy: {metrics['overall_accuracy']:.3f}\n\n",
        ]

        per_class = metrics['per_class_accuracy']
        if per_class:
            lines.append("PER-CLASS ACCURACY:\n")
            lines.extend(
                f"- {letter}: {acc:.3f}\n"
                for letter, acc in sorted(per_class.items())
            )

        out.writelines(lines)

    print(f"📄 Detailed report saved as: {report_filename}")
    return metrics

# Usage guide: shown on stdout when this code runs, listing the evaluation
# helpers defined above in the order they are normally called.
_USAGE_GUIDE = """
🚀 HOW TO USE THESE EVALUATION FUNCTIONS:

1. After running your classification pipeline:
   results_df = classification_pipeline(img_df)

2. Get comprehensive evaluation:
   metrics = evaluate_asl_predictions(results_df)

3. Create visualizations:
   plot_evaluation_results(results_df, save_plots=True)

4. Generate detailed report:
   create_detailed_report(results_df, save_report=True)

5. Quick accuracy check:
   accuracy = (results_df['label'] == results_df['prediction']).mean()
   print(f'Quick Accuracy: {accuracy:.3f}')
"""
print(_USAGE_GUIDE)

In [None]:
import os
from pathlib import Path
import pandas as pd
from typing import Dict
 
# The 26 class folder names expected under each split directory.
ALPHABETS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")


def count_images_per_split(dataset_dir: str) -> pd.DataFrame:
    """
    Tally image files per ASL letter for the train and test splits.

    Each split is expected at '<dataset_dir>/train' and '<dataset_dir>/test',
    with one sub-folder per upper-case letter. Only regular files ending in
    .jpg, .jpeg or .png (case-insensitive) are counted. A missing split or
    letter folder contributes a count of 0.

    Args:
        dataset_dir: Root dataset directory containing 'train' and 'test' subfolders.

    Returns:
        DataFrame with one row per letter A-Z and columns:
        ['Letter', 'Train', 'Test', 'Total']
    """
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    root = Path(dataset_dir)

    def tally(split_dir: Path) -> Dict[str, int]:
        # Start every letter at zero so absent folders still appear in the result
        per_letter: Dict[str, int] = dict.fromkeys(ALPHABETS, 0)
        if split_dir.exists():
            for letter in ALPHABETS:
                folder = split_dir / letter
                # is_dir() is already False for paths that do not exist
                if folder.is_dir():
                    images = [
                        entry for entry in folder.iterdir()
                        if entry.is_file() and entry.suffix.lower() in image_suffixes
                    ]
                    per_letter[letter] = len(images)
        return per_letter

    in_train = tally(root / 'train')
    in_test = tally(root / 'test')

    train_column = [in_train[letter] for letter in ALPHABETS]
    test_column = [in_test[letter] for letter in ALPHABETS]

    return pd.DataFrame({
        'Letter': list(ALPHABETS),
        'Train': train_column,
        'Test': test_column,
        'Total': [tr + te for tr, te in zip(train_column, test_column)],
    })

