In [122]:
import sys
import os
from pathlib import Path
import numpy as np
import pickle
import json
from typing import Dict, List, Tuple, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, average_precision_score
from scipy import stats
from tqdm import tqdm
import pandas as pd
from datetime import datetime

In [123]:
# Current working directory of the kernel; used below as the starting point
# for the upward search for the project root.
# NOTE(review): presumably this is the notebook's own folder — confirm when the
# kernel is launched from a different directory.
NOTEBOOK_DIR = Path.cwd()

def find_project_root(start: Path, target_folder="RIG"):
    """Return the closest directory named ``target_folder`` at or above ``start``.

    The search begins with ``start`` itself and then walks its parents from
    nearest to farthest, so the innermost matching directory wins.

    Args:
        start: Path from which the upward search begins.
        target_folder: Directory name to look for.

    Returns:
        The first path (``start`` or an ancestor) whose final component equals
        ``target_folder``.

    Raises:
        RuntimeError: If neither ``start`` nor any ancestor has that name.
    """
    candidates = (start, *start.parents)
    match = next((path for path in candidates if path.name == target_folder), None)
    if match is None:
        raise RuntimeError(f"Could not find project root '{target_folder}'")
    return match

# Locate the project root from the working directory; every output location
# below is derived from it.
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
OUTPUT_ROOT = PROJECT_ROOT.joinpath("output", "v0")

print(f"Project Root: {PROJECT_ROOT}")
print(f"Output Root: {OUTPUT_ROOT}")

# Lower-case path handles; embeddings_root is read by load_embeddings().
output_root = Path(OUTPUT_ROOT)
embeddings_root = output_root.joinpath('embeddings')
results_dir = output_root.joinpath('evaluation_results')
plots_dir = results_dir.joinpath('plots')
# parents=True also creates results_dir (and anything above it) if missing.
plots_dir.mkdir(parents=True, exist_ok=True)

# Model identifiers under evaluation.
# NOTE(review): presumably each name is a sub-directory of embeddings_root — confirm.
models = [
    'adaface_ir_50', 'adaface_ir_101',
    'arcface_ir_50', 'arcface_ir_101',
]

Project Root: d:\KEVIN\0SLC\RIG
Output Root: d:\KEVIN\0SLC\RIG\output\v0


In [124]:
def load_embeddings(model_name: str) -> Dict:
    """Load every known embedding pickle for one model.

    Looks inside ``embeddings_root / model_name`` for a fixed set of gallery
    and probe pickle files.

    Args:
        model_name: Name of the model sub-directory under ``embeddings_root``.

    Returns:
        Dict keyed by embedding-set name. Each value is the unpickled object,
        or ``None`` when the corresponding file does not exist.

    Raises:
        FileNotFoundError: If the model directory itself is missing.
    """
    model_dir = embeddings_root / model_name
    if not model_dir.exists():
        raise FileNotFoundError(f"Model directory not found: {model_dir}")

    # Result key -> file name on disk (the files use hyphenated "one-shot"/"few-shot").
    expected_files = {
        'gallery_oneshot_base': 'gallery_one-shot_base.pkl',
        'gallery_oneshot_augmented': 'gallery_one-shot_augmented.pkl',
        'gallery_fewshot_base': 'gallery_few-shot_base.pkl',
        'gallery_fewshot_augmented': 'gallery_few-shot_augmented.pkl',
        'probe_positive_unsegmented': 'probe_positive_unsegmented.pkl',
        'probe_positive_segmented': 'probe_positive_segmented.pkl',
        'probe_negative': 'probe_negative.pkl'
    }

    loaded = {}
    for key, filename in expected_files.items():
        candidate = model_dir / filename
        if not candidate.exists():
            # Missing sets are recorded explicitly so callers can test for None.
            loaded[key] = None
            continue
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load embedding files produced by a trusted pipeline.
        with open(candidate, 'rb') as handle:
            loaded[key] = pickle.load(handle)

    return loaded


In [125]:
def cosine_similarity(emb1: np.ndarray, emb2: np.ndarray) -> float:
    """Cosine similarity between two embedding vectors.

    Args:
        emb1: First embedding vector.
        emb2: Second embedding vector.

    Returns:
        The cosine of the angle between the vectors. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        NaN so that downstream sorting and aggregation stay well-defined.
    """
    norm1 = np.linalg.norm(emb1)
    norm2 = np.linalg.norm(emb2)

    # A zero vector would make the division below 0/0 (NaN + RuntimeWarning).
    if norm1 == 0 or norm2 == 0:
        return 0.0

    dot = np.dot(emb1, emb2)
    # Shortcut: when both norms are within 1% of unit length the raw dot
    # product is returned without renormalising.
    if abs(norm1 - 1.0) < 0.01 and abs(norm2 - 1.0) < 0.01:
        return dot
    return dot / (norm1 * norm2)

def compute_all_similarities(probe_emb: np.ndarray, 
                            gallery_embeddings: Dict[str, Dict]) -> List[Tuple[str, float]]:
    """Score one probe against every individual gallery embedding.

    Args:
        probe_emb: Embedding of the probe.
        gallery_embeddings: Mapping of identity name to a dict whose
            ``'embeddings'`` entry is an iterable of gallery embeddings.

    Returns:
        A flat list of ``(identity_name, similarity)`` pairs, one per gallery
        embedding, in gallery iteration order.
    """
    return [
        (name, cosine_similarity(probe_emb, gallery_emb))
        for name, data in gallery_embeddings.items()
        for gallery_emb in data['embeddings']
    ]

In [126]:
def aggregate_max(similarities: List[float]) -> float:
    """Return the largest similarity, or ``-1`` when the list is empty."""
    if similarities:
        return max(similarities)
    return -1

def aggregate_mean(similarities: List[float]) -> float:
    """Return the arithmetic mean of the similarities, or ``-1`` when empty."""
    if similarities:
        return np.mean(similarities)
    return -1

def aggregate_topk(similarities: List[float], k: int = 3) -> float:
    """Return the mean of the ``k`` highest similarities.

    When fewer than ``k`` values are available, all of them are averaged.
    An empty list yields ``-1``.
    """
    if not similarities:
        return -1
    descending = sorted(similarities, reverse=True)
    # Slicing clamps to the list length, so no explicit min(k, len) is needed.
    return np.mean(descending[:k])

In [127]:
def identify_probe(probe_embedding: np.ndarray, 
                   gallery_embeddings: Dict[str, Dict],
                   threshold: float,
                   aggregation: str = 'mean',
                   k: int = 3) -> Tuple[Optional[str], float, Dict[str, float]]:
    """Match one probe embedding against the gallery.

    Each identity receives a single score obtained by aggregating the probe's
    cosine similarities to all of that identity's gallery embeddings.

    Args:
        probe_embedding: Embedding of the probe.
        gallery_embeddings: Mapping of identity name to a dict whose
            ``'embeddings'`` entry is an iterable of gallery embeddings.
        threshold: Minimum best score required to accept the match.
        aggregation: ``'max'``, ``'mean'`` or ``'topk'``. Any other value
            falls back to max aggregation.
        k: Number of top similarities averaged by ``'topk'``.

    Returns:
        ``(name, best_score, identity_scores)``. ``name`` is ``None`` when the
        best score is below ``threshold``. An empty gallery yields
        ``(None, -1, {})``.
    """
    def aggregate(similarities):
        if aggregation == 'mean':
            return aggregate_mean(similarities)
        if aggregation == 'topk':
            return aggregate_topk(similarities, k)
        # 'max' and every unrecognised aggregation name end up here.
        return aggregate_max(similarities)

    identity_scores = {
        name: aggregate(
            [cosine_similarity(probe_embedding, g_emb) for g_emb in data['embeddings']]
        )
        for name, data in gallery_embeddings.items()
    }

    if not identity_scores:
        return None, -1, {}

    ranked = sorted(identity_scores.items(), key=lambda item: item[1], reverse=True)
    best_name, best_score = ranked[0]

    accepted_name = None if best_score < threshold else best_name
    return accepted_name, best_score, identity_scores

In [128]:
def compute_rank_metrics(identity_scores: Dict[str, float], 
                        true_identity: str,
                        ranks: List[int] = [1, 5, 10]) -> Dict[str, bool]:
    """Rank-k hit flags and reciprocal rank for one probe.

    Args:
        identity_scores: Mapping of identity name to its aggregated score.
        true_identity: Ground-truth identity of the probe.
        ranks: Cut-offs for which a ``'rank{k}'`` flag is produced.

    Returns:
        Dict with one boolean per requested rank (True when the true identity
        is among the top-k scores) plus ``'reciprocal_rank'``, which is
        ``1 / position`` of the true identity or ``0.0`` when it is absent.
    """
    # Identity names ordered from best to worst score.
    ordered_names = [
        name
        for name, _ in sorted(identity_scores.items(), key=lambda item: item[1], reverse=True)
    ]

    rank_results = {f'rank{k}': true_identity in ordered_names[:k] for k in ranks}

    if true_identity in ordered_names:
        position = ordered_names.index(true_identity) + 1
        rank_results['reciprocal_rank'] = 1.0 / position
    else:
        rank_results['reciprocal_rank'] = 0.0

    return rank_results

def compute_dprime(genuine_scores: List[float], impostor_scores: List[float]) -> float:
    """Separation (d-prime) between genuine and impostor score distributions.

    Computed as the difference of the two means divided by the root of the
    average of the two population variances.

    Returns:
        The d-prime value, or ``0.0`` when either list is empty or the pooled
        standard deviation is zero.
    """
    if not (genuine_scores and impostor_scores):
        return 0.0

    genuine_var = np.std(genuine_scores) ** 2
    impostor_var = np.std(impostor_scores) ** 2
    pooled_std = np.sqrt((genuine_var + impostor_var) / 2)

    if pooled_std == 0:
        return 0.0

    mean_gap = np.mean(genuine_scores) - np.mean(impostor_scores)
    return mean_gap / pooled_std

def bootstrap_confidence_interval(data: List[float], 
                                 n_bootstrap: int = 1000, 
                                 confidence: float = 0.95,
                                 seed: Optional[int] = None) -> Tuple[float, float]:
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: Sample values (a list or a 1-D array).
        n_bootstrap: Number of resamples drawn with replacement.
        confidence: Two-sided confidence level, e.g. ``0.95``.
        seed: Optional seed for a private random generator, which makes the
            interval reproducible. When ``None`` the global ``np.random``
            state is used, exactly as before this parameter existed.

    Returns:
        ``(lower, upper)`` percentile bounds of the bootstrap means, or
        ``(0.0, 0.0)`` when ``data`` is empty.
    """
    # len() instead of truthiness so ndarray input is accepted as well.
    if len(data) == 0:
        return (0.0, 0.0)

    # Convert once; np.random.choice would otherwise re-convert a list on
    # every one of the n_bootstrap iterations.
    values = np.asarray(data)
    n = len(values)

    sampler = np.random if seed is None else np.random.RandomState(seed)

    bootstrap_means = [
        np.mean(sampler.choice(values, size=n, replace=True))
        for _ in range(n_bootstrap)
    ]

    alpha = 1 - confidence
    lower = np.percentile(bootstrap_means, alpha / 2 * 100)
    upper = np.percentile(bootstrap_means, (1 - alpha / 2) * 100)

    return (lower, upper)

In [129]:

def evaluate_probes_comprehensive(gallery_embeddings: Dict[str, Dict],
                                 probe_embeddings: Dict[str, Dict],
                                 thresholds: List[float],
                                 aggregation: str = 'mean',
                                 k: int = 3) -> Dict:
    """Evaluate genuine probes against a gallery over a sweep of thresholds.

    Every probe embedding is matched once; the threshold sweep then re-uses
    the stored best score and predicted identity.

    Args:
        gallery_embeddings: Mapping of identity name to a dict with an
            ``'embeddings'`` iterable.
        probe_embeddings: Either a mapping of true identity to a dict with an
            ``'embeddings'`` iterable, or a dict holding such a mapping under
            the key ``"all"``.
        thresholds: Acceptance thresholds to sweep.
        aggregation: Aggregation name forwarded to ``identify_probe``.
        k: Top-k size forwarded to ``identify_probe``.

    Returns:
        Dict with a per-threshold ``'threshold_results'`` DataFrame, ROC / AP /
        d-prime summaries, raw genuine and impostor scores with bootstrap CIs,
        and the per-probe ``'all_predictions'`` records.

        Per threshold, ``tp`` counts accepted probes whose top-1 identity is
        correct, ``fp`` accepted probes whose top-1 identity is wrong, and
        ``fn`` every probe whose best score is below the threshold. ``tar``,
        ``far`` and ``frr`` are those counts divided by the number of probes.
        Rank-k accuracy and MRR do not depend on the threshold and are
        therefore identical in every row.
    """
    probe_data = probe_embeddings.get("all", probe_embeddings)
    all_predictions = []
    genuine_scores = []
    impostor_scores = []

    for true_name, data in tqdm(probe_data.items(), desc=f"Processing probes ({aggregation})"):
        for probe_emb in data['embeddings']:
            # No rejection at this stage: thresholds are applied in the sweep
            # below. A finite cut-off here (previously 0.0) would turn a
            # correct top-1 match with a negative score into None, which the
            # sweep and the ROC labels would then count as a wrong match.
            predicted_name, best_score, identity_scores = identify_probe(
                probe_emb, gallery_embeddings, threshold=float('-inf'),
                aggregation=aggregation, k=k
            )

            rank_metrics = compute_rank_metrics(identity_scores, true_name)

            all_predictions.append({
                'true_identity': true_name,
                'predicted_identity': predicted_name,
                'score': best_score,
                'identity_scores': identity_scores,
                'rank_metrics': rank_metrics
            })

            # Score of the probe's own identity -> genuine; all others -> impostor.
            if true_name in identity_scores:
                genuine_scores.append(identity_scores[true_name])

            for name, score in identity_scores.items():
                if name != true_name:
                    impostor_scores.append(score)

    n_probes = len(all_predictions)

    # Rank-based metrics are independent of the threshold, so compute them once.
    rank1_correct = sum(p['rank_metrics']['rank1'] for p in all_predictions)
    rank5_correct = sum(p['rank_metrics']['rank5'] for p in all_predictions)
    rank10_correct = sum(p['rank_metrics']['rank10'] for p in all_predictions)
    mrr_sum = sum(p['rank_metrics']['reciprocal_rank'] for p in all_predictions)

    rank1_acc = rank1_correct / n_probes if n_probes > 0 else 0
    rank5_acc = rank5_correct / n_probes if n_probes > 0 else 0
    rank10_acc = rank10_correct / n_probes if n_probes > 0 else 0
    mrr = mrr_sum / n_probes if n_probes > 0 else 0

    threshold_results = []

    for threshold in thresholds:
        tp = fp = fn = 0
        correct_scores = []
        incorrect_scores = []

        for pred in all_predictions:
            score = pred['score']

            if score >= threshold:
                if pred['predicted_identity'] == pred['true_identity']:
                    tp += 1
                    correct_scores.append(score)
                else:
                    fp += 1
                    incorrect_scores.append(score)
            else:
                fn += 1

        far = fp / n_probes if n_probes > 0 else 0
        frr = fn / n_probes if n_probes > 0 else 0
        tar = tp / n_probes if n_probes > 0 else 0

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        threshold_results.append({
            'threshold': threshold,
            'rank1_accuracy': rank1_acc,
            'rank5_accuracy': rank5_acc,
            'rank10_accuracy': rank10_acc,
            'mrr': mrr,
            'tar': tar,
            'far': far,
            'frr': frr,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'tp': tp,
            'fp': fp,
            'fn': fn,
            'n_probes': n_probes,
            'avg_correct_score': np.mean(correct_scores) if correct_scores else 0,
            'avg_incorrect_score': np.mean(incorrect_scores) if incorrect_scores else 0,
        })

    # ROC / AP treat "top-1 identity is correct" as the positive class and the
    # best score as the decision value.
    y_true = [
        1 if pred['predicted_identity'] == pred['true_identity'] else 0
        for pred in all_predictions
    ]
    y_scores = [pred['score'] for pred in all_predictions]

    if y_true:
        fpr, tpr, _ = roc_curve(y_true, y_scores)
        roc_auc = auc(fpr, tpr)
        avg_precision = average_precision_score(y_true, y_scores)
    else:
        # roc_curve cannot handle an empty label list; report "undefined".
        fpr = np.array([])
        tpr = np.array([])
        roc_auc = float('nan')
        avg_precision = float('nan')

    dprime = compute_dprime(genuine_scores, impostor_scores)

    genuine_ci = bootstrap_confidence_interval(genuine_scores)
    impostor_ci = bootstrap_confidence_interval(impostor_scores)

    return {
        'threshold_results': pd.DataFrame(threshold_results),
        'roc_auc': roc_auc,
        'average_precision': avg_precision,
        'dprime': dprime,
        'genuine_scores': genuine_scores,
        'impostor_scores': impostor_scores,
        'genuine_ci': genuine_ci,
        'impostor_ci': impostor_ci,
        'fpr': fpr,
        'tpr': tpr,
        'aggregation': aggregation,
        'all_predictions': all_predictions
    }

In [130]:

def evaluate_impostors_comprehensive(gallery_embeddings: Dict[str, Dict],
                                    impostor_embeddings: Dict[str, Dict],
                                    thresholds: List[float],
                                    aggregation: str = 'mean',
                                    k: int = 3) -> Dict:
    """Measure how often impostor probes would be accepted at each threshold.

    Each impostor embedding is scored against the gallery; only its best
    aggregated score is kept.

    Args:
        gallery_embeddings: Mapping of identity name to a dict with an
            ``'embeddings'`` iterable.
        impostor_embeddings: Mapping of a dataset name to a dict with an
            ``'embeddings'`` iterable of impostor embeddings.
        thresholds: Acceptance thresholds to sweep.
        aggregation: Aggregation name forwarded to ``identify_probe``.
        k: Top-k size forwarded to ``identify_probe``.

    Returns:
        Dict with a per-threshold ``'threshold_results'`` DataFrame
        (rejection rate, FAR, counts), the raw best scores, their bootstrap CI
        and mean / standard deviation. With no impostors all rates and
        statistics are 0, matching the zero fallbacks used for the rates and
        the confidence interval.
    """
    impostor_scores = []

    for _dataset_name, data in tqdm(impostor_embeddings.items(), desc=f"Processing impostors ({aggregation})"):
        for impostor_emb in data['embeddings']:
            # Only the best score is needed; the threshold passed here affects
            # just the (discarded) predicted name.
            _, score, _ = identify_probe(
                impostor_emb, gallery_embeddings, threshold=0.0,
                aggregation=aggregation, k=k
            )
            impostor_scores.append(score)

    # Threshold-independent quantities, computed once instead of per threshold.
    n_impostors = len(impostor_scores)
    scores_arr = np.asarray(impostor_scores, dtype=float)
    mean_score = np.mean(scores_arr) if n_impostors > 0 else 0.0
    std_score = np.std(scores_arr) if n_impostors > 0 else 0.0

    threshold_results = []

    for threshold in thresholds:
        tn = int(np.count_nonzero(scores_arr < threshold))
        fp = int(np.count_nonzero(scores_arr >= threshold))

        rejection_rate = tn / n_impostors if n_impostors > 0 else 0
        far = fp / n_impostors if n_impostors > 0 else 0

        threshold_results.append({
            'threshold': threshold,
            'rejection_rate': rejection_rate,
            'far': far,
            'tn': tn,
            'fp': fp,
            'n_impostors': n_impostors,
            'avg_impostor_score': mean_score
        })

    impostor_ci = bootstrap_confidence_interval(impostor_scores)

    return {
        'threshold_results': pd.DataFrame(threshold_results),
        'impostor_scores': impostor_scores,
        'impostor_ci': impostor_ci,
        'mean_impostor_score': mean_score,
        'std_impostor_score': std_score,
        'aggregation': aggregation
    }

In [131]:
def evaluate_segmented_comprehensive(gallery_embeddings: Dict[str, Dict],
                                    probe_embeddings: Dict[str, Dict],
                                    thresholds: List[float],
                                    aggregation: str = 'mean',
                                    k: int = 3) -> Dict[str, Dict]:  
    """Run the full probe evaluation separately for each probe segment.

    Every key of ``probe_embeddings`` other than ``'all'`` is treated as one
    segment and evaluated with ``evaluate_probes_comprehensive``.

    Args:
        gallery_embeddings: Gallery passed through to the probe evaluation.
        probe_embeddings: Mapping of segment name to that segment's probes.
        thresholds: Acceptance thresholds to sweep.
        aggregation: Aggregation name passed through.
        k: Top-k size passed through.

    Returns:
        Mapping of segment name to the result dict of that segment.
    """
    segments = [segment for segment in probe_embeddings.keys() if segment != 'all']

    print(f"Found {len(segments)} segments: {segments}")

    segment_results = {}
    for segment_name in tqdm(segments, desc=f"Processing segments ({aggregation})"):
        # Wrap the segment under 'all' so the probe evaluation unpacks it.
        wrapped_probes = {'all': probe_embeddings[segment_name]}
        segment_results[segment_name] = evaluate_probes_comprehensive(
            gallery_embeddings, wrapped_probes, thresholds,
            aggregation=aggregation, k=k
        )

    return segment_results


In [132]:

def generate_comparison_summary(all_model_results: Dict) -> pd.DataFrame:
    """Build one summary row per (model, gallery, aggregation) run.

    For each run, the threshold row with the highest Rank-1 accuracy (first
    one on ties) supplies the threshold-dependent metrics; ROC-AUC and
    d-prime come from the run-level results.

    Args:
        all_model_results: Mapping of model name to its results; the
            ``'basic_probe'`` entry maps gallery name -> aggregation -> run.

    Returns:
        DataFrame with one row per run; empty when there are no runs.
    """
    def iter_runs():
        # Flatten model -> gallery -> aggregation into a single stream.
        for model_name, model_data in all_model_results.items():
            for gallery_name, per_aggregation in model_data.get('basic_probe', {}).items():
                for agg_method, run in per_aggregation.items():
                    yield model_name, gallery_name, agg_method, run

    rows = []
    for model_name, gallery_name, agg_method, run in iter_runs():
        sweep = run['threshold_results']
        top = sweep.loc[sweep['rank1_accuracy'].idxmax()]
        rows.append({
            'Model': model_name,
            'Gallery': gallery_name,
            'Aggregation': agg_method,
            'Rank-1': top['rank1_accuracy'],
            'Rank-5': top['rank5_accuracy'],
            'Rank-10': top['rank10_accuracy'],
            'MRR': top['mrr'],
            'ROC-AUC': run['roc_auc'],
            'd-prime': run['dprime'],
            'Best_Threshold': top['threshold'],
            'F1-Score': top['f1_score'],
            'TAR': top['tar'],
            'FAR': top['far']
        })

    return pd.DataFrame(rows)


In [133]:

def create_segmented_comparison_table(all_model_results: Dict, 
                                     gallery_type: str = 'oneshot') -> pd.DataFrame:
    """Create a model x segment table of best Rank-1 accuracy.

    Args:
        all_model_results: Mapping of model name to its results. Segment
            results are read from the ``'segmented_{gallery_type}'`` entry,
            which maps segment name to a result dict; models without that
            entry are skipped.
        gallery_type: Suffix selecting which segmented evaluation to use.

    Returns:
        DataFrame indexed by model with one column per segment plus
        ``'Mean'``, ``'Std'``, ``'Min'`` and ``'Max'``. All four summary
        columns are computed over the segment columns only. When no model has
        segmented results, an empty frame with just the summary columns is
        returned.
    """
    seg_key = f'segmented_{gallery_type}'
    segment_data = []

    for model_name, model_data in all_model_results.items():
        if seg_key not in model_data:
            continue

        for segment_name, results in model_data[seg_key].items():
            df = results['threshold_results']
            best_idx = df['rank1_accuracy'].idxmax()

            segment_data.append({
                'Model': model_name,
                'Segment': segment_name,
                'Rank-1': df.loc[best_idx, 'rank1_accuracy'],
                'ROC-AUC': results['roc_auc'],
                'd-prime': results['dprime'],
                'MRR': df.loc[best_idx, 'mrr']
            })

    summary_columns = ['Mean', 'Std', 'Min', 'Max']
    if not segment_data:
        # pivot() would fail with a KeyError on a frame without columns.
        return pd.DataFrame(columns=summary_columns)

    # Pivot for better visualization
    pivot_rank1 = pd.DataFrame(segment_data).pivot(
        index='Model', columns='Segment', values='Rank-1'
    )

    # Take the statistics from a snapshot of the segment columns so that an
    # already-added summary column (e.g. 'Mean') never leaks into 'Std'.
    segment_values = pivot_rank1.copy()
    pivot_rank1['Mean'] = segment_values.mean(axis=1)
    pivot_rank1['Std'] = segment_values.std(axis=1)
    pivot_rank1['Min'] = segment_values.min(axis=1)
    pivot_rank1['Max'] = segment_values.max(axis=1)

    return pivot_rank1


In [134]:
def analyze_gallery_strategies(all_model_results: Dict) -> pd.DataFrame:
    """Compare oneshot vs fewshot and base vs augmented galleries per model.

    For every gallery configuration the best Rank-1 accuracy over all
    aggregation methods and thresholds is taken; the differences between
    configurations are reported as improvements.

    Args:
        all_model_results: Mapping of model name to its results; the
            ``'basic_probe'`` entry maps gallery name -> aggregation -> run.

    Returns:
        DataFrame with one row per model. Configurations that were not
        evaluated count as 0. For a model with no ``'basic_probe'`` results
        at all, ``Best_Config`` is ``None`` and ``Best_Rank1`` is 0.
    """
    comparison_data = []

    for model_name, model_data in all_model_results.items():
        basic_results = model_data.get('basic_probe', {})

        # Get best performance for each configuration
        configs = {}
        for gallery_name, gallery_results in basic_results.items():
            best_rank1 = 0
            best_agg = None
            for agg_method, results in gallery_results.items():
                rank1 = results['threshold_results']['rank1_accuracy'].max()
                if rank1 > best_rank1:
                    best_rank1 = rank1
                    best_agg = agg_method
            configs[gallery_name] = {'rank1': best_rank1, 'agg': best_agg}

        # Calculate improvements
        oneshot_base = configs.get('oneshot_base', {}).get('rank1', 0)
        oneshot_aug = configs.get('oneshot_augmented', {}).get('rank1', 0)
        fewshot_base = configs.get('fewshot_base', {}).get('rank1', 0)
        fewshot_aug = configs.get('fewshot_augmented', {}).get('rank1', 0)

        # max() on an empty mapping raises; a model without results gets
        # neutral values instead of aborting the whole comparison.
        if configs:
            best_config, best_entry = max(configs.items(), key=lambda item: item[1]['rank1'])
            overall_best_rank1 = best_entry['rank1']
        else:
            best_config, overall_best_rank1 = None, 0

        comparison_data.append({
            'Model': model_name,
            'Oneshot_Base': oneshot_base,
            'Oneshot_Aug': oneshot_aug,
            'Fewshot_Base': fewshot_base,
            'Fewshot_Aug': fewshot_aug,
            'Aug_Improvement_Oneshot': oneshot_aug - oneshot_base,
            'Aug_Improvement_Fewshot': fewshot_aug - fewshot_base,
            'Fewshot_Improvement_Base': fewshot_base - oneshot_base,
            'Fewshot_Improvement_Aug': fewshot_aug - oneshot_aug,
            'Best_Config': best_config,
            'Best_Rank1': overall_best_rank1
        })

    return pd.DataFrame(comparison_data)


In [135]:
def summarize_aggregation_performance(all_model_results: Dict) -> pd.DataFrame:
    """Report which aggregation method reaches the best Rank-1 per gallery.

    Args:
        all_model_results: Mapping of model name to its results; the
            ``'basic_probe'`` entry maps gallery name -> aggregation -> run.

    Returns:
        DataFrame with one row per (model, gallery): the winning aggregation,
        the best Rank-1 of each of max / mean / topk (0 when that method was
        not run) and the spread between the best and worst method.
    """
    rows = []

    for model_name, model_data in all_model_results.items():
        for gallery_name, gallery_results in model_data.get('basic_probe', {}).items():
            # Best Rank-1 over all thresholds, per aggregation method.
            agg_scores = {
                agg_method: run['threshold_results']['rank1_accuracy'].max()
                for agg_method, run in gallery_results.items()
            }

            winner = max(agg_scores, key=agg_scores.get)
            observed = agg_scores.values()

            rows.append({
                'Model': model_name,
                'Gallery': gallery_name,
                'Best_Aggregation': winner,
                'MAX_Score': agg_scores.get('max', 0),
                'MEAN_Score': agg_scores.get('mean', 0),
                'TOPK_Score': agg_scores.get('topk', 0),
                'Best_Score': agg_scores[winner],
                'Score_Range': max(observed) - min(observed)
            })

    return pd.DataFrame(rows)

In [136]:
def recommend_operating_thresholds(all_model_results: Dict) -> pd.DataFrame:
    """Recommend thresholds for several operating points of every run.

    For each (model, gallery, aggregation) run the swept threshold closest to
    each of these operating points is selected:

    * TAR nearest to 0.99,
    * FAR nearest to 0.001,
    * maximum F1 score,
    * "balanced": minimum of ``|TAR - (1 - FAR)|``.

    The stored ``'threshold_results'`` frames are only read; nothing is
    written back into ``all_model_results``.

    Args:
        all_model_results: Mapping of model name to its results; the
            ``'basic_probe'`` entry maps gallery name -> aggregation -> run.

    Returns:
        DataFrame with one row per run. Runs with an empty threshold sweep
        are skipped.
    """
    threshold_recs = []

    for model_name, model_data in all_model_results.items():
        basic_results = model_data.get('basic_probe', {})

        for gallery_name, gallery_results in basic_results.items():
            for agg_method, results in gallery_results.items():
                df = results['threshold_results']
                if df.empty:
                    # idxmin()/idxmax() raise on an empty sweep.
                    continue

                # Find various operating points
                tar_99_idx = (df['tar'] - 0.99).abs().idxmin()
                far_001_idx = (df['far'] - 0.001).abs().idxmin()
                f1_max_idx = df['f1_score'].idxmax()

                # Balanced TAR/FAR (minimize |TAR - (1-FAR)|). Kept as a local
                # Series so the caller's results frame is not given an extra
                # 'tar_far_diff' column as a side effect.
                tar_far_diff = (df['tar'] - (1 - df['far'])).abs()
                balanced_idx = tar_far_diff.idxmin()

                threshold_recs.append({
                    'Model': model_name,
                    'Gallery': gallery_name,
                    'Aggregation': agg_method,
                    'Threshold_TAR99': df.loc[tar_99_idx, 'threshold'],
                    'TAR_at_TAR99': df.loc[tar_99_idx, 'tar'],
                    'Threshold_FAR0.001': df.loc[far_001_idx, 'threshold'],
                    'TAR_at_FAR0.001': df.loc[far_001_idx, 'tar'],
                    'Threshold_BestF1': df.loc[f1_max_idx, 'threshold'],
                    'F1_Score': df.loc[f1_max_idx, 'f1_score'],
                    'Threshold_Balanced': df.loc[balanced_idx, 'threshold'],
                    'TAR_Balanced': df.loc[balanced_idx, 'tar'],
                    'FAR_Balanced': df.loc[balanced_idx, 'far']
                })

    return pd.DataFrame(threshold_recs)

In [137]:

def analyze_failure_cases(all_model_results: Dict) -> Dict:
    """Summarise misclassifications of the mean-aggregation runs.

    Only galleries that have a ``'mean'`` run with a non-empty
    ``'all_predictions'`` list are analysed.

    Args:
        all_model_results: Mapping of model name to its results; the
            ``'basic_probe'`` entry maps gallery name -> aggregation -> run.

    Returns:
        Dict keyed by ``"{model}_{gallery}"`` with the total number of
        predictions and errors, the error rate, the ten most frequent
        ``"true -> predicted"`` pairs and the ten identities with the most
        errors.
    """
    def top_ten(counts):
        # Most frequent first; ties keep first-seen order (stable sort).
        return sorted(counts.items(), key=lambda item: item[1], reverse=True)[:10]

    failure_analysis = {}

    for model_name, model_data in all_model_results.items():
        for gallery_name, gallery_results in model_data.get('basic_probe', {}).items():
            # Use mean aggregation for analysis
            if 'mean' not in gallery_results:
                continue

            predictions = gallery_results['mean'].get('all_predictions', [])
            if not predictions:
                continue

            pair_counts = {}
            errors_per_identity = {}
            n_errors = 0

            for pred in predictions:
                true_id = pred['true_identity']
                pred_id = pred['predicted_identity']
                if pred_id != true_id:
                    n_errors += 1
                    pair = f"{true_id} -> {pred_id}"
                    pair_counts[pair] = pair_counts.get(pair, 0) + 1
                    errors_per_identity[true_id] = errors_per_identity.get(true_id, 0) + 1

            failure_analysis[f"{model_name}_{gallery_name}"] = {
                'total_predictions': len(predictions),
                'total_errors': n_errors,
                'error_rate': n_errors / len(predictions),
                'top_confusion_pairs': top_ten(pair_counts),
                'most_confused_identities': top_ten(errors_per_identity)
            }

    return failure_analysis

In [138]:
def compare_models_statistical(all_model_results: Dict) -> pd.DataFrame:
    """Pairwise paired t-tests between models on the reference configuration.

    Every pair of models is compared on the ``fewshot_augmented`` gallery with
    ``mean`` aggregation. Each prediction contributes its best score when the
    top-1 identity is correct and 0 otherwise.

    NOTE(review): the paired test assumes both models' ``all_predictions``
    lists are in the same probe order — verify against the code that builds
    them.

    Args:
        all_model_results: Mapping of model name to its results.

    Returns:
        DataFrame with one row per comparable model pair (mean difference,
        t statistic, p-value, Cohen's d and coarse labels). Pairs for which
        the reference configuration is missing, or whose prediction lists are
        empty or of different length, are skipped with a warning instead of
        being dropped silently.
    """
    import warnings  # local import: not part of the notebook's import cell

    def credited_scores(model_name):
        # Raises KeyError when the reference configuration is absent.
        run = all_model_results[model_name]['basic_probe']['fewshot_augmented']['mean']
        return [p['score'] if p['predicted_identity'] == p['true_identity'] else 0
                for p in run['all_predictions']]

    stat_comparisons = []
    model_names = list(all_model_results.keys())

    for i, model1 in enumerate(model_names):
        for model2 in model_names[i+1:]:
            # Compare on fewshot_augmented + mean (best config)
            try:
                scores1 = credited_scores(model1)
                scores2 = credited_scores(model2)
            except KeyError as missing:
                warnings.warn(
                    f"Skipping {model1} vs {model2}: missing result key {missing}"
                )
                continue

            if not scores1 or len(scores1) != len(scores2):
                warnings.warn(
                    f"Skipping {model1} vs {model2}: paired test needs equal, "
                    f"non-empty samples (got {len(scores1)} and {len(scores2)})"
                )
                continue

            # Paired t-test
            t_stat, p_value = stats.ttest_rel(scores1, scores2)

            # Effect size (Cohen's d)
            mean_diff = np.mean(scores1) - np.mean(scores2)
            pooled_std = np.sqrt((np.std(scores1)**2 + np.std(scores2)**2) / 2)
            cohens_d = mean_diff / pooled_std if pooled_std > 0 else 0

            stat_comparisons.append({
                'Model_A': model1,
                'Model_B': model2,
                'Mean_Diff': mean_diff,
                't_statistic': t_stat,
                'p_value': p_value,
                'Significant': 'Yes' if p_value < 0.05 else 'No',
                'Cohens_d': cohens_d,
                'Effect_Size': 'Small' if abs(cohens_d) < 0.5 else ('Medium' if abs(cohens_d) < 0.8 else 'Large')
            })

    return pd.DataFrame(stat_comparisons)

In [139]:
def generate_executive_summary(all_model_results: Dict, 
                              comparison_summary: pd.DataFrame) -> str:
    """Render a plain-text executive summary from the comparison table.

    Args:
        all_model_results: Not read by this function; kept for interface
            compatibility.
        comparison_summary: Table with at least the columns ``Model``,
            ``Gallery``, ``Aggregation``, ``Rank-1``, ``ROC-AUC``,
            ``d-prime`` and ``Best_Threshold``.

    Returns:
        The formatted multi-section report as a single string.
    """
    # --- numbers derived from the table ---------------------------------
    best_row = comparison_summary.loc[comparison_summary['Rank-1'].idxmax()]

    best_per_gallery = comparison_summary.groupby('Gallery').apply(
        lambda x: x.loc[x['Rank-1'].idxmax()]
    )

    model_rankings = (
        comparison_summary.groupby('Model')['Rank-1'].max().sort_values(ascending=False)
    )

    agg_wins = comparison_summary.groupby(['Gallery', 'Aggregation'])['Rank-1'].max()
    best_agg_per_gallery = agg_wins.groupby('Gallery').idxmax()

    # --- report assembly: collect the pieces, join once at the end -------
    sections = []

    sections.append(f"""
================================================================================
EXECUTIVE SUMMARY - Face Recognition Evaluation
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
================================================================================

KEY FINDINGS:

1. OVERALL BEST PERFORMANCE
   Model: {best_row['Model']}
   Configuration: {best_row['Gallery']} + {best_row['Aggregation']}
   Rank-1 Accuracy: {best_row['Rank-1']:.2%}
   ROC-AUC: {best_row['ROC-AUC']:.4f}
   d-prime: {best_row['d-prime']:.3f}

2. BEST CONFIGURATION PER GALLERY TYPE
""")

    for gallery, row in best_per_gallery.iterrows():
        sections.append(f"""
   {gallery.upper()}:
   - Model: {row['Model']} ({row['Aggregation']})
   - Rank-1: {row['Rank-1']:.2%}
   - ROC-AUC: {row['ROC-AUC']:.4f}
""")

    sections.append("\n3. MODEL RANKINGS (by best Rank-1 accuracy)\n")
    sections.extend(
        f"   {idx}. {model}: {score:.2%}\n"
        for idx, (model, score) in enumerate(model_rankings.items(), 1)
    )

    sections.append("\n4. BEST AGGREGATION METHOD PER GALLERY\n")
    # Each value is the (Gallery, Aggregation) index label of the group's maximum.
    sections.extend(
        f"   {gallery}: {agg.upper()}\n"
        for gallery, (_, agg) in best_agg_per_gallery.items()
    )

    # NOTE(review): the impostor-rejection statement and everything under
    # LIMITATIONS are fixed text, not computed from the inputs — confirm they
    # still match the current results before publishing the report.
    sections.append(f"""
5. KEY RECOMMENDATIONS
   - Use {best_row['Model']} with {best_row['Gallery']} gallery for best accuracy
   - {best_row['Aggregation'].upper()} aggregation works best for this configuration
   - Operating threshold: {best_row['Best_Threshold']:.3f} for optimal performance
   - All models achieve 100% impostor rejection at threshold ≥ 0.35

6. LIMITATIONS
   - Performance degrades significantly on high pitch and high yaw conditions
   - Low quality images reduce accuracy by ~15-30%
   - Baseline/frontal images show best performance (>90% Rank-1)

================================================================================
""")

    return "".join(sections)

In [140]:
def plot_all_metrics(results: Dict, title: str, save_path: Path):
    """Draw a 3x4 dashboard for one evaluation run and save it as an image.

    Panels, row by row: Rank-k accuracy, MRR, FAR/FRR/TAR, ROC curve;
    precision/recall/F1, score histograms, DET curve, CMC curve; average
    accepted scores, mean scores with confidence intervals, TAR at fixed FAR
    targets, and a text box of summary statistics.

    Args:
        results: Result dict providing ``'threshold_results'`` (DataFrame),
            ``'fpr'``, ``'tpr'``, ``'roc_auc'``, ``'average_precision'``,
            ``'dprime'``, ``'genuine_scores'``, ``'impostor_scores'``,
            ``'genuine_ci'``, ``'impostor_ci'`` and ``'aggregation'``.
        title: Figure-level title.
        save_path: Destination file of the rendered figure.
    """
    df = results['threshold_results']
    thresholds = df['threshold']

    fig = plt.figure(figsize=(20, 12))
    gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

    def plot_vs_threshold(ax, series):
        # series: iterable of (column, format string, legend label)
        for column, fmt, label in series:
            ax.plot(thresholds, df[column], fmt, linewidth=2, label=label)

    def finish(ax, heading, xlabel=None, ylabel=None, legend=False, **grid_kwargs):
        # Shared labelling / legend / grid styling for one panel.
        if xlabel is not None:
            ax.set_xlabel(xlabel)
        if ylabel is not None:
            ax.set_ylabel(ylabel)
        ax.set_title(heading)
        if legend:
            ax.legend()
        ax.grid(True, alpha=0.3, **grid_kwargs)

    # Row 0 ---------------------------------------------------------------
    ax1 = fig.add_subplot(gs[0, 0])
    plot_vs_threshold(ax1, [
        ('rank1_accuracy', 'b-', 'Rank-1'),
        ('rank5_accuracy', 'g-', 'Rank-5'),
        ('rank10_accuracy', 'r-', 'Rank-10'),
    ])
    finish(ax1, 'Rank-k Accuracy', 'Threshold', 'Accuracy', legend=True)

    ax2 = fig.add_subplot(gs[0, 1])
    ax2.plot(thresholds, df['mrr'], 'purple', linewidth=2)
    finish(ax2, 'Mean Reciprocal Rank', 'Threshold', 'MRR')

    ax3 = fig.add_subplot(gs[0, 2])
    plot_vs_threshold(ax3, [
        ('far', 'r-', 'FAR'),
        ('frr', 'g-', 'FRR'),
        ('tar', 'b-', 'TAR'),
    ])
    finish(ax3, 'FAR/FRR/TAR', 'Threshold', 'Rate', legend=True)

    ax4 = fig.add_subplot(gs[0, 3])
    ax4.plot(results['fpr'], results['tpr'], 'b-', linewidth=2)
    ax4.plot([0, 1], [0, 1], 'k--', alpha=0.3)
    finish(ax4, f'ROC Curve (AUC={results["roc_auc"]:.4f})',
           'False Positive Rate', 'True Positive Rate')

    # Row 1 ---------------------------------------------------------------
    ax5 = fig.add_subplot(gs[1, 0])
    plot_vs_threshold(ax5, [
        ('precision', 'b-', 'Precision'),
        ('recall', 'orange', 'Recall'),
        ('f1_score', 'purple', 'F1-Score'),
    ])
    finish(ax5, 'Precision/Recall/F1', 'Threshold', 'Score', legend=True)

    ax6 = fig.add_subplot(gs[1, 1])
    for key, label, color in (('genuine_scores', 'Genuine', 'green'),
                              ('impostor_scores', 'Impostor', 'red')):
        ax6.hist(results[key], bins=50, alpha=0.5, label=label, color=color)
    for key, color in (('genuine_scores', 'green'), ('impostor_scores', 'red')):
        ax6.axvline(np.mean(results[key]), color=color, linestyle='--', linewidth=2)
    finish(ax6, f"Score Distributions (d'={results['dprime']:.3f})",
           'Similarity Score', 'Frequency', legend=True)

    ax7 = fig.add_subplot(gs[1, 2])
    ax7.plot(df['far'], df['frr'], 'b-', linewidth=2)
    ax7.set_xscale('log')
    ax7.set_yscale('log')
    finish(ax7, 'DET Curve', 'False Accept Rate', 'False Reject Rate', which='both')

    ax8 = fig.add_subplot(gs[1, 3])
    best_threshold_idx = df['rank1_accuracy'].idxmax()
    ranks = [1, 5, 10]
    cmc_scores = [df.loc[best_threshold_idx, f'rank{rank}_accuracy'] for rank in ranks]
    ax8.plot(ranks, cmc_scores, 'bo-', linewidth=2, markersize=8)
    ax8.set_xticks(ranks)
    finish(ax8, 'CMC Curve', 'Rank', 'Identification Rate')

    # Row 2 ---------------------------------------------------------------
    ax9 = fig.add_subplot(gs[2, 0])
    plot_vs_threshold(ax9, [
        ('avg_correct_score', 'g-', 'Correct Matches'),
        ('avg_incorrect_score', 'r-', 'Incorrect Matches'),
    ])
    finish(ax9, 'Score Analysis', 'Threshold', 'Average Score', legend=True)

    ax10 = fig.add_subplot(gs[2, 1])
    genuine_mean = np.mean(results['genuine_scores'])
    impostor_mean = np.mean(results['impostor_scores'])
    genuine_ci = results['genuine_ci']
    impostor_ci = results['impostor_ci']
    # Asymmetric error bars: distance from the mean to each CI bound.
    errors_lower = [genuine_mean - genuine_ci[0], impostor_mean - impostor_ci[0]]
    errors_upper = [genuine_ci[1] - genuine_mean, impostor_ci[1] - impostor_mean]
    ax10.bar(['Genuine', 'Impostor'], [genuine_mean, impostor_mean],
             yerr=[errors_lower, errors_upper],
             capsize=10, alpha=0.7, color=['green', 'red'])
    finish(ax10, 'Mean Scores with 95% CI', ylabel='Similarity Score', axis='y')

    ax11 = fig.add_subplot(gs[2, 2])
    target_fars = [0.1, 0.01, 0.001]
    # TAR at the swept threshold whose FAR is closest to each target.
    tars_at_far = [
        df.loc[(df['far'] - target_far).abs().idxmin(), 'tar']
        for target_far in target_fars
    ]
    ax11.bar([f'FAR={f}' for f in target_fars], tars_at_far, alpha=0.7)
    ax11.set_ylim([0, 1])
    finish(ax11, 'TAR @ FAR', ylabel='TAR', axis='y')

    ax12 = fig.add_subplot(gs[2, 3])
    ax12.axis('off')

    best_row = df.loc[df['rank1_accuracy'].idxmax()]

    summary_text = f"""
    SUMMARY STATISTICS
    ==================
    Aggregation: {results['aggregation'].upper()}
    
    Best Rank-1: {best_row['rank1_accuracy']:.4f}
    @ Threshold: {best_row['threshold']:.3f}
    
    Rank-5: {best_row['rank5_accuracy']:.4f}
    Rank-10: {best_row['rank10_accuracy']:.4f}
    MRR: {best_row['mrr']:.4f}
    
    ROC-AUC: {results['roc_auc']:.4f}
    Avg Precision: {results['average_precision']:.4f}
    d-prime: {results['dprime']:.3f}
    
    TAR@FAR=0.01: {tars_at_far[1]:.4f}
    
    Best F1: {df['f1_score'].max():.4f}
    @ Threshold: {df.loc[df['f1_score'].idxmax(), 'threshold']:.3f}
    """

    ax12.text(0.1, 0.5, summary_text, fontsize=10, family='monospace',
              verticalalignment='center')

    fig.suptitle(title, fontsize=16, fontweight='bold', y=0.995)
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Plot saved: {save_path}")

In [141]:

def plot_model_comparison_charts(all_model_results: Dict,
                                comparison_summary: pd.DataFrame,
                                save_dir: Path):
    """Create the cross-model comparison charts and save them as PNG files.

    Four figures are written into ``save_dir``:

    * ``comparison_rank1_bar.png`` - grouped bars of the maximum ``Rank-1``
      value for every (Model, Gallery) pair.
    * ``comparison_roc_curves.png`` - one ROC curve per model.
    * ``comparison_aggregation_heatmap.png`` - ``Rank-1`` by model and
      aggregation method, restricted to the ``fewshot_augmented`` gallery.
    * ``comparison_score_distributions.png`` - genuine vs. impostor box plots
      for the first four models.

    The ROC and box-plot figures read
    ``all_model_results[model]['basic_probe']['fewshot_augmented']['mean']``.
    A model whose entry is missing or malformed is skipped with a printed
    notice rather than being dropped silently.

    Args:
        all_model_results: Mapping of model name to that model's results.
        comparison_summary: Table with at least the columns ``Model``,
            ``Gallery``, ``Aggregation`` and ``Rank-1``.
        save_dir: Output directory; created if it does not exist.
    """
    save_dir.mkdir(parents=True, exist_ok=True)

    # Reference configuration used by the ROC and box-plot figures.
    ref_gallery, ref_agg = 'fewshot_augmented', 'mean'
    # Errors that indicate missing/malformed per-model data. Only these are
    # treated as "skip this model"; anything else (KeyboardInterrupt,
    # programming errors, ...) propagates instead of being swallowed.
    skippable = (KeyError, TypeError, ValueError)

    # 1. Bar chart: Rank-1 across models & galleries
    fig, ax = plt.subplots(figsize=(14, 6))

    pivot = comparison_summary.pivot_table(
        values='Rank-1',
        index='Model',
        columns='Gallery',
        aggfunc='max'
    )

    pivot.plot(kind='bar', ax=ax, width=0.8)
    ax.set_ylabel('Rank-1 Accuracy', fontsize=12)
    ax.set_xlabel('Model', fontsize=12)
    ax.set_title('Rank-1 Accuracy Comparison Across Models and Galleries', fontsize=14, fontweight='bold')
    ax.legend(title='Gallery Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3, axis='y')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(save_dir / 'comparison_rank1_bar.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 2. ROC curves overlaid
    fig, ax = plt.subplots(figsize=(10, 8))

    colors = plt.cm.tab10(np.linspace(0, 1, len(all_model_results)))

    for (model_name, model_data), color in zip(all_model_results.items(), colors):
        try:
            # 'basic_probe' may be None when that evaluation was skipped,
            # which surfaces here as a TypeError.
            results = model_data['basic_probe'][ref_gallery][ref_agg]
            ax.plot(results['fpr'], results['tpr'],
                   label=f"{model_name} (AUC={results['roc_auc']:.3f})",
                   linewidth=2, color=color)
        except skippable as exc:
            print(f"  Skipping ROC curve for {model_name}: {exc!r}")
            continue

    ax.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Random')
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curve Comparison (Fewshot Augmented + Mean)', fontsize=14, fontweight='bold')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_dir / 'comparison_roc_curves.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 3. Heatmap: Models vs Aggregation methods
    fig, ax = plt.subplots(figsize=(12, 6))

    pivot_agg = comparison_summary[comparison_summary['Gallery'] == ref_gallery].pivot(
        index='Model',
        columns='Aggregation',
        values='Rank-1'
    )

    # NOTE(review): the colour scale is pinned to 0.4-0.7; values outside that
    # range saturate. Confirm this still matches the observed Rank-1 range.
    sns.heatmap(pivot_agg, annot=True, fmt='.3f', cmap='RdYlGn',
                vmin=0.4, vmax=0.7, ax=ax, cbar_kws={'label': 'Rank-1 Accuracy'})
    ax.set_title('Rank-1 Accuracy: Models vs Aggregation Methods (Fewshot Augmented)',
                fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(save_dir / 'comparison_aggregation_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 4. Box plots: Score distributions (the 2x2 grid holds at most four models)
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()

    for idx, model_name in enumerate(list(all_model_results.keys())[:4]):
        try:
            results = all_model_results[model_name]['basic_probe'][ref_gallery][ref_agg]
            genuine = results['genuine_scores']
            impostor = results['impostor_scores']

            # NOTE(review): Matplotlib 3.9 renamed `labels` to `tick_labels`;
            # `labels` is kept here so older Matplotlib versions keep working.
            axes[idx].boxplot([genuine, impostor], labels=['Genuine', 'Impostor'])
            axes[idx].set_title(f'{model_name}', fontweight='bold')
            axes[idx].set_ylabel('Similarity Score')
            axes[idx].grid(True, alpha=0.3, axis='y')
        except skippable as exc:
            print(f"  Skipping score distribution for {model_name}: {exc!r}")
            continue

    plt.suptitle('Score Distribution Comparison', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(save_dir / 'comparison_score_distributions.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f"Comparison charts saved to: {save_dir}")


def plot_segmented_heatmap(segmented_table: pd.DataFrame,
                          save_path: Path,
                          title: str = "Segmented Performance"):
    """Render a segmented-evaluation table as an annotated heatmap image.

    The aggregate columns ``Mean``, ``Std``, ``Min`` and ``Max`` are removed
    when present, so only the remaining cells are coloured. The axes are
    labelled "Segment/Condition" (x) and "Model" (y).

    Args:
        segmented_table: Table to draw.
        save_path: Destination file for the figure (written at 300 dpi).
        title: Title placed above the heatmap.
    """
    figure, heat_ax = plt.subplots(figsize=(14, 6))

    # Summary statistics are not per-segment values, so keep them out of the grid.
    aggregate_columns = ['Mean', 'Std', 'Min', 'Max']
    cells = segmented_table.drop(columns=aggregate_columns, errors='ignore')

    heatmap_options = dict(
        annot=True,
        fmt='.3f',
        cmap='RdYlGn',
        vmin=0.2,
        vmax=1.0,
        cbar_kws={'label': 'Rank-1 Accuracy'},
    )
    sns.heatmap(cells, ax=heat_ax, **heatmap_options)

    heat_ax.set_title(title, fontsize=14, fontweight='bold')
    heat_ax.set_xlabel('Segment/Condition', fontsize=12)
    heat_ax.set_ylabel('Model', fontsize=12)

    figure.tight_layout()
    figure.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(figure)
    print(f"Segmented heatmap saved: {save_path}")

In [142]:
def plot_gallery_strategy_comparison(strategy_df: pd.DataFrame, save_path: Path):
    """Draw the 2x2 gallery-strategy figure and save it to ``save_path``.

    Panels, left to right and top to bottom:
      1. Rank-1 for the four gallery configurations, per model.
      2. Augmentation improvement columns, per model.
      3. Fewshot improvement columns, per model.
      4. How many models have each value of ``Best_Config``.

    Args:
        strategy_df: Table with a ``Model`` column, the four configuration
            columns, the four improvement columns and ``Best_Config``.
        save_path: Destination image path (written at 300 dpi).
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    def _finish_panel(panel):
        # Shared cosmetics: horizontal grid lines and slanted tick labels.
        panel.grid(True, alpha=0.3, axis='y')
        plt.setp(panel.xaxis.get_majorticklabels(), rotation=45, ha='right')

    per_model = strategy_df.set_index('Model')

    # (axes, columns, y label, title, legend title, draw zero reference line)
    grouped_panels = [
        (axes[0, 0],
         ['Oneshot_Base', 'Oneshot_Aug', 'Fewshot_Base', 'Fewshot_Aug'],
         'Rank-1 Accuracy', 'Gallery Strategy Comparison', 'Configuration', False),
        (axes[0, 1],
         ['Aug_Improvement_Oneshot', 'Aug_Improvement_Fewshot'],
         'Rank-1 Improvement', 'Augmentation Benefit', 'Gallery Type', True),
        (axes[1, 0],
         ['Fewshot_Improvement_Base', 'Fewshot_Improvement_Aug'],
         'Rank-1 Improvement', 'Fewshot vs Oneshot Benefit', 'Augmentation', True),
    ]

    for panel, columns, y_label, heading, legend_title, zero_line in grouped_panels:
        per_model[columns].plot(kind='bar', ax=panel, width=0.8)
        panel.set_ylabel(y_label)
        panel.set_title(heading, fontweight='bold')
        if zero_line:
            # Improvements can be negative; mark the break-even level.
            panel.axhline(y=0, color='k', linestyle='--', alpha=0.3)
        panel.legend(title=legend_title)
        _finish_panel(panel)

    # Last panel: frequency of each winning configuration.
    count_ax = axes[1, 1]
    config_counts = strategy_df.groupby('Best_Config').size()
    config_counts.plot(kind='bar', ax=count_ax, color='steelblue')
    count_ax.set_ylabel('Number of Models')
    count_ax.set_title('Most Common Best Configuration', fontweight='bold')
    _finish_panel(count_ax)

    fig.suptitle('Gallery Strategy Analysis', fontsize=14, fontweight='bold')
    fig.tight_layout()
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Gallery strategy plot saved: {save_path}")

In [143]:
def _summary_to_json_records(value):
    """Turn one summary entry into a JSON-friendly value.

    DataFrames become a list of row dicts. ``to_dict(orient='records')`` drops
    the index, so a label (non-integer) index is first promoted to a column;
    otherwise the row labels would be lost. Non-DataFrame values are returned
    unchanged.
    """
    if not isinstance(value, pd.DataFrame):
        return value

    named_levels = [name for name in value.index.names if name is not None]
    has_label_index = value.index.dtype.kind not in ('i', 'u')
    # reset_index() raises if an index name collides with an existing column,
    # so only promote the index when that cannot happen.
    if has_label_index and not set(named_levels) & set(value.columns):
        value = value.reset_index()
    return value.to_dict(orient='records')


def export_comprehensive_report(all_model_results: Dict,
                               all_summaries: Dict,
                               save_dir: Path):
    """Export the collected summaries as Excel, JSON, plain text and LaTeX.

    Files written into ``save_dir`` (created if missing):
      * ``comprehensive_report.xlsx`` - one sheet per summary table.
      * ``comprehensive_report.json`` - metadata, every summary except the
        executive summary as records, and the executive summary text.
      * ``executive_summary.txt`` - the executive summary text.
      * ``latex_tables.tex`` - comparison and gallery-strategy tables.

    Text outputs are written as UTF-8 so that non-ASCII characters do not
    depend on the platform's locale encoding.

    Args:
        all_model_results: Per-model results; only its keys are used here
            (listed as ``models_evaluated`` in the JSON metadata).
        all_summaries: Must contain the DataFrames ``comparison_summary``,
            ``gallery_strategy``, ``aggregation_analysis`` and
            ``threshold_recommendations``. ``segmented_oneshot``,
            ``segmented_fewshot``, ``statistical_comparison`` and
            ``executive_summary`` are optional.
        save_dir: Output directory.

    Raises:
        KeyError: If one of the required summaries is missing.
    """
    save_dir.mkdir(parents=True, exist_ok=True)
    executive_summary = all_summaries.get('executive_summary', '')

    # 1. Excel workbook with multiple sheets
    excel_path = save_dir / 'comprehensive_report.xlsx'
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        all_summaries['comparison_summary'].to_excel(writer, sheet_name='Overall_Comparison', index=False)
        all_summaries['gallery_strategy'].to_excel(writer, sheet_name='Gallery_Strategy', index=False)
        all_summaries['aggregation_analysis'].to_excel(writer, sheet_name='Aggregation_Analysis', index=False)
        all_summaries['threshold_recommendations'].to_excel(writer, sheet_name='Threshold_Recommendations', index=False)

        # The segmented tables are written with their index.
        if 'segmented_oneshot' in all_summaries:
            all_summaries['segmented_oneshot'].to_excel(writer, sheet_name='Segmented_Oneshot')
        if 'segmented_fewshot' in all_summaries:
            all_summaries['segmented_fewshot'].to_excel(writer, sheet_name='Segmented_Fewshot')
        if 'statistical_comparison' in all_summaries:
            all_summaries['statistical_comparison'].to_excel(writer, sheet_name='Statistical_Tests', index=False)

    print(f"Excel report saved: {excel_path}")

    # 2. JSON export
    json_data = {
        'metadata': {
            'generated': datetime.now().isoformat(),
            'models_evaluated': list(all_model_results.keys())
        },
        'summaries': {
            key: _summary_to_json_records(value)
            for key, value in all_summaries.items()
            if key != 'executive_summary'
        },
        'executive_summary': executive_summary
    }

    json_path = save_dir / 'comprehensive_report.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=2)

    print(f"JSON report saved: {json_path}")

    # 3. Text summary
    txt_path = save_dir / 'executive_summary.txt'
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(executive_summary)

    print(f"Text summary saved: {txt_path}")

    # 4. LaTeX tables
    latex_path = save_dir / 'latex_tables.tex'
    with open(latex_path, 'w', encoding='utf-8') as f:
        f.write("% Comparison Summary\n")
        f.write(all_summaries['comparison_summary'].to_latex(index=False, float_format="%.4f"))
        f.write("\n\n% Gallery Strategy\n")
        f.write(all_summaries['gallery_strategy'].to_latex(index=False, float_format="%.4f"))

    print(f"LaTeX tables saved: {latex_path}")

In [144]:
def plot_impostor_metrics(results: Dict, title: str, save_path: Path):
    """Plot impostor-rejection diagnostics on a 2x2 grid and save the figure.

    Panels: rejection rate vs. threshold, false accept rate vs. threshold, a
    histogram of the impostor scores with mean / confidence-interval markers,
    and a text summary for the threshold with the highest rejection rate.

    Args:
        results: Needs the keys ``threshold_results`` (table with the columns
            ``threshold``, ``rejection_rate``, ``far`` and ``n_impostors``),
            ``impostor_scores``, ``impostor_ci`` (lower, upper) and
            ``aggregation``.
        title: Figure-level title.
        save_path: Destination image path (written at 300 dpi).
    """
    df = results['threshold_results']
    impostor_scores = results['impostor_scores']
    ci_low, ci_high = results['impostor_ci'][0], results['impostor_ci'][1]
    mean_score = np.mean(impostor_scores)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    ax = axes[0, 0]
    ax.plot(df['threshold'], df['rejection_rate'], 'g-', linewidth=2)
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Rejection Rate')
    ax.set_title('Impostor Rejection Rate')
    ax.grid(True, alpha=0.3)

    ax = axes[0, 1]
    ax.plot(df['threshold'], df['far'], 'r-', linewidth=2)
    ax.set_xlabel('Threshold')
    ax.set_ylabel('False Accept Rate')
    ax.set_title('False Accept Rate')
    ax.grid(True, alpha=0.3)

    ax = axes[1, 0]
    ax.hist(impostor_scores, bins=50, alpha=0.7, color='red')
    ax.axvline(mean_score, color='darkred',
              linestyle='--', linewidth=2, label='Mean')
    # Only the lower bound carries a label so the legend shows one CI entry.
    ax.axvline(ci_low, color='orange',
              linestyle=':', linewidth=2, label='95% CI')
    ax.axvline(ci_high, color='orange',
              linestyle=':', linewidth=2)
    ax.set_xlabel('Similarity Score')
    ax.set_ylabel('Frequency')
    ax.set_title('Impostor Score Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)

    ax = axes[1, 1]
    ax.axis('off')

    # idxmax returns the first threshold that reaches the maximum rejection rate.
    best_idx = df['rejection_rate'].idxmax()
    best_row = df.loc[best_idx]
    # Read the count with a scalar lookup: pulling it out of the row Series
    # would upcast an integer count to float (e.g. "252.0") when the other
    # columns are floats.
    n_impostors = df.loc[best_idx, 'n_impostors']

    summary_text = f"""
    IMPOSTOR REJECTION SUMMARY
    ==========================
    Aggregation: {results['aggregation'].upper()}
    
    Best Rejection: {best_row['rejection_rate']:.4f}
    @ Threshold: {best_row['threshold']:.3f}
    
    FAR at best: {best_row['far']:.4f}
    
    Total Impostors: {n_impostors}
    
    Mean Score: {mean_score:.4f}
    Std Score: {np.std(impostor_scores):.4f}
    
    95% CI: [{ci_low:.4f}, 
             {ci_high:.4f}]
    """

    ax.text(0.1, 0.5, summary_text, fontsize=11, family='monospace',
           verticalalignment='center')

    plt.suptitle(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Plot saved: {save_path}")

In [145]:
def run_basic_probe_evaluation(model_name: str, embeddings: Dict,
                               results_dir: Path, plots_dir: Path):
    """Evaluate the unsegmented positive probes against every gallery type.

    For each available gallery (oneshot/fewshot x base/augmented) and each
    aggregation (``max``, ``mean``, ``topk`` with ``k=3``) this calls
    ``evaluate_probes_comprehensive`` over thresholds 0.20-0.90 in steps of
    0.05, writes the per-threshold table to
    ``results_dir/<model>/basic_probe_<gallery>_<agg>_metrics.csv``, saves a
    plot under ``plots_dir/<model>/`` and prints the headline numbers.

    Args:
        model_name: Name used in output paths and printed headers.
        embeddings: Mapping of embedding-set key to loaded embeddings. Keys
            that are absent are treated the same as keys mapped to ``None``.
        results_dir: Base directory for CSV output.
        plots_dir: Base directory for plot output.

    Returns:
        ``{gallery_name: {aggregation: results}}`` for every gallery that was
        available, or ``None`` when the probe embeddings are missing.
    """
    print(f"\n{'='*70}")
    print(f"BASIC PROBE EVALUATION: {model_name}")
    print(f"{'='*70}")

    # .get() so that a probe file that was never loaded (key absent) takes the
    # "missing" path below instead of raising KeyError, matching how the
    # galleries are looked up.
    probe = embeddings.get('probe_positive_unsegmented')

    if probe is None:
        print("Missing probe embeddings!")
        return None

    gallery_types = {
        'oneshot_base': 'gallery_oneshot_base',
        'oneshot_augmented': 'gallery_oneshot_augmented',
        'fewshot_base': 'gallery_fewshot_base',
        'fewshot_augmented': 'gallery_fewshot_augmented'
    }

    # Stop value 0.91 keeps 0.90 inside the half-open arange interval.
    thresholds = np.arange(0.2, 0.91, 0.05)
    aggregations = ['max', 'mean', 'topk']

    all_results = {}

    for gallery_name, gallery_key in gallery_types.items():
        gallery = embeddings.get(gallery_key)

        if gallery is None:
            print(f"Missing {gallery_name} gallery, skipping...")
            continue

        print(f"\n{'-'*70}")
        print(f"GALLERY: {gallery_name.upper()}")
        print(f"{'-'*70}")

        gallery_results = {}

        for agg in aggregations:
            print(f"\nEvaluating with {agg.upper()} aggregation...")
            results = evaluate_probes_comprehensive(
                gallery, probe, thresholds, aggregation=agg, k=3
            )

            csv_path = results_dir / model_name / f'basic_probe_{gallery_name}_{agg}_metrics.csv'
            csv_path.parent.mkdir(parents=True, exist_ok=True)
            results['threshold_results'].to_csv(csv_path, index=False)

            plot_path = plots_dir / model_name / f'basic_probe_{gallery_name}_{agg}_plot.png'
            plot_path.parent.mkdir(parents=True, exist_ok=True)
            plot_all_metrics(results, f"{model_name} - Basic Probe - {gallery_name.upper()} ({agg.upper()})", plot_path)

            gallery_results[agg] = results

            df = results['threshold_results']
            # idxmax picks the first threshold that attains the best Rank-1.
            best_idx = df['rank1_accuracy'].idxmax()
            print(f"  Best Rank-1: {df.loc[best_idx, 'rank1_accuracy']:.4f} "
                f"@ threshold {df.loc[best_idx, 'threshold']:.2f}")
            print(f"  ROC-AUC: {results['roc_auc']:.4f}")
            print(f"  d-prime: {results['dprime']:.3f}")

        all_results[gallery_name] = gallery_results

    return all_results

In [146]:
def run_impostor_evaluation(model_name: str, embeddings: Dict,
                           results_dir: Path, plots_dir: Path):
    """Evaluate impostor (negative) probes against the augmented oneshot gallery.

    Calls ``evaluate_impostors_comprehensive`` with ``mean`` aggregation
    (``k=3``) over thresholds 0.20-0.90 in steps of 0.05, writes
    ``results_dir/<model>/impostor_metrics.csv`` and
    ``plots_dir/<model>/impostor_plot.png``, and prints the headline numbers.

    Args:
        model_name: Name used in output paths and printed headers.
        embeddings: Mapping of embedding-set key to loaded embeddings. Keys
            that are absent are treated the same as keys mapped to ``None``.
        results_dir: Base directory for CSV output.
        plots_dir: Base directory for plot output.

    Returns:
        The results dict from ``evaluate_impostors_comprehensive``, or
        ``None`` when the gallery or the impostor embeddings are missing.
    """
    print(f"\n{'='*70}")
    print(f"IMPOSTOR EVALUATION: {model_name}")
    print(f"{'='*70}")

    # .get() so a file that was never loaded (key absent) reaches the
    # "Missing embeddings!" path instead of raising KeyError.
    gallery = embeddings.get('gallery_oneshot_augmented')
    impostor = embeddings.get('probe_negative')

    if gallery is None or impostor is None:
        print("Missing embeddings!")
        return None

    # Stop value 0.91 keeps 0.90 inside the half-open arange interval.
    thresholds = np.arange(0.2, 0.91, 0.05)

    print("\nEvaluating with MEAN aggregation...")
    results = evaluate_impostors_comprehensive(
        gallery, impostor, thresholds, aggregation='mean', k=3
    )

    csv_path = results_dir / model_name / 'impostor_metrics.csv'
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    results['threshold_results'].to_csv(csv_path, index=False)

    plot_path = plots_dir / model_name / 'impostor_plot.png'
    plot_path.parent.mkdir(parents=True, exist_ok=True)
    plot_impostor_metrics(results, f"{model_name} - Impostor Rejection", plot_path)

    df = results['threshold_results']
    # idxmax picks the first threshold that attains the best rejection rate.
    best_idx = df['rejection_rate'].idxmax()
    print(f"  Best Rejection Rate: {df.loc[best_idx, 'rejection_rate']:.4f} "
          f"@ threshold {df.loc[best_idx, 'threshold']:.2f}")
    print(f"  FAR at best: {df.loc[best_idx, 'far']:.4f}")
    print(f"  Total impostors: {df.loc[best_idx, 'n_impostors']}")
    print(f"  Mean impostor score: {results['mean_impostor_score']:.4f}")
    print(f"  Std impostor score: {results['std_impostor_score']:.4f}")
    print(f"  95% CI: [{results['impostor_ci'][0]:.4f}, {results['impostor_ci'][1]:.4f}]")

    return results


In [147]:
def run_segmented_evaluation(model_name: str, embeddings: Dict,
                             results_dir: Path, plots_dir: Path,
                             gallery_type: str):
    """Evaluate the segmented positive probes against one augmented gallery.

    Calls ``evaluate_segmented_comprehensive`` with ``mean`` aggregation
    (``k=3``) over thresholds 0.20-0.90 in steps of 0.05. For every segment
    returned it writes
    ``results_dir/<model>/segmented_<gallery_type>_<segment>_metrics.csv``,
    saves a plot under ``plots_dir/<model>/`` and prints Rank-1, ROC-AUC and
    d-prime.

    Args:
        model_name: Name used in output paths and printed headers.
        embeddings: Mapping of embedding-set key to loaded embeddings. Keys
            that are absent are treated the same as keys mapped to ``None``.
        results_dir: Base directory for CSV output.
        plots_dir: Base directory for plot output.
        gallery_type: Gallery prefix; the gallery is looked up as
            ``gallery_<gallery_type>_augmented`` (the pipeline passes
            ``'oneshot'`` and ``'fewshot'``).

    Returns:
        ``{segment_name: results}`` as returned by
        ``evaluate_segmented_comprehensive``, or ``None`` when the gallery or
        the segmented probe embeddings are missing.
    """
    print(f"\n{'='*70}")
    print(f"SEGMENTED EVALUATION: {model_name} ({gallery_type})")
    print(f"{'='*70}")

    gallery_key = f'gallery_{gallery_type}_augmented'
    # .get() so a file that was never loaded (key absent) reaches the
    # "Missing embeddings!" path instead of raising KeyError.
    gallery = embeddings.get(gallery_key)
    probe = embeddings.get('probe_positive_segmented')

    if gallery is None or probe is None:
        print("Missing embeddings!")
        return None

    # Stop value 0.91 keeps 0.90 inside the half-open arange interval.
    thresholds = np.arange(0.2, 0.91, 0.05)

    print("\nEvaluating with MEAN aggregation...")
    segment_results = evaluate_segmented_comprehensive(
        gallery, probe, thresholds, aggregation='mean', k=3
    )

    for segment_name, results in segment_results.items():
        csv_path = results_dir / model_name / f'segmented_{gallery_type}_{segment_name}_metrics.csv'
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        results['threshold_results'].to_csv(csv_path, index=False)

        plot_path = plots_dir / model_name / f'segmented_{gallery_type}_{segment_name}_plot.png'
        plot_path.parent.mkdir(parents=True, exist_ok=True)
        plot_all_metrics(results,
                        f"{model_name} - {segment_name} ({gallery_type})",
                        plot_path)

        df = results['threshold_results']
        # idxmax picks the first threshold that attains the best Rank-1.
        best_idx = df['rank1_accuracy'].idxmax()
        print(f"\n  {segment_name}:")
        print(f"    Rank-1: {df.loc[best_idx, 'rank1_accuracy']:.4f} "
              f"@ threshold {df.loc[best_idx, 'threshold']:.2f}")
        print(f"    ROC-AUC: {results['roc_auc']:.4f}")
        print(f"    d-prime: {results['dprime']:.3f}")

    return segment_results


In [148]:
def run_complete_evaluation_pipeline(all_embeddings: Dict, output_base_dir: Path):
    """
    Evaluate every model, derive the cross-model summaries, draw the
    comparison charts and export the reports.

    Args:
        all_embeddings: Dict with structure {model_name: embeddings_dict}
        output_base_dir: Base directory for all outputs

    Returns:
        Tuple (all_model_results, all_summaries): the per-model results keyed
        by model name, and the summary tables/texts keyed by summary name.
    """
    rule_eq = "=" * 80
    rule_hash = "#" * 80

    def _section(label):
        # Three-line banner that opens each major stage.
        print(f"\n{rule_hash}")
        print(label)
        print(rule_hash)

    print("\n" + rule_eq)
    print("COMPLETE FACE RECOGNITION EVALUATION PIPELINE")
    print(rule_eq)

    results_dir = output_base_dir / 'evaluation_results'
    plots_dir = output_base_dir / 'plots'
    comparison_dir = output_base_dir / 'comparisons'
    charts_dir = comparison_dir / "charts"
    reports_dir = comparison_dir / "reports"
    for folder in (results_dir, plots_dir, comparison_dir, charts_dir, reports_dir):
        folder.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Per-model evaluations
    # ------------------------------------------------------------------
    all_model_results = {}
    for model_name, embeddings in all_embeddings.items():
        _section(f"# PROCESSING MODEL: {model_name}")
        shared_args = (model_name, embeddings, results_dir, plots_dir)
        # Dict values are evaluated top to bottom, so the four evaluations
        # run in the listed order.
        all_model_results[model_name] = {
            'basic_probe': run_basic_probe_evaluation(*shared_args),
            'impostor': run_impostor_evaluation(*shared_args),
            'segmented_oneshot': run_segmented_evaluation(*shared_args, 'oneshot'),
            'segmented_fewshot': run_segmented_evaluation(*shared_args, 'fewshot'),
        }

    # ------------------------------------------------------------------
    # Comparative analysis
    # ------------------------------------------------------------------
    _section("# COMPARATIVE ANALYSIS")

    all_summaries = {}

    def _store_table(key, table, csv_name, **csv_options):
        # Register a summary table and mirror it to CSV in comparison_dir.
        all_summaries[key] = table
        csv_path = comparison_dir / csv_name
        table.to_csv(csv_path, **csv_options)
        print(f"   Saved: {csv_path}")

    print("\n1. Generating comparison summary...")
    _store_table('comparison_summary',
                 generate_comparison_summary(all_model_results),
                 'comparison_summary.csv', index=False)

    print("\n2. Analyzing gallery strategies...")
    _store_table('gallery_strategy',
                 analyze_gallery_strategies(all_model_results),
                 'gallery_strategy_analysis.csv', index=False)

    print("\n3. Analyzing aggregation methods...")
    _store_table('aggregation_analysis',
                 summarize_aggregation_performance(all_model_results),
                 'aggregation_analysis.csv', index=False)

    print("\n4. Generating threshold recommendations...")
    _store_table('threshold_recommendations',
                 recommend_operating_thresholds(all_model_results),
                 'threshold_recommendations.csv', index=False)

    print("\n5. Creating segmented comparison tables...")
    for shot in ('oneshot', 'fewshot'):
        # These tables are written with their index (no index=False).
        _store_table(f'segmented_{shot}',
                     create_segmented_comparison_table(all_model_results, shot),
                     f'segmented_{shot}_comparison.csv')

    print("\n6. Analyzing failure cases...")
    all_summaries['failure_analysis'] = analyze_failure_cases(all_model_results)
    failure_path = comparison_dir / 'failure_analysis.json'
    with open(failure_path, 'w') as handle:
        json.dump(all_summaries['failure_analysis'], handle, indent=2)
    print(f"   Saved: {failure_path}")

    print("\n7. Performing statistical comparisons...")
    _store_table('statistical_comparison',
                 compare_models_statistical(all_model_results),
                 'statistical_comparison.csv', index=False)

    print("\n8. Generating executive summary...")
    executive_summary = generate_executive_summary(
        all_model_results, all_summaries['comparison_summary']
    )
    all_summaries['executive_summary'] = executive_summary
    print(executive_summary)

    # ------------------------------------------------------------------
    # Visualizations
    # ------------------------------------------------------------------
    _section("# GENERATING COMPARISON VISUALIZATIONS")

    print("\n1. Creating model comparison charts...")
    plot_model_comparison_charts(
        all_model_results, all_summaries['comparison_summary'], charts_dir
    )

    print("\n2. Creating segmented performance heatmaps...")
    for shot in ('oneshot', 'fewshot'):
        plot_segmented_heatmap(
            all_summaries[f'segmented_{shot}'],
            charts_dir / f'segmented_{shot}_heatmap.png',
            f'Segmented Performance - {shot.capitalize()}'
        )

    print("\n3. Creating gallery strategy visualizations...")
    plot_gallery_strategy_comparison(
        all_summaries['gallery_strategy'],
        charts_dir / 'gallery_strategy_comparison.png'
    )

    # ------------------------------------------------------------------
    # Export reports
    # ------------------------------------------------------------------
    _section("# EXPORTING COMPREHENSIVE REPORTS")

    export_comprehensive_report(all_model_results, all_summaries, reports_dir)

    print(f"\n{rule_eq}")
    print("EVALUATION PIPELINE COMPLETE")
    print(rule_eq)
    print(f"\nAll results saved to: {output_base_dir}")
    print(f"  - Individual results: {results_dir}")
    print(f"  - Individual plots: {plots_dir}")
    print(f"  - Comparisons: {comparison_dir}")
    print(f"  - Reports: {reports_dir}")

    return all_model_results, all_summaries

In [149]:
def load_pkl(path: Path):
    """Read the pickle file at ``path`` and return the unpickled object."""
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files produced by this project.
    with open(path, mode="rb") as stream:
        payload = pickle.load(stream)
    return payload


def load_all_embeddings(base_dir: Path):
    """
    Auto-loads all embeddings following this structure:

    base_dir /
        adaface_ir_101 /
            gallery_few-shot_augmented.pkl
            gallery_few-shot_base.pkl
            ...
        arcface_ir_50 /
            ...

    Every sub-directory of ``base_dir`` is treated as a model. Models and
    their files are visited in sorted order so the returned dict has a
    deterministic key order. ``.pkl`` files whose name is not in the known
    mapping are reported and skipped; non-``.pkl`` files are ignored.

    Args:
        base_dir: Directory containing one sub-directory per model.

    Returns: dict formatted exactly for run_complete_evaluation_pipeline(),
        i.e. ``{model_name: {embedding_key: loaded_object}}``. A model
        directory without recognized files maps to an empty dict.
    """
    mapping = {
        "gallery_one-shot_base.pkl": "gallery_oneshot_base",
        "gallery_one-shot_augmented.pkl": "gallery_oneshot_augmented",
        "gallery_few-shot_base.pkl": "gallery_fewshot_base",
        "gallery_few-shot_augmented.pkl": "gallery_fewshot_augmented",
        "probe_negative.pkl": "probe_negative",
        "probe_positive_segmented.pkl": "probe_positive_segmented",
        "probe_positive_unsegmented.pkl": "probe_positive_unsegmented",
    }

    all_embeddings = {}

    # Path.iterdir() yields entries in arbitrary order; sorting makes the
    # model order (and therefore anything derived from dict order downstream)
    # reproducible across runs and platforms.
    for model_dir in sorted(base_dir.iterdir()):
        if not model_dir.is_dir():
            continue

        model_name = model_dir.name
        all_embeddings[model_name] = {}

        for file in sorted(model_dir.iterdir()):
            if file.suffix != ".pkl":
                continue

            key = mapping.get(file.name)
            if key is None:
                print(f"Warning: Unrecognized file {file.name}, skipping…")
                continue

            all_embeddings[model_name][key] = load_pkl(file)

    return all_embeddings

In [150]:
# Load each configured model's pickled embeddings into
# all_embeddings[model][key]. Gallery files are keyed by the first fragment
# below that occurs in their filename; every other pickle
# (probe_negative, probe_positive_segmented, ...) is keyed by its filename
# with ".pkl" removed.
_GALLERY_KEY_BY_FRAGMENT = (
    ("one-shot_base", "gallery_oneshot_base"),
    ("one-shot_augmented", "gallery_oneshot_augmented"),
    ("few-shot_base", "gallery_fewshot_base"),
    ("few-shot_augmented", "gallery_fewshot_augmented"),
)

all_embeddings = {}

for model in models:
    model_dir = embeddings_root / model
    if model_dir.exists():
        all_embeddings[model] = {}

        for file in model_dir.glob("*.pkl"):
            fname = file.name
            key = next(
                (mapped for fragment, mapped in _GALLERY_KEY_BY_FRAGMENT if fragment in fname),
                fname.replace(".pkl", ""),
            )

            with open(file, "rb") as f:
                all_embeddings[model][key] = pickle.load(f)
    else:
        print(f"Warning: {model_dir} not found, skipping...")

In [151]:
# Run the full pipeline on the loaded embeddings, then echo the headline
# summaries below the pipeline's own log output.
all_results, all_summaries = run_complete_evaluation_pipeline(all_embeddings, output_root)

for _summary_key in ('executive_summary', 'comparison_summary', 'gallery_strategy'):
    print(all_summaries[_summary_key])


COMPLETE FACE RECOGNITION EVALUATION PIPELINE

################################################################################
# PROCESSING MODEL: adaface_ir_50
################################################################################

BASIC PROBE EVALUATION: adaface_ir_50

----------------------------------------------------------------------
GALLERY: ONESHOT_BASE
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 184.94it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_oneshot_base_max_plot.png
  Best Rank-1: 0.5183 @ threshold 0.20
  ROC-AUC: 0.9150
  d-prime: 1.360

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 111.72it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_oneshot_base_mean_plot.png
  Best Rank-1: 0.5183 @ threshold 0.20
  ROC-AUC: 0.9150
  d-prime: 1.360

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 112.68it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_oneshot_base_topk_plot.png
  Best Rank-1: 0.5183 @ threshold 0.20
  ROC-AUC: 0.9150
  d-prime: 1.360

----------------------------------------------------------------------
GALLERY: ONESHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 30.71it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_oneshot_augmented_max_plot.png
  Best Rank-1: 0.5640 @ threshold 0.20
  ROC-AUC: 0.8854
  d-prime: 1.380

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 31.02it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_oneshot_augmented_mean_plot.png
  Best Rank-1: 0.5457 @ threshold 0.20
  ROC-AUC: 0.9043
  d-prime: 1.380

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 28.77it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_oneshot_augmented_topk_plot.png
  Best Rank-1: 0.5488 @ threshold 0.20
  ROC-AUC: 0.9031
  d-prime: 1.376

----------------------------------------------------------------------
GALLERY: FEWSHOT_BASE
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 74.12it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_fewshot_base_max_plot.png
  Best Rank-1: 0.5274 @ threshold 0.20
  ROC-AUC: 0.9140
  d-prime: 1.400

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 53.88it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_fewshot_base_mean_plot.png
  Best Rank-1: 0.5884 @ threshold 0.20
  ROC-AUC: 0.9225
  d-prime: 1.440

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 52.66it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_fewshot_base_topk_plot.png
  Best Rank-1: 0.5518 @ threshold 0.20
  ROC-AUC: 0.9241
  d-prime: 1.426

----------------------------------------------------------------------
GALLERY: FEWSHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:02<00:00,  9.29it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_fewshot_augmented_max_plot.png
  Best Rank-1: 0.5335 @ threshold 0.20
  ROC-AUC: 0.8850
  d-prime: 1.408

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:02<00:00, 10.12it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_fewshot_augmented_mean_plot.png
  Best Rank-1: 0.5915 @ threshold 0.20
  ROC-AUC: 0.9184
  d-prime: 1.454

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:02<00:00,  9.11it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\basic_probe_fewshot_augmented_topk_plot.png
  Best Rank-1: 0.5335 @ threshold 0.20
  ROC-AUC: 0.9015
  d-prime: 1.416

IMPOSTOR EVALUATION: adaface_ir_50

Evaluating with MEAN aggregation...


Processing impostors (mean): 100%|██████████| 2/2 [00:00<00:00,  3.50it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\impostor_plot.png
  Best Rejection Rate: 1.0000 @ threshold 0.35
  FAR at best: 0.0000
  Total impostors: 252
  Mean impostor score: 0.1099
  Std impostor score: 0.0517
  95% CI: [0.1032, 0.1161]

SEGMENTED EVALUATION: adaface_ir_50 (oneshot)

Evaluating with MEAN aggregation...
Found 8 segments: ['baseline', 'left', 'center', 'right', 'high_pitch', 'high_yaw', 'blur', 'low_quality']


Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 180.29it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 83.08it/s] 
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 62.84it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 75.75it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 153.80it/s]]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 77.46it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 33.64it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 59.35it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:04<00:00,  1.92it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_oneshot_baseline_plot.png

  baseline:
    Rank-1: 0.9167 @ threshold 0.20
    ROC-AUC: 0.8409
    d-prime: 3.504
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_oneshot_left_plot.png

  left:
    Rank-1: 0.5591 @ threshold 0.20
    ROC-AUC: 0.8677
    d-prime: 1.284
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_oneshot_center_plot.png

  center:
    Rank-1: 0.5461 @ threshold 0.20
    ROC-AUC: 0.9129
    d-prime: 1.372
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_oneshot_right_plot.png

  right:
    Rank-1: 0.5319 @ threshold 0.20
    ROC-AUC: 0.9382
    d-prime: 1.507
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_oneshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.2000 @ threshold 0.20
    ROC-AUC: 0.5000
    d-prime: 1.716
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_oneshot_high_yaw_plot.pn

Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 58.80it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 28.87it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:01<00:00, 18.09it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 21.93it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 45.74it/s]t]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 22.80it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:02<00:00, 10.11it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:01<00:00, 20.90it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:09<00:00,  1.16s/it]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_fewshot_baseline_plot.png

  baseline:
    Rank-1: 0.9167 @ threshold 0.20
    ROC-AUC: 0.9545
    d-prime: 3.664
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_fewshot_left_plot.png

  left:
    Rank-1: 0.6129 @ threshold 0.20
    ROC-AUC: 0.8767
    d-prime: 1.366
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_fewshot_center_plot.png

  center:
    Rank-1: 0.5674 @ threshold 0.20
    ROC-AUC: 0.9365
    d-prime: 1.429
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_fewshot_right_plot.png

  right:
    Rank-1: 0.6064 @ threshold 0.20
    ROC-AUC: 0.9507
    d-prime: 1.596
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_fewshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.2000 @ threshold 0.20
    ROC-AUC: 1.0000
    d-prime: 1.536
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_50\segmented_fewshot_high_yaw_plot.pn

Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 204.40it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_oneshot_base_max_plot.png
  Best Rank-1: 0.6189 @ threshold 0.20
  ROC-AUC: 0.9336
  d-prime: 1.473

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 98.04it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_oneshot_base_mean_plot.png
  Best Rank-1: 0.6189 @ threshold 0.20
  ROC-AUC: 0.9336
  d-prime: 1.473

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 98.28it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_oneshot_base_topk_plot.png
  Best Rank-1: 0.6189 @ threshold 0.20
  ROC-AUC: 0.9336
  d-prime: 1.473

----------------------------------------------------------------------
GALLERY: ONESHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 32.78it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_oneshot_augmented_max_plot.png
  Best Rank-1: 0.6067 @ threshold 0.20
  ROC-AUC: 0.9337
  d-prime: 1.491

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 29.50it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_oneshot_augmented_mean_plot.png
  Best Rank-1: 0.6128 @ threshold 0.20
  ROC-AUC: 0.9408
  d-prime: 1.489

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 27.66it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_oneshot_augmented_topk_plot.png
  Best Rank-1: 0.6098 @ threshold 0.20
  ROC-AUC: 0.9391
  d-prime: 1.489

----------------------------------------------------------------------
GALLERY: FEWSHOT_BASE
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 73.93it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_fewshot_base_max_plot.png
  Best Rank-1: 0.5640 @ threshold 0.20
  ROC-AUC: 0.9074
  d-prime: 1.493

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 52.83it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_fewshot_base_mean_plot.png
  Best Rank-1: 0.6341 @ threshold 0.20
  ROC-AUC: 0.9143
  d-prime: 1.528

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 50.37it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_fewshot_base_topk_plot.png
  Best Rank-1: 0.6006 @ threshold 0.20
  ROC-AUC: 0.9136
  d-prime: 1.507

----------------------------------------------------------------------
GALLERY: FEWSHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:02<00:00,  9.79it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_fewshot_augmented_max_plot.png
  Best Rank-1: 0.5610 @ threshold 0.20
  ROC-AUC: 0.9063
  d-prime: 1.494

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:02<00:00, 10.27it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_fewshot_augmented_mean_plot.png
  Best Rank-1: 0.6463 @ threshold 0.20
  ROC-AUC: 0.9122
  d-prime: 1.542

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:02<00:00,  9.27it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\basic_probe_fewshot_augmented_topk_plot.png
  Best Rank-1: 0.5671 @ threshold 0.20
  ROC-AUC: 0.8953
  d-prime: 1.502

IMPOSTOR EVALUATION: adaface_ir_101

Evaluating with MEAN aggregation...


Processing impostors (mean): 100%|██████████| 2/2 [00:00<00:00,  3.00it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\impostor_plot.png
  Best Rejection Rate: 1.0000 @ threshold 0.35
  FAR at best: 0.0000
  Total impostors: 252
  Mean impostor score: 0.1001
  Std impostor score: 0.0557
  95% CI: [0.0939, 0.1076]

SEGMENTED EVALUATION: adaface_ir_101 (oneshot)

Evaluating with MEAN aggregation...
Found 8 segments: ['baseline', 'left', 'center', 'right', 'high_pitch', 'high_yaw', 'blur', 'low_quality']


Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 206.65it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 91.67it/s] 
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 62.58it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 76.52it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 166.72it/s]]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 78.57it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 33.07it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 60.99it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:04<00:00,  1.93it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_oneshot_baseline_plot.png

  baseline:
    Rank-1: 0.9583 @ threshold 0.20
    ROC-AUC: 1.0000
    d-prime: 3.516
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_oneshot_left_plot.png

  left:
    Rank-1: 0.6129 @ threshold 0.20
    ROC-AUC: 0.9488
    d-prime: 1.412
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_oneshot_center_plot.png

  center:
    Rank-1: 0.6028 @ threshold 0.20
    ROC-AUC: 0.9244
    d-prime: 1.460
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_oneshot_right_plot.png

  right:
    Rank-1: 0.6277 @ threshold 0.20
    ROC-AUC: 0.9554
    d-prime: 1.622
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_oneshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.4000 @ threshold 0.20
    ROC-AUC: 0.8333
    d-prime: 1.838
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_oneshot_high_yaw_p

Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 66.64it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 30.12it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:01<00:00, 20.47it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 25.84it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 54.05it/s]s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 27.40it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:02<00:00, 10.97it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:01<00:00, 19.46it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:08<00:00,  1.07s/it]
  ax7.set_xscale('log')


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_fewshot_baseline_plot.png

  baseline:
    Rank-1: 1.0000 @ threshold 0.20
    ROC-AUC: nan
    d-prime: 3.954
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_fewshot_left_plot.png

  left:
    Rank-1: 0.6344 @ threshold 0.20
    ROC-AUC: 0.9327
    d-prime: 1.454
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_fewshot_center_plot.png

  center:
    Rank-1: 0.6312 @ threshold 0.20
    ROC-AUC: 0.8917
    d-prime: 1.507
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_fewshot_right_plot.png

  right:
    Rank-1: 0.6809 @ threshold 0.20
    ROC-AUC: 0.9182
    d-prime: 1.698
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_fewshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.6000 @ threshold 0.20
    ROC-AUC: 0.6667
    d-prime: 1.975
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\adaface_ir_101\segmented_fewshot_high_yaw_plot

Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 256.83it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_oneshot_base_max_plot.png
  Best Rank-1: 0.4970 @ threshold 0.20
  ROC-AUC: 0.8733
  d-prime: 1.246

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 122.14it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_oneshot_base_mean_plot.png
  Best Rank-1: 0.4970 @ threshold 0.20
  ROC-AUC: 0.8733
  d-prime: 1.246

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 117.30it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_oneshot_base_topk_plot.png
  Best Rank-1: 0.4970 @ threshold 0.20
  ROC-AUC: 0.8733
  d-prime: 1.246

----------------------------------------------------------------------
GALLERY: ONESHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 37.85it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_oneshot_augmented_max_plot.png
  Best Rank-1: 0.4817 @ threshold 0.20
  ROC-AUC: 0.8941
  d-prime: 1.226

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 32.10it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_oneshot_augmented_mean_plot.png
  Best Rank-1: 0.4878 @ threshold 0.20
  ROC-AUC: 0.8925
  d-prime: 1.248

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 31.51it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_oneshot_augmented_topk_plot.png
  Best Rank-1: 0.4848 @ threshold 0.20
  ROC-AUC: 0.8860
  d-prime: 1.237

----------------------------------------------------------------------
GALLERY: FEWSHOT_BASE
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 81.09it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_fewshot_base_max_plot.png
  Best Rank-1: 0.4939 @ threshold 0.20
  ROC-AUC: 0.9157
  d-prime: 1.222

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 56.89it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_fewshot_base_mean_plot.png
  Best Rank-1: 0.5335 @ threshold 0.20
  ROC-AUC: 0.8892
  d-prime: 1.266

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 57.30it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_fewshot_base_topk_plot.png
  Best Rank-1: 0.5244 @ threshold 0.20
  ROC-AUC: 0.8830
  d-prime: 1.246

----------------------------------------------------------------------
GALLERY: FEWSHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:02<00:00, 10.84it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_fewshot_augmented_max_plot.png
  Best Rank-1: 0.4878 @ threshold 0.20
  ROC-AUC: 0.8882
  d-prime: 1.216

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:02<00:00, 10.13it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_fewshot_augmented_mean_plot.png
  Best Rank-1: 0.5335 @ threshold 0.20
  ROC-AUC: 0.8948
  d-prime: 1.279

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:02<00:00, 10.39it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\basic_probe_fewshot_augmented_topk_plot.png
  Best Rank-1: 0.5000 @ threshold 0.20
  ROC-AUC: 0.8925
  d-prime: 1.228

IMPOSTOR EVALUATION: arcface_ir_50

Evaluating with MEAN aggregation...


Processing impostors (mean): 100%|██████████| 2/2 [00:00<00:00,  3.74it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\impostor_plot.png
  Best Rejection Rate: 1.0000 @ threshold 0.40
  FAR at best: 0.0000
  Total impostors: 252
  Mean impostor score: 0.1455
  Std impostor score: 0.0546
  95% CI: [0.1387, 0.1521]

SEGMENTED EVALUATION: arcface_ir_50 (oneshot)

Evaluating with MEAN aggregation...
Found 8 segments: ['baseline', 'left', 'center', 'right', 'high_pitch', 'high_yaw', 'blur', 'low_quality']


Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 209.47it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 99.86it/s] 
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 69.69it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 86.85it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 173.40it/s]]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 89.16it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 37.23it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 65.00it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:03<00:00,  2.14it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_oneshot_baseline_plot.png

  baseline:
    Rank-1: 0.7917 @ threshold 0.20
    ROC-AUC: 0.8000
    d-prime: 2.280
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_oneshot_left_plot.png

  left:
    Rank-1: 0.4839 @ threshold 0.20
    ROC-AUC: 0.8829
    d-prime: 1.163
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_oneshot_center_plot.png

  center:
    Rank-1: 0.4610 @ threshold 0.20
    ROC-AUC: 0.9221
    d-prime: 1.243
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_oneshot_right_plot.png

  right:
    Rank-1: 0.5319 @ threshold 0.20
    ROC-AUC: 0.8514
    d-prime: 1.358
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_oneshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.2000 @ threshold 0.20
    ROC-AUC: 0.7500
    d-prime: 2.039
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_oneshot_high_yaw_plot.pn

Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 63.48it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 32.15it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 22.99it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 28.53it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 60.50it/s]s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 27.45it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:02<00:00, 11.49it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:01<00:00, 21.38it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:08<00:00,  1.01s/it]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_baseline_plot.png

  baseline:
    Rank-1: 0.8750 @ threshold 0.20
    ROC-AUC: 0.9206
    d-prime: 2.538
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_left_plot.png

  left:
    Rank-1: 0.5161 @ threshold 0.20
    ROC-AUC: 0.8940
    d-prime: 1.201
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_center_plot.png

  center:
    Rank-1: 0.4894 @ threshold 0.20
    ROC-AUC: 0.9157
    d-prime: 1.249
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_right_plot.png

  right:
    Rank-1: 0.6170 @ threshold 0.20
    ROC-AUC: 0.8688
    d-prime: 1.414


  ax7.set_xscale('log')


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.4000 @ threshold 0.20
    ROC-AUC: 0.3333
    d-prime: 1.937
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_high_yaw_plot.png

  high_yaw:
    Rank-1: 0.3645 @ threshold 0.20
    ROC-AUC: 0.8035
    d-prime: 0.999
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_blur_plot.png

  blur:
    Rank-1: 0.4982 @ threshold 0.20
    ROC-AUC: 0.8839
    d-prime: 1.214
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_50\segmented_fewshot_low_quality_plot.png

  low_quality:
    Rank-1: 0.3487 @ threshold 0.20
    ROC-AUC: 0.8696
    d-prime: 0.919

################################################################################
# PROCESSING MODEL: arcface_ir_101
################################################################################

BASIC PROBE EVALUATION: arcface_ir_101

-----------------------

Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 251.59it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_oneshot_base_max_plot.png
  Best Rank-1: 0.5518 @ threshold 0.20
  ROC-AUC: 0.9005
  d-prime: 1.367

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 121.01it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_oneshot_base_mean_plot.png
  Best Rank-1: 0.5518 @ threshold 0.20
  ROC-AUC: 0.9005
  d-prime: 1.367

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 114.97it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_oneshot_base_topk_plot.png
  Best Rank-1: 0.5518 @ threshold 0.20
  ROC-AUC: 0.9005
  d-prime: 1.367

----------------------------------------------------------------------
GALLERY: ONESHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 38.49it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_oneshot_augmented_max_plot.png
  Best Rank-1: 0.5579 @ threshold 0.20
  ROC-AUC: 0.8894
  d-prime: 1.363

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 31.06it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_oneshot_augmented_mean_plot.png
  Best Rank-1: 0.5579 @ threshold 0.20
  ROC-AUC: 0.8893
  d-prime: 1.370

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 29.59it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_oneshot_augmented_topk_plot.png
  Best Rank-1: 0.5579 @ threshold 0.20
  ROC-AUC: 0.8911
  d-prime: 1.367

----------------------------------------------------------------------
GALLERY: FEWSHOT_BASE
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:00<00:00, 75.26it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_fewshot_base_max_plot.png
  Best Rank-1: 0.5366 @ threshold 0.20
  ROC-AUC: 0.9163
  d-prime: 1.358

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 58.50it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_fewshot_base_mean_plot.png
  Best Rank-1: 0.5945 @ threshold 0.20
  ROC-AUC: 0.9277
  d-prime: 1.407

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:00<00:00, 50.97it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_fewshot_base_topk_plot.png
  Best Rank-1: 0.5701 @ threshold 0.20
  ROC-AUC: 0.9218
  d-prime: 1.385

----------------------------------------------------------------------
GALLERY: FEWSHOT_AUGMENTED
----------------------------------------------------------------------

Evaluating with MAX aggregation...


Processing probes (max): 100%|██████████| 23/23 [00:02<00:00, 10.27it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_fewshot_augmented_max_plot.png
  Best Rank-1: 0.5366 @ threshold 0.20
  ROC-AUC: 0.8137
  d-prime: 1.330

Evaluating with MEAN aggregation...


Processing probes (mean): 100%|██████████| 23/23 [00:02<00:00,  9.97it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_fewshot_augmented_mean_plot.png
  Best Rank-1: 0.5854 @ threshold 0.20
  ROC-AUC: 0.9409
  d-prime: 1.413

Evaluating with TOPK aggregation...


Processing probes (topk): 100%|██████████| 23/23 [00:02<00:00, 10.33it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\basic_probe_fewshot_augmented_topk_plot.png
  Best Rank-1: 0.5427 @ threshold 0.20
  ROC-AUC: 0.9005
  d-prime: 1.352

IMPOSTOR EVALUATION: arcface_ir_101

Evaluating with MEAN aggregation...


Processing impostors (mean): 100%|██████████| 2/2 [00:00<00:00,  3.52it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\impostor_plot.png
  Best Rejection Rate: 1.0000 @ threshold 0.40
  FAR at best: 0.0000
  Total impostors: 252
  Mean impostor score: 0.1377
  Std impostor score: 0.0537
  95% CI: [0.1311, 0.1440]

SEGMENTED EVALUATION: arcface_ir_101 (oneshot)

Evaluating with MEAN aggregation...
Found 8 segments: ['baseline', 'left', 'center', 'right', 'high_pitch', 'high_yaw', 'blur', 'low_quality']


Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 187.84it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 93.31it/s] 
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 59.60it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 82.91it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 163.69it/s]]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 70.73it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:00<00:00, 32.57it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 62.18it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:04<00:00,  1.91it/s]


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_baseline_plot.png

  baseline:
    Rank-1: 0.7917 @ threshold 0.20
    ROC-AUC: 0.8632
    d-prime: 3.081
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_left_plot.png

  left:
    Rank-1: 0.5484 @ threshold 0.20
    ROC-AUC: 0.8679
    d-prime: 1.304
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_center_plot.png

  center:
    Rank-1: 0.5603 @ threshold 0.20
    ROC-AUC: 0.8722
    d-prime: 1.335
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_right_plot.png

  right:
    Rank-1: 0.5638 @ threshold 0.20
    ROC-AUC: 0.9314
    d-prime: 1.503


  ax7.set_xscale('log')


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.4000 @ threshold 0.20
    ROC-AUC: 1.0000
    d-prime: 1.129
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_high_yaw_plot.png

  high_yaw:
    Rank-1: 0.4486 @ threshold 0.20
    ROC-AUC: 0.8566
    d-prime: 1.095
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_blur_plot.png

  blur:
    Rank-1: 0.5194 @ threshold 0.20
    ROC-AUC: 0.8936
    d-prime: 1.282
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_oneshot_low_quality_plot.png

  low_quality:
    Rank-1: 0.4211 @ threshold 0.20
    ROC-AUC: 0.8881
    d-prime: 1.031

SEGMENTED EVALUATION: arcface_ir_101 (fewshot)

Evaluating with MEAN aggregation...
Found 8 segments: ['baseline', 'left', 'center', 'right', 'high_pitch', 'high_yaw', 'blur', 'low_quality']


Processing probes (mean): 100%|██████████| 11/11 [00:00<00:00, 67.42it/s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 33.16it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:00<00:00, 22.83it/s]
Processing probes (mean): 100%|██████████| 18/18 [00:00<00:00, 27.88it/s]
Processing probes (mean): 100%|██████████| 2/2 [00:00<00:00, 60.61it/s]s]
Processing probes (mean): 100%|██████████| 21/21 [00:00<00:00, 29.08it/s]
Processing probes (mean): 100%|██████████| 23/23 [00:01<00:00, 12.14it/s]
Processing probes (mean): 100%|██████████| 22/22 [00:01<00:00, 19.95it/s]
Processing segments (mean): 100%|██████████| 8/8 [00:07<00:00,  1.01it/s]
  ax7.set_xscale('log')


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_baseline_plot.png

  baseline:
    Rank-1: 0.9583 @ threshold 0.20
    ROC-AUC: 1.0000
    d-prime: 3.172
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_left_plot.png

  left:
    Rank-1: 0.5484 @ threshold 0.20
    ROC-AUC: 0.9290
    d-prime: 1.343
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_center_plot.png

  center:
    Rank-1: 0.5957 @ threshold 0.20
    ROC-AUC: 0.9373
    d-prime: 1.378
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_right_plot.png

  right:
    Rank-1: 0.6064 @ threshold 0.20
    ROC-AUC: 0.9587
    d-prime: 1.544


  ax7.set_xscale('log')


Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_high_pitch_plot.png

  high_pitch:
    Rank-1: 0.6000 @ threshold 0.20
    ROC-AUC: 1.0000
    d-prime: 1.108
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_high_yaw_plot.png

  high_yaw:
    Rank-1: 0.4299 @ threshold 0.20
    ROC-AUC: 0.9191
    d-prime: 1.113
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_blur_plot.png

  blur:
    Rank-1: 0.5336 @ threshold 0.20
    ROC-AUC: 0.9427
    d-prime: 1.327
Plot saved: d:\KEVIN\0SLC\RIG\output\v0\plots\arcface_ir_101\segmented_fewshot_low_quality_plot.png

  low_quality:
    Rank-1: 0.4276 @ threshold 0.20
    ROC-AUC: 0.9294
    d-prime: 1.065

################################################################################
# COMPARATIVE ANALYSIS
################################################################################

1. Generating comparison summary...
   Saved: d:\KEVIN\0SLC\RIG\output\

  best_per_gallery = comparison_summary.groupby('Gallery').apply(
  axes[idx].boxplot([genuine, impostor], labels=['Genuine', 'Impostor'])
  axes[idx].boxplot([genuine, impostor], labels=['Genuine', 'Impostor'])
  axes[idx].boxplot([genuine, impostor], labels=['Genuine', 'Impostor'])
  axes[idx].boxplot([genuine, impostor], labels=['Genuine', 'Impostor'])


Comparison charts saved to: d:\KEVIN\0SLC\RIG\output\v0\comparisons\charts

2. Creating segmented performance heatmaps...
Segmented heatmap saved: d:\KEVIN\0SLC\RIG\output\v0\comparisons\charts\segmented_oneshot_heatmap.png
Segmented heatmap saved: d:\KEVIN\0SLC\RIG\output\v0\comparisons\charts\segmented_fewshot_heatmap.png

3. Creating gallery strategy visualizations...
Gallery strategy plot saved: d:\KEVIN\0SLC\RIG\output\v0\comparisons\charts\gallery_strategy_comparison.png

################################################################################
# EXPORTING COMPREHENSIVE REPORTS
################################################################################
Excel report saved: d:\KEVIN\0SLC\RIG\output\v0\comparisons\reports\comprehensive_report.xlsx
JSON report saved: d:\KEVIN\0SLC\RIG\output\v0\comparisons\reports\comprehensive_report.json
Text summary saved: d:\KEVIN\0SLC\RIG\output\v0\comparisons\reports\executive_summary.txt
LaTeX tables saved: d:\KEVIN\0SLC\RIG\output