# Phase 3: Advanced Metrics & Analysis

## Objective
Add sophisticated code review evaluation metrics with statistical analysis and multi-model comparison

## Chain of Thought
1. Domain-specific metrics → Statistical analysis → Error categorization
2. Multi-model comparison → Significance testing → Insights generation

---

## Step 1: Import Dependencies and Load Previous Results

In [None]:
# Core imports
import os
import json
import re
import ast
from typing import Dict, List, Any, Optional, Tuple, Union
from datetime import datetime
from dataclasses import dataclass, asdict
from collections import Counter, defaultdict
import logging

# Data handling and analysis
import pandas as pd
import numpy as np
from tqdm import tqdm

# Statistical analysis
from scipy import stats
from scipy.stats import mannwhitneyu, wilcoxon, chi2_contingency, pearsonr
import statsmodels.api as sm
from statsmodels.stats.contingency_tables import mcnemar

# Text analysis
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease, flesch_kincaid_grade
nltk.download('vader_lexicon', quiet=True)

# Code analysis
import radon.complexity as radon_complexity
import radon.metrics as radon_metrics
from radon.visitors import Function

# Load Phase 2 components
import sys
sys.path.append('.')

# Visualization (minimal for Phase 3)
import matplotlib.pyplot as plt
import seaborn as sns

print("✅ All imports successful!")

# Setup logging
# Records at INFO level and above are written both to the file
# 'phase3_analysis.log' (relative to the working directory) and to a stream
# handler, using a timestamp / logger-name / level / message layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('phase3_analysis.log'),
        logging.StreamHandler()
    ]
)
# Named logger shared by the loader functions and analysis classes below.
logger = logging.getLogger('phase3_analysis')

In [None]:
# Load datasets from Phase 1
def load_phase1_data():
    """Read every ``*_processed.json`` file under ``data/processed``.

    Returns:
        A dict mapping the dataset name (the file name with the
        ``_processed.json`` marker removed) to the parsed JSON content.
        When ``data/processed`` does not exist, a warning is logged and the
        result of ``create_sample_datasets()`` is returned instead.
    """
    marker = '_processed.json'
    source_dir = 'data/processed'

    if os.path.exists(source_dir):
        loaded = {}
        for entry in os.listdir(source_dir):
            # Anything that is not a processed-dataset file is ignored.
            if not entry.endswith(marker):
                continue
            with open(os.path.join(source_dir, entry), 'r', encoding='utf-8') as handle:
                loaded[entry.replace(marker, '')] = json.load(handle)
        return loaded

    logger.warning("Phase 1 data not found. Creating sample data for testing.")
    return create_sample_datasets()

def create_sample_datasets():
    """Build a small in-memory stand-in for the Phase 1 datasets.

    Returns:
        A dict with a single ``'humaneval'`` entry whose ``'code'``,
        ``'reviews'`` and ``'labels'`` lists each hold three base items
        repeated 20 times (60 items per list).
    """
    repeats = 20

    snippets = [
        "def add(a, b):\n    return a + b",
        "def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n-1)",
        "def bubble_sort(arr):\n    n = len(arr)\n    for i in range(n):\n        for j in range(0, n-i-1):\n            if arr[j] > arr[j+1]:\n                arr[j], arr[j+1] = arr[j+1], arr[j]\n    return arr",
    ]
    review_texts = [
        "Simple addition function. Consider adding type hints.",
        "Recursive factorial implementation. Could add input validation.",
        "Bubble sort implementation with O(n²) complexity.",
    ]
    sentiments = ['positive', 'neutral', 'negative']

    humaneval = {
        'code': snippets * repeats,
        'reviews': review_texts * repeats,
        'labels': sentiments * repeats,
    }
    return {'humaneval': humaneval}

# Load Phase 2 results if available
def load_phase2_results():
    """Load the most recent Phase 2 evaluation results, if any exist.

    Looks in the ``data`` directory for files named
    ``phase2_results_*.json`` and parses the one whose file name sorts last.

    Returns:
        The parsed JSON content of that file, or ``None`` (after logging a
        warning) when the ``data`` directory is missing or contains no
        matching file.
    """
    results_dir = 'data'

    # A missing directory is treated the same as "no results yet" instead of
    # letting os.listdir raise FileNotFoundError, mirroring how the Phase 1
    # loader tolerates a missing data directory.
    if os.path.isdir(results_dir):
        results_files = [
            f for f in os.listdir(results_dir)
            if f.startswith('phase2_results_') and f.endswith('.json')
        ]
    else:
        results_files = []

    if not results_files:
        logger.warning("No Phase 2 results found. Phase 3 will run basic evaluation first.")
        return None

    # "Most recent" is decided by file name order, so the largest name wins.
    latest_file = max(results_files)
    with open(os.path.join(results_dir, latest_file), 'r', encoding='utf-8') as f:
        return json.load(f)

# Load data
# Phase 1 datasets come from data/processed (or sample data when that
# directory is missing); Phase 2 results may be None when none are found.
datasets = load_phase1_data()
phase2_results = load_phase2_results()

# Report what was loaded so the notebook output shows the starting state.
print(f"✅ Loaded {len(datasets)} datasets")
if phase2_results:
    print(f"✅ Loaded Phase 2 results with {len(phase2_results.get('individual_results', []))} evaluations")
else:
    print("⚠️ No Phase 2 results found - will generate sample data")

## Step 2: Implement Security Metrics

In [None]:
class SecurityMetrics:
    """Regex- and keyword-based security heuristics for code and reviews.

    The checks are purely textual: code is scanned with the regular
    expressions in ``VULNERABILITY_PATTERNS`` and reviews are scanned for the
    substrings in ``SECURITY_KEYWORDS``. Matches are indicators, not proof.
    """

    # Security vulnerability patterns, grouped by vulnerability class.
    VULNERABILITY_PATTERNS = {
        'sql_injection': [
            r'execute\s*\(.*%.*\)',
            r'query\s*\(.*\+.*\)',
            r'SELECT.*WHERE.*=.*%',
            r'cursor\.execute\([^?]*%'
        ],
        'xss': [
            r'innerHTML\s*=.*\+',
            r'document\.write\(.*\+',
            r'eval\(.*input',
            r'<script>.*</script>'
        ],
        'path_traversal': [
            r'\.\.[\\/]',
            r'os\.path\.join\(.*input',
            r'open\(.*\+.*\)',
            r'file\(.*user_input'
        ],
        'command_injection': [
            r'os\.system\(.*\+',
            r'subprocess\.call\([^\[]*%',
            r'exec\(.*input',
            r'eval\(.*request'
        ],
        # The quoted value may contain neither quotes nor a newline, so a
        # match never runs past the line the assignment is on.
        'hardcoded_secrets': [
            r'password\s*=\s*["\'][^"\'\n]*["\']',
            r'api_key\s*=\s*["\'][^"\'\n]*["\']',
            r'secret\s*=\s*["\'][^"\'\n]*["\']',
            r'token\s*=\s*["\'][^"\'\n]*["\']'
        ]
    }

    # Security keywords for review analysis (matched as lowercase substrings).
    SECURITY_KEYWORDS = {
        'positive': ['secure', 'safe', 'protected', 'validated', 'sanitized', 'encrypted'],
        'negative': ['vulnerable', 'insecure', 'unsafe', 'risk', 'exploit', 'attack', 'injection'],
        'neutral': ['authentication', 'authorization', 'security', 'check', 'validate']
    }

    @classmethod
    def detect_vulnerabilities(cls, code: str) -> Dict[str, List[str]]:
        """Scan ``code`` for the known vulnerability patterns.

        Args:
            code: Source text to scan.

        Returns:
            A dict mapping each vulnerability type that matched at least once
            to the list of matched text fragments. Types with no match are
            omitted, so an empty dict means nothing was detected.
        """
        vulnerabilities = {}

        for vuln_type, patterns in cls.VULNERABILITY_PATTERNS.items():
            matches = []
            for pattern in patterns:
                matches.extend(re.findall(pattern, code, re.IGNORECASE | re.MULTILINE))

            if matches:
                vulnerabilities[vuln_type] = matches

        return vulnerabilities

    @classmethod
    def analyze_security_review(cls, review: str) -> Dict[str, Any]:
        """Measure how much a review talks about security.

        Args:
            review: Review text.

        Returns:
            A dict with:
            - ``security_positive`` / ``security_negative`` /
              ``security_neutral``: number of distinct keywords of that
              category found in the lowercased review (substring match).
            - ``security_focus_ratio``: total keyword hits divided by the
              number of whitespace-separated words (at least 1).
            - ``mentions_vulnerabilities`` / ``mentions_protections``:
              whether any negative / positive keyword was found.
        """
        review_lower = review.lower()

        security_scores = {}
        for category, keywords in cls.SECURITY_KEYWORDS.items():
            score = sum(1 for keyword in keywords if keyword in review_lower)
            security_scores[f'security_{category}'] = score

        # Calculate overall security focus
        total_security_mentions = sum(security_scores.values())
        security_focus = total_security_mentions / max(len(review.split()), 1)

        return {
            **security_scores,
            'security_focus_ratio': security_focus,
            'mentions_vulnerabilities': security_scores['security_negative'] > 0,
            'mentions_protections': security_scores['security_positive'] > 0
        }

    @classmethod
    def vulnerability_detection_rate(cls, predicted_review: str, actual_vulnerabilities: Dict[str, List[str]]) -> Dict[str, float]:
        """Score one review against the vulnerabilities found in the code.

        This is a per-sample, all-or-nothing comparison: the review either
        uses a negative security keyword or it does not, and the code either
        has detected vulnerabilities or it does not.

        Args:
            predicted_review: Review text produced for the code.
            actual_vulnerabilities: Output of ``detect_vulnerabilities`` for
                the same code.

        Returns:
            A dict of 0.0/1.0 values: ``security_precision``,
            ``security_recall``, ``false_positive_rate``,
            ``false_negative_rate`` and ``vulnerability_coverage``.
        """
        review_analysis = cls.analyze_security_review(predicted_review)

        has_vulns = len(actual_vulnerabilities) > 0
        mentions_vulns = review_analysis['mentions_vulnerabilities']

        # False positives: review mentions vulnerabilities but none exist
        false_positive = mentions_vulns and not has_vulns

        # False negatives: vulnerabilities exist but review doesn't mention them
        false_negative = has_vulns and not mentions_vulns

        # The keyword check is not specific to a vulnerability type, so the
        # review covers either every detected type or none of them.
        coverage = 1.0 if (has_vulns and mentions_vulns) else 0.0

        return {
            # Precision/recall default to 1.0 when there is nothing to judge
            # (no claim made / no vulnerability present).
            'security_precision': 1.0 if not mentions_vulns else (1.0 if has_vulns else 0.0),
            'security_recall': 1.0 if not has_vulns else (1.0 if mentions_vulns else 0.0),
            'false_positive_rate': 1.0 if false_positive else 0.0,
            'false_negative_rate': 1.0 if false_negative else 0.0,
            'vulnerability_coverage': coverage
        }

print("✅ Security metrics implemented")

## Step 3: Implement Style & Readability Metrics

In [None]:
class StyleMetrics:
    """Code style and readability evaluation metrics.

    Code is inspected with the regular expressions in ``STYLE_PATTERNS`` plus
    a few AST-based counts; reviews are inspected for the substrings in
    ``QUALITY_KEYWORDS`` and scored for readability.
    """

    # Style violation patterns
    STYLE_PATTERNS = {
        'long_lines': r'.{80,}',  # Lines of 80 or more characters
        # Functions without docstrings. `(?!\s)` pins the position to the
        # first non-whitespace character of the body; without it the
        # preceding `\s*` can backtrack and the quote lookahead always passes.
        'no_docstring': r'def\s+\w+\([^)]*\):\s*\n\s*(?!\s)(?!["\']{3})',
        'camelCase': r'\b[a-z]+[A-Z][a-zA-Z]*\b',  # camelCase in Python
        # Single character variable names; `(?!=)` keeps `a == b` comparisons
        # from being counted as assignments.
        'single_char_vars': r'\b[a-zA-Z]\s*=(?!=)',
        'magic_numbers': r'\b\d{2,}\b',  # Magic numbers (2+ digits)
        'deep_nesting': r'\n(\s{12,})',  # Deep indentation (3+ levels)
    }

    # Code quality keywords (matched as lowercase substrings of the review)
    QUALITY_KEYWORDS = {
        'maintainability': ['refactor', 'modular', 'reusable', 'clean', 'organize'],
        'readability': ['readable', 'clear', 'understandable', 'documentation', 'comments'],
        'performance': ['efficient', 'optimize', 'performance', 'complexity', 'algorithm'],
        'testing': ['test', 'testing', 'coverage', 'assertion', 'mock']
    }

    @classmethod
    def analyze_code_style(cls, code: str) -> Dict[str, Any]:
        """Collect style-violation counts and basic size metrics for ``code``.

        Args:
            code: Python source text.

        Returns:
            A dict containing:
            - ``<pattern>_count`` for every key of ``STYLE_PATTERNS``;
            - ``function_count``, ``class_count``, ``loop_count``,
              ``conditional_count`` and ``ast_complexity`` (total AST node
              count), all 0 with ``parse_error: True`` added when the code
              cannot be parsed;
            - ``line_count``, ``avg_line_length``, ``blank_line_ratio`` and
              ``comment_ratio``.
        """
        style_issues = {}

        for issue_type, pattern in cls.STYLE_PATTERNS.items():
            style_issues[f'{issue_type}_count'] = len(re.findall(pattern, code))

        # Structural counts from the AST
        try:
            nodes = list(ast.walk(ast.parse(code)))
        except (SyntaxError, ValueError):
            # Unparseable input (ValueError covers sources containing null
            # bytes on some Python versions): fall back to zeros and flag it.
            style_issues.update({
                'function_count': 0,
                'class_count': 0,
                'loop_count': 0,
                'conditional_count': 0,
                'ast_complexity': 0,
                'parse_error': True
            })
        else:
            style_issues.update({
                'function_count': sum(isinstance(n, ast.FunctionDef) for n in nodes),
                'class_count': sum(isinstance(n, ast.ClassDef) for n in nodes),
                'loop_count': sum(isinstance(n, (ast.For, ast.While)) for n in nodes),
                'conditional_count': sum(isinstance(n, ast.If) for n in nodes),
                'ast_complexity': len(nodes)
            })

        # Basic metrics. str.split always yields at least one element, so the
        # divisions below cannot hit an empty list.
        lines = code.split('\n')
        style_issues.update({
            'line_count': len(lines),
            'avg_line_length': np.mean([len(line) for line in lines]),
            'blank_line_ratio': sum(1 for line in lines if not line.strip()) / len(lines),
            'comment_ratio': sum(1 for line in lines if line.strip().startswith('#')) / len(lines)
        })

        return style_issues

    @classmethod
    def analyze_review_quality_focus(cls, review: str) -> Dict[str, Any]:
        """Report which quality aspects a review talks about.

        Args:
            review: Review text.

        Returns:
            A dict with ``<category>_focus`` (number of distinct keywords of
            that category found in the lowercased review) for every key of
            ``QUALITY_KEYWORDS``, plus ``review_readability_score`` and
            ``review_grade_level``. Both readability values are 0 when they
            cannot be computed.
        """
        review_lower = review.lower()

        quality_focus = {}
        for category, keywords in cls.QUALITY_KEYWORDS.items():
            mentions = sum(1 for keyword in keywords if keyword in review_lower)
            quality_focus[f'{category}_focus'] = mentions

        # Review readability is best-effort: any failure in the readability
        # helpers falls back to zeros rather than aborting the analysis.
        try:
            quality_focus.update({
                'review_readability_score': flesch_reading_ease(review),
                'review_grade_level': flesch_kincaid_grade(review)
            })
        except Exception:
            quality_focus.update({
                'review_readability_score': 0,
                'review_grade_level': 0
            })

        return quality_focus

    @classmethod
    def style_improvement_coverage(cls, code: str, review: str) -> Dict[str, float]:
        """Measure how well a review addresses the code's actual style issues.

        Args:
            code: Python source text that was reviewed.
            review: Review text.

        Returns:
            A dict that always contains ``style_awareness_ratio`` (quality
            keyword mentions divided by the number of style-pattern matches,
            with a minimum divisor of 1). ``mentions_line_length``,
            ``mentions_documentation`` and ``mentions_complexity`` (each 0.0
            or 1.0) are present only when the corresponding issue was found
            in the code.
        """
        code_issues = cls.analyze_code_style(code)
        quality_focus = cls.analyze_review_quality_focus(review)
        review_lower = review.lower()

        coverage_metrics = {}

        # Style issue coverage
        if code_issues['long_lines_count'] > 0:
            coverage_metrics['mentions_line_length'] = 1.0 if 'line' in review_lower else 0.0

        if code_issues['no_docstring_count'] > 0:
            coverage_metrics['mentions_documentation'] = 1.0 if any(
                word in review_lower for word in ['doc', 'comment', 'explain']) else 0.0

        if code_issues['deep_nesting_count'] > 0:
            coverage_metrics['mentions_complexity'] = 1.0 if any(
                word in review_lower for word in ['complex', 'nest', 'simplify']) else 0.0

        # Overall style awareness. Only genuine violation counts enter the
        # issue total (not line_count/function_count/...), and only keyword
        # mention counts enter the mention total (not readability scores).
        total_issues = sum(code_issues[f'{issue_type}_count'] for issue_type in cls.STYLE_PATTERNS)
        total_quality_mentions = sum(quality_focus[f'{category}_focus'] for category in cls.QUALITY_KEYWORDS)

        coverage_metrics['style_awareness_ratio'] = total_quality_mentions / max(total_issues, 1)

        return coverage_metrics

print("✅ Style metrics implemented")

## Step 4: Statistical Analysis Framework

In [None]:
class StatisticalAnalysis:
    """Statistical helpers for evaluation results.

    All methods are static and operate on plain sequences of numbers.
    """

    @staticmethod
    def calculate_confidence_interval(data: List[float], confidence: float = 0.95) -> Tuple[float, float, float]:
        """Compute the mean and its t-distribution confidence interval.

        Args:
            data: Sample values.
            confidence: Confidence level of the interval (default 0.95).

        Returns:
            ``(mean, lower, upper)``. Empty input yields ``(0.0, 0.0, 0.0)``.
            With a single value, or when all values are identical, the
            interval cannot be estimated from the data and collapses to
            ``(mean, mean, mean)`` instead of NaN bounds.
        """
        values = np.asarray(data, dtype=float)
        if values.size == 0:
            return 0.0, 0.0, 0.0

        mean = float(np.mean(values))
        if values.size < 2:
            # The standard error needs at least two observations.
            return mean, mean, mean

        sem = stats.sem(values)  # Standard error of mean
        if sem == 0:
            # Zero spread: a zero-scale t interval would evaluate to NaN.
            return mean, mean, mean

        ci = stats.t.interval(confidence, values.size - 1, loc=mean, scale=sem)

        return mean, float(ci[0]), float(ci[1])

    @staticmethod
    def compare_metrics(group1: List[float], group2: List[float],
                       test_type: str = 'mannwhitney') -> Dict[str, Any]:
        """Compare two groups of metric values.

        Args:
            group1: Values of the first group.
            group2: Values of the second group.
            test_type: ``'mannwhitney'`` (default), ``'ttest'`` or
                ``'wilcoxon'``. ``'wilcoxon'`` is only used when both groups
                have the same length; any other case falls back to
                Mann-Whitney U.

        Returns:
            ``{'error': ...}`` when either group is empty. Otherwise a dict
            with per-group mean/std/median and ``effect_size`` (difference of
            means, group2 minus group1, over the root mean of the two
            variances). When the test runs, ``test_name``, ``statistic``,
            ``p_value``, ``is_significant`` (p < 0.05) and
            ``effect_interpretation`` are added; when it raises, an
            ``'error'`` entry is added instead.
        """
        if len(group1) == 0 or len(group2) == 0:
            return {'error': 'Empty groups provided'}

        mean1, mean2 = np.mean(group1), np.mean(group2)
        std1, std2 = np.std(group1), np.std(group2)
        mean_diff = mean2 - mean1
        pooled_std = np.sqrt((std1 ** 2 + std2 ** 2) / 2)

        # Avoid 0/0 (NaN) when both groups are constant: identical means give
        # no effect, different means an unbounded one.
        if pooled_std > 0:
            effect_size = mean_diff / pooled_std
        elif mean_diff == 0:
            effect_size = 0.0
        else:
            effect_size = float(np.copysign(np.inf, mean_diff))

        # Descriptive statistics
        desc_stats = {
            'group1_mean': mean1,
            'group1_std': std1,
            'group1_median': np.median(group1),
            'group2_mean': mean2,
            'group2_std': std2,
            'group2_median': np.median(group2),
            'effect_size': effect_size
        }

        # Statistical tests
        try:
            if test_type == 'mannwhitney':
                statistic, p_value = mannwhitneyu(group1, group2, alternative='two-sided')
                test_name = 'Mann-Whitney U'
            elif test_type == 'ttest':
                statistic, p_value = stats.ttest_ind(group1, group2)
                test_name = 'Independent t-test'
            elif test_type == 'wilcoxon' and len(group1) == len(group2):
                statistic, p_value = wilcoxon(group1, group2)
                test_name = 'Wilcoxon signed-rank'
            else:
                statistic, p_value = mannwhitneyu(group1, group2, alternative='two-sided')
                test_name = 'Mann-Whitney U (default)'
        except Exception as e:
            # Report the failure alongside the descriptive statistics rather
            # than aborting the whole comparison.
            return {**desc_stats, 'error': f'Statistical test failed: {str(e)}'}

        # Interpret results
        significance_level = 0.05
        is_significant = p_value < significance_level

        # Effect size interpretation
        magnitude = abs(effect_size)
        if magnitude < 0.2:
            effect_interpretation = 'negligible'
        elif magnitude < 0.5:
            effect_interpretation = 'small'
        elif magnitude < 0.8:
            effect_interpretation = 'medium'
        else:
            effect_interpretation = 'large'

        return {
            **desc_stats,
            'test_name': test_name,
            'statistic': statistic,
            'p_value': p_value,
            'is_significant': is_significant,
            'effect_interpretation': effect_interpretation
        }

    @staticmethod
    def multiple_comparisons_correction(p_values: List[float], method: str = 'bonferroni') -> List[float]:
        """Adjust p-values for multiple comparisons.

        Args:
            p_values: Raw p-values.
            method: ``'bonferroni'`` (default) or ``'holm'``. Any other value
                returns ``p_values`` unchanged.

        Returns:
            Adjusted p-values in the same order as the input, capped at 1.0.
            Holm adjustments are additionally made non-decreasing along the
            sorted raw p-values, as the step-down procedure requires.
        """
        n = len(p_values)

        if method == 'bonferroni':
            return [min(1.0, p * n) for p in p_values]
        elif method == 'holm':
            # Holm-Bonferroni method
            corrected = np.zeros(n, dtype=float)
            running_max = 0.0

            for rank, idx in enumerate(np.argsort(p_values)):
                adjusted = min(1.0, p_values[idx] * (n - rank))
                # A larger raw p-value may never get a smaller adjusted value.
                running_max = max(running_max, adjusted)
                corrected[idx] = running_max

            return corrected.tolist()
        else:
            return p_values

    @staticmethod
    def correlation_analysis(metric1: List[float], metric2: List[float]) -> Dict[str, Any]:
        """Analyze the correlation between two metrics.

        Args:
            metric1: Values of the first metric.
            metric2: Values of the second metric, paired by position.

        Returns:
            ``{'error': 'Invalid input data'}`` when the lengths differ or
            fewer than two pairs are given (a correlation is undefined for a
            single point). Otherwise a dict with Pearson and Spearman
            coefficients, their p-values, and a ``'weak'`` / ``'moderate'`` /
            ``'strong'`` label for each (``'undefined'`` when the coefficient
            is NaN, e.g. for constant input).
        """
        if len(metric1) != len(metric2) or len(metric1) < 2:
            return {'error': 'Invalid input data'}

        # Pearson correlation
        pearson_r, pearson_p = pearsonr(metric1, metric2)

        # Spearman correlation (rank-based)
        spearman_r, spearman_p = stats.spearmanr(metric1, metric2)

        # Interpretation
        def interpret_correlation(r):
            if np.isnan(r):
                # Every comparison with NaN is False, which would otherwise
                # fall through to 'strong'.
                return 'undefined'
            abs_r = abs(r)
            if abs_r < 0.3:
                return 'weak'
            elif abs_r < 0.7:
                return 'moderate'
            else:
                return 'strong'

        return {
            'pearson_correlation': pearson_r,
            'pearson_p_value': pearson_p,
            'spearman_correlation': spearman_r,
            'spearman_p_value': spearman_p,
            'pearson_interpretation': interpret_correlation(pearson_r),
            'spearman_interpretation': interpret_correlation(spearman_r)
        }
print("✅ Statistical analysis framework implemented")

## Step 5: Error Analysis Framework

In [None]:
class ErrorAnalysis:
    """Framework for categorizing and analyzing evaluation failures.

    A single evaluation result can land in several categories at once (for
    example both ``low_relevance`` and ``insufficient_detail``).
    """

    # Error categories and their human-readable descriptions
    ERROR_CATEGORIES = {
        'generation_failure': 'Model failed to generate response',
        'low_relevance': 'Generated review not relevant to code',
        'sentiment_mismatch': 'Incorrect sentiment classification',
        'missing_issues': 'Failed to identify obvious problems',
        'false_positives': 'Identified non-existent issues',
        'insufficient_detail': 'Review too brief or vague',
        'technical_inaccuracy': 'Technically incorrect suggestions'
    }

    @classmethod
    def categorize_errors(cls, evaluation_results: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
        """Sort evaluation results into error categories.

        Args:
            evaluation_results: Result dicts. The keys read here are
                ``success``, ``metrics`` (``bleu_score``, ``similarity``,
                ``sentiment_match``), ``predicted_review`` and ``code``;
                missing or ``None`` values are treated as absent.

        Returns:
            A dict with one list per key of ``ERROR_CATEGORIES`` holding the
            results assigned to that category. Failed generations are only
            placed in ``generation_failure``. ``technical_inaccuracy`` is
            never populated by these heuristics.
        """
        error_categories = {category: [] for category in cls.ERROR_CATEGORIES}

        for result in evaluation_results:
            if not result.get('success', True):
                error_categories['generation_failure'].append(result)
                continue

            # `or` guards against keys that are present but set to None.
            metrics = result.get('metrics') or {}
            predicted_review = result.get('predicted_review') or ''
            code = result.get('code') or ''
            review_lower = predicted_review.lower()

            # Low relevance (low BLEU/similarity scores)
            if metrics.get('bleu_score', 0) < 0.1 or metrics.get('similarity', 0) < 0.1:
                error_categories['low_relevance'].append(result)

            # Sentiment mismatch
            if metrics.get('sentiment_match', 1) == 0:
                error_categories['sentiment_mismatch'].append(result)

            # Insufficient detail (very short reviews)
            if len(predicted_review.split()) < 5:
                error_categories['insufficient_detail'].append(result)

            # Check for missing security issues
            vulnerabilities = SecurityMetrics.detect_vulnerabilities(code)
            if vulnerabilities and not any(keyword in review_lower
                                           for keyword in SecurityMetrics.SECURITY_KEYWORDS['negative']):
                error_categories['missing_issues'].append(result)

            # Check for false positives (claiming issues that don't exist)
            if not vulnerabilities and any(keyword in review_lower
                                           for keyword in ['bug', 'error', 'wrong', 'issue', 'problem']):
                error_categories['false_positives'].append(result)

        return error_categories

    @classmethod
    def analyze_error_patterns(cls, error_categories: Dict[str, List[Dict[str, Any]]]) -> Dict[str, Any]:
        """Summarize how errors are distributed across categories.

        Args:
            error_categories: Output of ``categorize_errors``.

        Returns:
            ``{'message': 'No errors to analyze'}`` when every category is
            empty. Otherwise a dict with ``total_error_samples`` (sum of all
            category sizes, so a result in two categories counts twice),
            ``error_rates`` (each category's share of that total),
            ``most_common_errors`` (top three ``(category, count)`` pairs),
            ``error_descriptions`` and one ``<category>_example`` entry for
            each non-empty top category.
        """
        total_samples = sum(len(errors) for errors in error_categories.values())

        if total_samples == 0:
            return {'message': 'No errors to analyze'}

        # Calculate error rates
        error_rates = {
            f'{category}_rate': len(errors) / total_samples
            for category, errors in error_categories.items()
        }

        # Identify most common error types
        error_counts = {cat: len(errors) for cat, errors in error_categories.items()}
        most_common_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)[:3]

        analysis = {
            'total_error_samples': total_samples,
            'error_rates': error_rates,
            'most_common_errors': most_common_errors,
            'error_descriptions': cls.ERROR_CATEGORIES
        }

        # Sample problematic cases for each major error type
        preview_limit = 100
        for category, _ in most_common_errors:
            if error_categories[category]:
                sample_error = error_categories[category][0]
                code = sample_error.get('code') or ''
                # Only mark the preview as truncated when something was cut.
                preview = code[:preview_limit] + '...' if len(code) > preview_limit else code
                analysis[f'{category}_example'] = {
                    'code_preview': preview,
                    'predicted_review': sample_error.get('predicted_review', ''),
                    'reference_review': sample_error.get('reference_review', '')
                }

        return analysis

    @classmethod
    def generate_improvement_recommendations(cls, error_analysis: Dict[str, Any]) -> List[str]:
        """Turn error rates into human-readable recommendations.

        Args:
            error_analysis: Output of ``analyze_error_patterns``; only its
                ``error_rates`` entry is read.

        Returns:
            One recommendation per category whose rate exceeds its threshold,
            or a single "performance is good" message when none does.
        """
        recommendations = []

        error_rates = error_analysis.get('error_rates', {})

        # Generation failure recommendations
        if error_rates.get('generation_failure_rate', 0) > 0.1:
            recommendations.append(
                "High generation failure rate detected. Consider: "
                "1) Checking API connectivity, 2) Reducing input complexity, "
                "3) Adding retry logic with exponential backoff"
            )

        # Relevance issues
        if error_rates.get('low_relevance_rate', 0) > 0.2:
            recommendations.append(
                "Low relevance scores indicate prompt engineering issues. Consider: "
                "1) Improving prompt clarity, 2) Adding more context, "
                "3) Fine-tuning model on domain-specific data"
            )

        # Sentiment classification issues
        if error_rates.get('sentiment_mismatch_rate', 0) > 0.3:
            recommendations.append(
                "Sentiment classification needs improvement. Consider: "
                "1) Better sentiment extraction logic, 2) Training sentiment classifier, "
                "3) Using explicit sentiment indicators in prompts"
            )

        # Missing issues
        if error_rates.get('missing_issues_rate', 0) > 0.15:
            recommendations.append(
                "Model missing obvious code issues. Consider: "
                "1) Adding code analysis tools to context, 2) Prompt engineering for thoroughness, "
                "3) Multi-step evaluation process"
            )

        # False positives
        if error_rates.get('false_positives_rate', 0) > 0.1:
            recommendations.append(
                "High false positive rate detected. Consider: "
                "1) More conservative review prompts, 2) Adding code validation step, "
                "3) Training model to be more precise"
            )

        if not recommendations:
            recommendations.append("Overall performance is good. Focus on fine-tuning for specific use cases.")

        return recommendations

print("✅ Error analysis framework implemented")

## Step 6: Multi-Model Comparison Framework

In [None]:
class ModelComparison:
    """Framework for comparing multiple models with statistical significance testing.

    Results are registered per model with ``add_model_results``; the other
    methods derive metric lists, pairwise tests, rankings and textual
    insights from whatever has been registered.
    """

    # Metrics for which a smaller value is the better one; every other
    # metric is ranked with larger values first.
    LOWER_IS_BETTER_METRICS = frozenset({'generation_time'})

    def __init__(self):
        # model name -> list of evaluation result dicts
        self.models_data = {}
        self.statistical_analysis = StatisticalAnalysis()

    def add_model_results(self, model_name: str, evaluation_results: List[Dict[str, Any]]):
        """Register (or replace) the evaluation results for ``model_name``."""
        self.models_data[model_name] = evaluation_results
        logger.info("Added results for %s: %d samples", model_name, len(evaluation_results))

    def extract_metrics_by_model(self) -> Dict[str, Dict[str, List[float]]]:
        """Collect per-sample metric lists for every registered model.

        Returns:
            ``{model_name: {metric_name: [values...]}}`` with the metrics
            ``bleu_score``, ``similarity``, ``sentiment_match`` and
            ``generation_time`` taken from successful results only (missing
            values count as 0). ``success_rate`` holds the model's overall
            success fraction repeated once per successful result.
        """
        model_metrics = {}

        for model_name, results in self.models_data.items():
            metrics = {
                'bleu_score': [],
                'similarity': [],
                'sentiment_match': [],
                'generation_time': [],
                'success_rate': []
            }

            successful_results = [r for r in results if r.get('success', True)]

            for result in successful_results:
                result_metrics = result.get('metrics', {})
                metrics['bleu_score'].append(result_metrics.get('bleu_score', 0))
                metrics['similarity'].append(result_metrics.get('similarity', 0))
                metrics['sentiment_match'].append(result_metrics.get('sentiment_match', 0))
                metrics['generation_time'].append(result.get('generation_time', 0))

            # Calculate success rate
            success_rate = len(successful_results) / len(results) if results else 0
            metrics['success_rate'] = [success_rate] * len(successful_results)

            model_metrics[model_name] = metrics

        return model_metrics

    def pairwise_comparison(self, metric_name: str) -> Dict[str, Dict[str, Any]]:
        """Perform pairwise statistical comparisons between models.

        Args:
            metric_name: Metric to compare (a key produced by
                ``extract_metrics_by_model``).

        Returns:
            ``{'error': ...}`` when fewer than two models are registered.
            Otherwise ``{"<model1>_vs_<model2>": comparison}`` for every pair
            that has data for the metric, where each comparison is the
            ``compare_metrics`` result extended with ``corrected_p_value``
            and ``significant_after_correction`` (corrected p < 0.05).
        """
        model_metrics = self.extract_metrics_by_model()
        model_names = list(model_metrics.keys())

        if len(model_names) < 2:
            return {'error': 'Need at least 2 models for comparison'}

        comparisons = {}
        p_values = []

        # Perform all pairwise comparisons
        for i, model1 in enumerate(model_names):
            for model2 in model_names[i + 1:]:
                data1 = model_metrics[model1].get(metric_name, [])
                data2 = model_metrics[model2].get(metric_name, [])

                if not data1 or not data2:
                    continue

                comparison_result = self.statistical_analysis.compare_metrics(data1, data2)

                comparisons[f"{model1}_vs_{model2}"] = comparison_result
                # A comparison whose test failed carries no p-value; treat it
                # as maximally non-significant.
                p_values.append(comparison_result.get('p_value', 1.0))

        # Apply multiple comparisons correction
        corrected_p_values = self.statistical_analysis.multiple_comparisons_correction(p_values)

        # p_values was filled in the same order as comparisons, so the
        # corrected values line up with the dict's insertion order.
        for comparison, corrected in zip(comparisons.values(), corrected_p_values):
            comparison['corrected_p_value'] = corrected
            comparison['significant_after_correction'] = corrected < 0.05

        return comparisons

    def rank_models(self, metrics: Optional[List[str]] = None) -> Dict[str, Any]:
        """Rank models across multiple metrics.

        Args:
            metrics: Metric names to rank on. ``None`` or an empty list
                selects ``bleu_score``, ``similarity`` and
                ``sentiment_match``.

        Returns:
            ``{'error': ...}`` when fewer than two models are registered.
            Otherwise a dict with ``metric_rankings`` (per metric: models
            from best to worst and their mean scores), ``overall_ranking``
            (models ordered by average rank), ``overall_scores`` (that
            average rank, 1 is best) and ``model_performance`` (mean score
            per model and metric).
        """
        if not metrics:
            metrics = ['bleu_score', 'similarity', 'sentiment_match']

        model_metrics = self.extract_metrics_by_model()
        model_names = list(model_metrics.keys())

        if len(model_names) < 2:
            return {'error': 'Need at least 2 models for ranking'}

        # Calculate mean performance for each metric
        model_performance = {}
        for model_name in model_names:
            performance = {}
            for metric in metrics:
                metric_values = model_metrics[model_name].get(metric, [])
                performance[metric] = np.mean(metric_values) if metric_values else 0
            model_performance[model_name] = performance

        # Rank models for each metric, best first
        metric_rankings = {}
        for metric in metrics:
            higher_is_better = metric not in self.LOWER_IS_BETTER_METRICS
            sorted_models = sorted(model_names,
                                   key=lambda m: model_performance[m][metric],
                                   reverse=higher_is_better)
            metric_rankings[metric] = {
                'ranking': sorted_models,
                'scores': {model: model_performance[model][metric] for model in sorted_models}
            }

        # Calculate overall ranking (average rank across metrics)
        overall_ranks = {model: 0 for model in model_names}
        for metric in metrics:
            for position, model in enumerate(metric_rankings[metric]['ranking'], start=1):
                overall_ranks[model] += position

        # Average the ranks
        for model in overall_ranks:
            overall_ranks[model] /= len(metrics)

        # Sort by overall rank
        overall_ranking = sorted(model_names, key=lambda m: overall_ranks[m])

        return {
            'metric_rankings': metric_rankings,
            'overall_ranking': overall_ranking,
            'overall_scores': overall_ranks,
            'model_performance': model_performance
        }

    def generate_comparison_insights(self, rankings: Dict[str, Any],
                                   comparisons: Dict[str, Dict[str, Any]]) -> List[str]:
        """Generate human-readable insights from a model comparison.

        Args:
            rankings: Output of ``rank_models`` (an ``{'error': ...}`` dict
                is tolerated and simply contributes no ranking insights).
            comparisons: Output of ``pairwise_comparison`` (an
                ``{'error': ...}`` dict is tolerated as well).

        Returns:
            A list of insight sentences covering the best/worst overall
            model, the best model per metric, statistically significant
            pairs, and models whose mean scores are all above 0.7 or all
            below 0.3.
        """
        insights = []

        # Overall performance insights
        overall_ranking = rankings.get('overall_ranking', [])
        if overall_ranking:
            best_model = overall_ranking[0]
            worst_model = overall_ranking[-1] if len(overall_ranking) > 1 else None

            insights.append(f"Best overall performer: {best_model}")
            if worst_model and worst_model != best_model:
                insights.append(f"Lowest overall performer: {worst_model}")

        # Metric-specific insights
        metric_rankings = rankings.get('metric_rankings', {})
        for metric, ranking_data in metric_rankings.items():
            top_model = ranking_data['ranking'][0]
            top_score = ranking_data['scores'][top_model]

            insights.append(f"Best {metric}: {top_model} (score: {top_score:.3f})")

        # Statistical significance insights
        significant_differences = []
        for comparison_name, comp_data in comparisons.items():
            # pairwise_comparison may return {'error': '<message>'}; such
            # string values carry no comparison data.
            if not isinstance(comp_data, dict):
                continue
            if comp_data.get('significant_after_correction', False):
                effect_size = comp_data.get('effect_interpretation', 'unknown')
                significant_differences.append(f"{comparison_name} ({effect_size} effect)")

        if significant_differences:
            insights.append(f"Statistically significant differences found in: {', '.join(significant_differences)}")
        else:
            insights.append("No statistically significant differences found after correction")

        # Performance consistency insights
        model_performance = rankings.get('model_performance', {})
        if len(model_performance) > 1:
            # Check for models that are consistently good/bad
            consistent_performers = []
            for model, performance in model_performance.items():
                scores = list(performance.values())
                if not scores:
                    # all() over no scores is vacuously True; skip instead of
                    # labelling a model with no data as consistent.
                    continue
                if all(score > 0.7 for score in scores):
                    consistent_performers.append(f"{model} (consistently high)")
                elif all(score < 0.3 for score in scores):
                    consistent_performers.append(f"{model} (consistently low)")

            if consistent_performers:
                insights.append(f"Consistent performers: {', '.join(consistent_performers)}")

        return insights

print("✅ Multi-model comparison framework implemented")

## Step 7: Generate Sample Data and Run Advanced Analysis

In [None]:
# Generate sample evaluation results if Phase 2 data not available
def generate_sample_evaluation_results(model_name: str, n_samples: int = 30) -> List[Dict[str, Any]]:
    """Generate deterministic synthetic evaluation results for one model.

    Randomness comes from a private generator seeded from ``model_name``, so
    repeated calls for the same model return identical data, different models
    get independent noise, and NumPy's global random state is never touched.

    Args:
        model_name: Name stored in every record. 'gpt-4', 'gpt-3.5' and
            'claude-2' have dedicated quality profiles; any other name uses a
            generic fallback profile.
        n_samples: Number of records to produce. Values <= 0 give ``[]``.

    Returns:
        A list of ``n_samples`` dicts with the keys ``sample_id``, ``code``,
        ``predicted_review``, ``reference_review``, ``predicted_sentiment``,
        ``reference_sentiment``, ``metrics``, ``generation_time``,
        ``model_name`` and ``success``. Failed records carry an empty
        predicted review and all-zero metrics.
    """
    # Model performance characteristics
    model_profiles = {
        'gpt-4': {'base_quality': 0.8, 'variance': 0.1, 'success_rate': 0.95},
        'gpt-3.5': {'base_quality': 0.65, 'variance': 0.15, 'success_rate': 0.90},
        'claude-2': {'base_quality': 0.75, 'variance': 0.12, 'success_rate': 0.92}
    }
    profile = model_profiles.get(model_name, {'base_quality': 0.7, 'variance': 0.15, 'success_rate': 0.88})
    base_score = profile['base_quality']
    # 'variance' is handed to normal() as its scale, i.e. a standard deviation
    spread = profile['variance']

    # Local generator instead of np.random.seed(42): reproducible across runs
    # (the seed is derived from the name's bytes, not from hash()), distinct
    # per model, and free of side effects on the global RNG.
    rng = np.random.default_rng([42, *str(model_name).encode('utf-8')])

    # Constant fixtures, built once rather than on every iteration
    sample_codes = [
        "def add(a, b):\n    return a + b",
        "def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n-1)",
        "def bubble_sort(arr):\n    n = len(arr)\n    for i in range(n):\n        for j in range(0, n-i-1):\n            if arr[j] > arr[j+1]:\n                arr[j], arr[j+1] = arr[j+1], arr[j]\n    return arr"
    ]
    sample_reviews = [
        "Simple addition function. Consider adding type hints for better code clarity.",
        "Recursive factorial implementation. Could benefit from input validation for negative numbers.",
        "Bubble sort implementation with O(n²) complexity. Consider using built-in sort for production."
    ]
    sentiments = ['positive', 'neutral', 'negative']

    results = []
    for i in range(n_samples):
        # Determine if this sample succeeds
        if rng.random() >= profile['success_rate']:
            # Failed generation
            results.append({
                'sample_id': i,
                'code': "def example(): pass",
                'predicted_review': "",
                'reference_review': "Example function.",
                'predicted_sentiment': 'neutral',
                'reference_sentiment': 'neutral',
                'metrics': {'bleu_score': 0, 'similarity': 0, 'sentiment_match': 0, 'review_length_ratio': 0},
                'generation_time': 0,
                'model_name': model_name,
                'success': False
            })
            continue

        # Scores are clamped to [0, 1]; float() strips NumPy scalar types
        bleu_score = float(min(1.0, max(0.0, rng.normal(base_score, spread))))
        similarity = float(min(1.0, max(0.0, rng.normal(base_score * 0.9, spread))))

        # The prediction agrees with the reference with probability
        # base_score; sentiment_match is then derived from the two labels so
        # the metric can never contradict the labels stored next to it.
        reference_sentiment = str(rng.choice(sentiments, p=[0.3, 0.5, 0.2]))
        if rng.random() < base_score:
            predicted_sentiment = reference_sentiment
        else:
            other_labels = [label for label in sentiments if label != reference_sentiment]
            predicted_sentiment = str(rng.choice(other_labels))
        sentiment_match = 1.0 if predicted_sentiment == reference_sentiment else 0.0

        metrics = {
            'bleu_score': bleu_score,
            'similarity': similarity,
            'sentiment_match': sentiment_match,
            # A length ratio cannot be negative, so clamp the normal draw at 0
            'review_length_ratio': max(0.0, float(rng.normal(1.0, 0.3)))
        }

        # Cycle through the fixed code/review pairs
        code_idx = i % len(sample_codes)

        results.append({
            'sample_id': i,
            'code': sample_codes[code_idx],
            'predicted_review': sample_reviews[code_idx] + f" (Generated by {model_name})",
            'reference_review': sample_reviews[code_idx],
            'predicted_sentiment': predicted_sentiment,
            'reference_sentiment': reference_sentiment,
            'metrics': metrics,
            'generation_time': float(rng.exponential(2.0)),
            'model_name': model_name,
            'success': True
        })

    return results

# Assemble the per-model result sets consumed by the later analysis cells
if phase2_results:
    # Phase 2 output is available: keep it under 'phase2_model' and add
    # synthetic results for two more models so there is something to compare.
    sample_results = {
        'phase2_model': phase2_results.get('individual_results', [])
    }
    sample_results.update({
        baseline: generate_sample_evaluation_results(baseline, 20)
        for baseline in ('gpt-3.5', 'claude-2')
    })

    print(f"✅ Using Phase 2 results plus {len(sample_results)-1} additional models")
else:
    # No Phase 2 output: fall back to synthetic data for three models
    logger.info("Generating sample evaluation data for testing...")

    sample_results = {
        synthetic_model: generate_sample_evaluation_results(synthetic_model, 30)
        for synthetic_model in ('gpt-4', 'gpt-3.5', 'claude-2')
    }

    print(f"✅ Generated sample data for {len(sample_results)} models")

## Step 8: Run Advanced Metrics Analysis

In [None]:
# Instantiate one of each Phase 3 analysis component
security_metrics = SecurityMetrics()
style_metrics = StyleMetrics()
statistical_analysis = StatisticalAnalysis()
error_analysis = ErrorAnalysis()
model_comparison = ModelComparison()

# Register each model's evaluation results with the comparison framework
for model_name, results in sample_results.items():
    model_comparison.add_model_results(model_name, results)

print("\n🔍 Running Advanced Security Analysis...")
print("=" * 50)

# One summary row per model, built from at most its first 10 samples
security_analysis_results = []
for model_name, results in sample_results.items():
    model_security_results = []

    for result in results[:10]:
        # Records without a 'success' key are treated as successful
        if not result.get('success', True):
            continue

        code = result.get('code', '')
        predicted_review = result.get('predicted_review', '')

        # Vulnerabilities detected in the code under review
        vulnerabilities = security_metrics.detect_vulnerabilities(code)

        # Security focus of the generated review text
        security_review_analysis = security_metrics.analyze_security_review(predicted_review)

        # Detection metrics for the review against the detected vulnerabilities
        detection_metrics = security_metrics.vulnerability_detection_rate(predicted_review, vulnerabilities)

        model_security_results.append({
            'vulnerabilities_found': len(vulnerabilities),
            'security_review_analysis': security_review_analysis,
            'detection_metrics': detection_metrics
        })

    # Nothing to aggregate when none of the inspected samples succeeded
    if not model_security_results:
        continue

    # Per-model means over the inspected samples
    avg_vulnerabilities = np.mean([
        entry['vulnerabilities_found'] for entry in model_security_results
    ])
    avg_security_focus = np.mean([
        entry['security_review_analysis']['security_focus_ratio'] for entry in model_security_results
    ])
    avg_precision = np.mean([
        entry['detection_metrics']['security_precision'] for entry in model_security_results
    ])

    security_analysis_results.append({
        'model': model_name,
        'avg_vulnerabilities_per_sample': avg_vulnerabilities,
        'avg_security_focus_ratio': avg_security_focus,
        'avg_security_precision': avg_precision
    })

    print(f"{model_name:.<15} Security Focus: {avg_security_focus:.3f}, Precision: {avg_precision:.3f}")

print("\n🎨 Running Advanced Style Analysis...")
print("=" * 50)

# One summary row per model, built from at most its first 10 samples
style_analysis_results = []
for model_name, results in sample_results.items():
    model_style_results = []

    for result in results[:10]:
        # Records without a 'success' key are treated as successful
        if not result.get('success', True):
            continue

        code = result.get('code', '')
        predicted_review = result.get('predicted_review', '')

        # Style of the code under review
        code_style_analysis = style_metrics.analyze_code_style(code)

        # Quality focus of the generated review text
        quality_focus = style_metrics.analyze_review_quality_focus(predicted_review)

        # Improvement coverage of the review for this code
        improvement_coverage = style_metrics.style_improvement_coverage(code, predicted_review)

        model_style_results.append({
            'code_style_analysis': code_style_analysis,
            'quality_focus': quality_focus,
            'improvement_coverage': improvement_coverage
        })

    # Nothing to aggregate when none of the inspected samples succeeded
    if not model_style_results:
        continue

    # Per-model means; a missing 'style_awareness_ratio' counts as 0
    avg_readability_focus = np.mean([
        entry['quality_focus']['readability_focus'] for entry in model_style_results
    ])
    avg_maintainability_focus = np.mean([
        entry['quality_focus']['maintainability_focus'] for entry in model_style_results
    ])
    avg_style_awareness = np.mean([
        entry['improvement_coverage'].get('style_awareness_ratio', 0) for entry in model_style_results
    ])

    style_analysis_results.append({
        'model': model_name,
        'avg_readability_focus': avg_readability_focus,
        'avg_maintainability_focus': avg_maintainability_focus,
        'avg_style_awareness': avg_style_awareness
    })

    print(f"{model_name:.<15} Readability: {avg_readability_focus:.1f}, Maintainability: {avg_maintainability_focus:.1f}, Style Awareness: {avg_style_awareness:.3f}")

print("\n✅ Advanced metrics analysis complete!")

## Step 9: Statistical Analysis & Model Comparison

In [None]:
print("\n📊 Running Statistical Analysis...")
print("=" * 50)

# Metrics that are compared pairwise across the registered models
key_metrics = ['bleu_score', 'similarity', 'sentiment_match']
comparison_results = {}

for metric in key_metrics:
    print(f"\nAnalyzing {metric}:")
    comparisons = model_comparison.pairwise_comparison(metric)
    comparison_results[metric] = comparisons

    for comparison_name, result in comparisons.items():
        # Comparisons that reported an error are stored but not printed
        if 'error' in result:
            continue

        # Comparison names have the form '<model1>_vs_<model2>'
        model1, model2 = comparison_name.split('_vs_')
        mean1 = result['group1_mean']
        mean2 = result['group2_mean']
        p_value = result['p_value']
        effect_size = result['effect_interpretation']

        # Star notation for the p-value thresholds
        if p_value < 0.001:
            significance = "***"
        elif p_value < 0.01:
            significance = "**"
        elif p_value < 0.05:
            significance = "*"
        else:
            significance = "ns"

        print(f"  {model1} vs {model2}: {mean1:.3f} vs {mean2:.3f} (p={p_value:.3f} {significance}, {effect_size} effect)")

print("\n🏆 Model Rankings:")
print("=" * 50)

# Rank the models on the three key metrics
rankings = model_comparison.rank_models(['bleu_score', 'similarity', 'sentiment_match'])

# The ranking tables are printed only when no error was reported
if 'error' not in rankings:
    print("\nOverall Rankings:")
    overall_scores = rankings['overall_scores']
    for position, model in enumerate(rankings['overall_ranking'], start=1):
        print(f"  {position}. {model} (average rank: {overall_scores[model]:.2f})")

    print("\nMetric-specific Rankings:")
    for metric, ranking_data in rankings['metric_rankings'].items():
        print(f"\n{metric.upper()}:")
        metric_scores = ranking_data['scores']
        for position, model in enumerate(ranking_data['ranking'], start=1):
            print(f"  {position}. {model}: {metric_scores[model]:.3f}")

# Generate comparison insights
print("\n💡 Comparison Insights:")
print("=" * 50)

# Only the bleu_score comparisons are passed to the insight generator
bleu_comparisons = comparison_results.get('bleu_score', {})
insights = model_comparison.generate_comparison_insights(rankings, bleu_comparisons)
for position, insight in enumerate(insights, start=1):
    print(f"{position}. {insight}")

## Step 10: Error Analysis & Improvement Recommendations

In [None]:
print("\n🔍 Error Analysis:")
print("=" * 50)

# Error-pattern summary for each model, keyed by model name
model_error_analyses = {}

for model_name, results in sample_results.items():
    print(f"\nAnalyzing errors for {model_name}:")

    # Categorize the errors, then summarise the patterns across categories
    error_patterns = error_analysis.analyze_error_patterns(
        error_analysis.categorize_errors(results)
    )

    # A 'message' key means there is no breakdown to print for this model
    if 'message' not in error_patterns:
        print(f"  Total error samples: {error_patterns['total_error_samples']}")

        # Up to three of the most common error types, skipping zero counts
        error_rates = error_patterns['error_rates']
        for error_type, count in error_patterns['most_common_errors'][:3]:
            if count > 0:
                rate = error_rates.get(f'{error_type}_rate', 0)
                print(f"  - {error_type}: {count} samples ({rate:.1%})")

    model_error_analyses[model_name] = error_patterns

print("\n🎯 Improvement Recommendations:")
print("=" * 50)

# Recommendation list for each model, keyed by model name
all_recommendations = {}

for model_name, error_analysis_result in model_error_analyses.items():
    # Analyses that carry a 'message' key get no recommendations
    if 'message' in error_analysis_result:
        continue

    recommendations = error_analysis.generate_improvement_recommendations(error_analysis_result)
    all_recommendations[model_name] = recommendations

    print(f"\n{model_name.upper()}:")
    for number, rec in enumerate(recommendations, start=1):
        print(f"  {number}. {rec}")

print("\n🔗 Correlation Analysis:")
print("=" * 50)

# Per-model metric values collected by the comparison framework
model_metrics = model_comparison.extract_metrics_by_model()

if len(model_metrics) > 0:
    # Correlations are computed on the first model's data only
    first_model = next(iter(model_metrics.keys()))
    metrics_data = model_metrics[first_model]

    # Pairs of key metrics to correlate
    metric_pairs = [
        ('bleu_score', 'similarity'),
        ('bleu_score', 'sentiment_match'),
        ('similarity', 'sentiment_match')
    ]

    for metric1, metric2 in metric_pairs:
        # Skip pairs where either metric is missing for this model
        if not (metric1 in metrics_data and metric2 in metrics_data):
            continue

        data1 = metrics_data[metric1]
        data2 = metrics_data[metric2]

        # Both series must be non-empty and of equal length
        if not (data1 and data2 and len(data1) == len(data2)):
            continue

        correlation = statistical_analysis.correlation_analysis(data1, data2)
        if 'error' in correlation:
            continue

        pearson_r = correlation['pearson_correlation']
        pearson_p = correlation['pearson_p_value']
        interpretation = correlation['pearson_interpretation']

        # Star notation for the p-value thresholds
        if pearson_p < 0.001:
            significance = "***"
        elif pearson_p < 0.01:
            significance = "**"
        elif pearson_p < 0.05:
            significance = "*"
        else:
            significance = "ns"

        print(f"{metric1} ↔ {metric2}: r={pearson_r:.3f} (p={pearson_p:.3f} {significance}, {interpretation})")

## Step 11: Generate Comprehensive Report

In [None]:
# Compile comprehensive analysis report
# Take the timestamp once so the metadata date and the filename always agree
analysis_timestamp = datetime.now()

analysis_report = {
    'metadata': {
        'phase': 'Phase 3: Advanced Metrics & Analysis',
        'analysis_date': analysis_timestamp.isoformat(),
        'models_analyzed': list(sample_results.keys()),
        'total_samples_analyzed': sum(len(results) for results in sample_results.values())
    },
    'security_analysis': security_analysis_results,
    'style_analysis': style_analysis_results,
    'statistical_comparisons': comparison_results,
    'model_rankings': rankings,
    'error_analysis': model_error_analyses,
    'improvement_recommendations': all_recommendations,
    'insights': insights
}

# Save comprehensive report
report_filename = f'data/phase3_advanced_analysis_{analysis_timestamp.strftime("%Y%m%d_%H%M%S")}.json'

# The output directory may not exist yet; without this, open() raises
# FileNotFoundError and the whole analysis is lost.
os.makedirs(os.path.dirname(report_filename), exist_ok=True)

with open(report_filename, 'w', encoding='utf-8') as f:
    # default=str stringifies any value json cannot serialise natively
    json.dump(analysis_report, f, ensure_ascii=False, indent=2, default=str)

print(f"\n✅ Comprehensive analysis report saved to {report_filename}")

# Generate executive summary

# Guard against a missing *or empty* ranking instead of indexing blindly
best_overall_model = rankings['overall_ranking'][0] if rankings.get('overall_ranking') else 'N/A'

# NOTE(review): this counts comparisons flagged 'is_significant', while
# generate_comparison_insights reads 'significant_after_correction' from the
# same dicts — confirm which flag the summary is meant to report.
significant_difference_count = sum(
    1 for c in comparison_results.get('bleu_score', {}).values()
    if c.get('is_significant', False)
)

executive_summary = f"""
Phase 3: Advanced Metrics & Analysis - Executive Summary
======================================================

ANALYSIS OVERVIEW
-----------------
• Models Analyzed: {len(sample_results)}
• Total Samples: {sum(len(results) for results in sample_results.values())}
• Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

KEY FINDINGS
------------
• Best Overall Model: {best_overall_model}
• Most Significant Differences: {significant_difference_count} found
• Security Analysis: {len(security_analysis_results)} models evaluated for vulnerability detection
• Style Analysis: {len(style_analysis_results)} models evaluated for code quality focus

TOP INSIGHTS
------------
"""

# At most the first five insights, one bullet each
executive_summary += "".join(f"• {insight}\n" for insight in insights[:5])

executive_summary += """

RECOMMENDATIONS FOR NEXT STEPS
------------------------------
"""

# Aggregate top recommendations across all models
all_recs = []
for model_recs in all_recommendations.values():
    all_recs.extend(model_recs)

# The three recommendations shared by the most models. Counter.most_common
# orders ties by first occurrence, so the result is stable between runs,
# unlike slicing a set, whose order depends on string hash randomisation.
unique_recs = [rec for rec, _ in Counter(all_recs).most_common(3)]
for i, rec in enumerate(unique_recs, 1):
    executive_summary += f"{i}. {rec}\n"

executive_summary += """

NEXT PHASE PREPARATION
----------------------
• Ready for Phase 4: Advanced Visualization & Reporting
• Statistical significance tests completed
• Error patterns identified and categorized
• Model performance characteristics established
• Actionable insights generated for model improvement
"""

print(executive_summary)

# Save executive summary (create the output directory if it is missing)
os.makedirs('data', exist_ok=True)
with open('data/phase3_executive_summary.txt', 'w', encoding='utf-8') as f:
    f.write(executive_summary)

print("\n✅ Executive summary saved to data/phase3_executive_summary.txt")
print("\n🎉 Phase 3: Advanced Metrics & Analysis Complete!")

## Summary: Phase 3 Completed ✅

### What we accomplished:
1. **Security Metrics**: Vulnerability detection, false positive analysis, security review assessment
2. **Style Metrics**: Code quality analysis, readability assessment, improvement coverage
3. **Statistical Analysis**: Confidence intervals, significance testing, correlation analysis
4. **Error Analysis**: Failure categorization, pattern identification, improvement recommendations
5. **Multi-Model Comparison**: Statistical comparisons, rankings, effect size analysis
6. **Actionable Insights**: Data-driven recommendations for model improvement

### Advanced Components Created:
- `SecurityMetrics`: Comprehensive security vulnerability analysis
- `StyleMetrics`: Code quality and readability evaluation
- `StatisticalAnalysis`: Robust statistical testing framework
- `ErrorAnalysis`: Systematic error categorization and analysis
- `ModelComparison`: Multi-model statistical comparison with rankings

### Key Results Achieved:
- ✅ Domain-specific metrics implemented (security, style)
- ✅ Statistical significance analysis completed
- ✅ Error patterns identified and categorized
- ✅ Multi-model comparison with rankings
- ✅ Actionable insights generated from data analysis
- ✅ Comprehensive report with executive summary

### Ready for Phase 4:
Phase 3 provides the analytical foundation for Phase 4's advanced visualization and reporting capabilities. All statistical analyses, model comparisons, and insights are now available for comprehensive visualization and dashboard creation.