In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import time
import json
from typing import List, Dict, Any, Optional, Tuple, Callable
import re
import requests
from tqdm.auto import tqdm

class MultilingualRAGEvaluator:
    """
    Evaluator for multilingual RAG systems without ground truth.
    Supports evaluation in multiple languages for maternal healthcare domain.
    """
    
    def __init__(
        self,
        judge_llm: Any = None,
        embedding_model: str = "google/muril-base-cased",
        languages: Optional[List[str]] = None,
        rag_system: Any = None,
        non_rag_system: Any = None
    ):
        """
        Initialize the evaluator.

        Args:
            judge_llm: LLM used for evaluating responses (if None, only automated metrics are used)
            embedding_model: Name of the multilingual embedding model passed to SentenceTransformer
            languages: Languages to evaluate. Defaults to English, Hindi, Assamese and Hinglish
                when omitted or None.
            rag_system: RAG system to evaluate
            non_rag_system: Non-RAG system to compare against
        """
        # A list literal as the default would be created once and shared by every
        # instance; build the default per call and keep a private copy so later
        # changes to the caller's list cannot desynchronise it from self.results.
        if languages is None:
            languages = ["english", "hindi", "assamese", "hinglish"]

        self.judge_llm = judge_llm
        self.languages = list(languages)
        self.rag_system = rag_system
        self.non_rag_system = non_rag_system

        # Load embedding model for semantic evaluations
        print(f"Loading embedding model: {embedding_model}")
        self.embedder = SentenceTransformer(embedding_model)

        # Define medical keywords for different languages
        self.medical_keywords = self._load_medical_keywords()

        # One result bucket per language, filled in by evaluate_all()
        self.results = {lang: {} for lang in self.languages}
    
    def _load_medical_keywords(self) -> Dict[str, List[str]]:
        """Load medical keywords for each language."""
        # In a real implementation, load from a file or database
        # This is a minimal example with a few maternal healthcare terms
        return {
            "english": [
                "pregnancy", "prenatal", "postnatal", "birth", "labor", "delivery",
                "ultrasound", "trimester", "fetus", "breastfeeding", "complication",
                "eclampsia", "caesarean", "midwife", "obstetrician", "maternal"
            ],
            "hindi": [
                "गर्भावस्था", "प्रसवपूर्व", "प्रसवोत्तर", "जन्म", "प्रसव", "प्रसव",
                "अल्ट्रासाउंड", "तिमाही", "भ्रूण", "स्तनपान", "जटिलता",
                "एक्लेम्पसिया", "सिजेरियन", "दाई", "प्रसूति विशेषज्ञ", "मातृ"
            ],
            "assamese": [
                "গর্ভাৱস্থা", "প্ৰসৱ পূর্বৱর্তী", "প্ৰসৱৰ পিছত", "জন্ম", "প্ৰসৱ", "প্ৰসৱ",
                "আল্ট্ৰাছাউণ্ড", "ত্ৰৈমাসিক", "ভ্ৰূণ", "স্তন্যপান", "জটিলতা",
                "এক্লেম্পছিয়া", "ছিজেৰিয়ান", "ধাত্ৰী", "প্ৰসূতি বিশেষজ্ঞ", "মাতৃ"
            ],
            "hinglish": [
                "pregnancy", "prenatal", "postnatal", "birth", "labor", "delivery",
                "ultrasound", "trimester", "garbh", "breastfeeding", "complication",
                "eclampsia", "caesarean", "dai", "doctor", "maternity"
            ]
        }
    
    def evaluate_all(
        self,
        test_queries: Dict[str, List[str]],
        rag_answers: Optional[Dict[str, List[str]]] = None,
        non_rag_answers: Optional[Dict[str, List[str]]] = None,
        retrieval_contexts: Optional[Dict[str, List[str]]] = None
    ) -> Dict:
        """
        Run comprehensive evaluation for all languages.
        
        Args:
            test_queries: Dictionary of queries by language
            rag_answers: Pre-generated RAG answers (optional)
            non_rag_answers: Pre-generated non-RAG answers (optional)
            retrieval_contexts: Retrieved contexts for RAG (optional)
            
        Returns:
            Dictionary with evaluation results
        """
        # Generate answers if not provided
        if rag_answers is None and self.rag_system is not None:
            rag_answers = self._generate_answers(test_queries, self.rag_system, "rag")
            
        if non_rag_answers is None and self.non_rag_system is not None:
            non_rag_answers = self._generate_answers(test_queries, self.non_rag_system, "non_rag")
        
        # Get retrieval contexts if not provided
        if retrieval_contexts is None and self.rag_system is not None and hasattr(self.rag_system, "retrieve"):
            retrieval_contexts = self._get_retrieval_contexts(test_queries)
        
        # Run evaluations for each language
        for lang in self.languages:
            if lang not in test_queries:
                print(f"Skipping {lang} - no test queries provided")
                continue
                
            print(f"Evaluating {lang}...")
            
            # Evaluate RAG answers
            if rag_answers and lang in rag_answers:
                self.results[lang]["rag"] = self._evaluate_language_responses(
                    queries=test_queries[lang],
                    responses=rag_answers[lang],
                    system_type="rag",
                    language=lang,
                    contexts=retrieval_contexts.get(lang) if retrieval_contexts else None
                )
            
            # Evaluate non-RAG answers
            if non_rag_answers and lang in non_rag_answers:
                self.results[lang]["non_rag"] = self._evaluate_language_responses(
                    queries=test_queries[lang],
                    responses=non_rag_answers[lang],
                    system_type="non_rag",
                    language=lang
                )
                
            # Comparative analysis if both systems evaluated
            if "rag" in self.results[lang] and "non_rag" in self.results[lang]:
                self.results[lang]["comparative"] = self._compare_systems(
                    rag_results=self.results[lang]["rag"],
                    non_rag_results=self.results[lang]["non_rag"],
                    rag_responses=rag_answers[lang],
                    non_rag_responses=non_rag_answers[lang],
                    queries=test_queries[lang],
                    language=lang
                )
        
        # Cross-lingual analysis
        if len(self.languages) > 1:
            self._cross_lingual_analysis()
            
        return self.results
    
    def _generate_answers(self, test_queries, system, system_type):
        """Generate answers using the specified system."""
        answers = {}
        
        for lang in self.languages:
            if lang not in test_queries:
                continue
                
            print(f"Generating {system_type} answers for {lang}...")
            answers[lang] = []
            
            for query in tqdm(test_queries[lang]):
                try:
                    if system_type == "rag":
                        answer = system.answer(query, language=lang)
                    else:
                        answer = system.generate(query, language=lang)
                    answers[lang].append(answer)
                except Exception as e:
                    print(f"Error generating answer for '{query}': {e}")
                    answers[lang].append("")
        
        return answers
    
    def _get_retrieval_contexts(self, test_queries):
        """Get retrieval contexts for RAG system."""
        contexts = {}
        
        for lang in self.languages:
            if lang not in test_queries:
                continue
                
            print(f"Getting retrieval contexts for {lang}...")
            contexts[lang] = []
            
            for query in tqdm(test_queries[lang]):
                try:
                    # This method depends on your RAG system's API
                    retrieved_docs = self.rag_system.retrieve(query, language=lang)
                    contexts[lang].append(retrieved_docs)
                except Exception as e:
                    print(f"Error retrieving context for '{query}': {e}")
                    contexts[lang].append([])
        
        return contexts
    
    def _evaluate_language_responses(
        self,
        queries: List[str],
        responses: List[str],
        system_type: str,
        language: str,
        contexts: Optional[List[List[str]]] = None
    ) -> Dict:
        """
        Run every per-language metric over one system's responses.

        Args:
            queries: List of queries
            responses: List of responses
            system_type: 'rag' or 'non_rag'
            language: Language being evaluated
            contexts: Retrieved contexts (for RAG only)

        Returns:
            Dictionary mapping metric name to that metric's result dictionary
        """
        # Metrics that apply to any system, computed in this fixed order:
        # uncertainty handling, medical terminology, length/detail,
        # self-consistency and query relevance.
        results = {
            "uncertainty": self._evaluate_uncertainty(responses, language),
            "medical_terminology": self._evaluate_medical_terminology(responses, language),
            "response_detail": self._evaluate_response_detail(responses),
            "self_consistency": self._evaluate_self_consistency(responses),
            "query_relevance": self._evaluate_query_relevance(queries, responses),
        }

        # Context-dependent metrics only make sense for a RAG system that
        # actually has retrieved contexts.
        if system_type == "rag" and contexts:
            results["context_utilization"] = self._evaluate_context_utilization(responses, contexts)
            results["attribution"] = self._evaluate_attribution(responses)

        # Optional LLM-as-judge pass.
        if self.judge_llm:
            results["llm_evaluation"] = self._llm_judge_evaluation(
                queries, responses, language, system_type, contexts
            )

        return results
    
    def _evaluate_uncertainty(self, responses: List[str], language: str) -> Dict:
        """Evaluate appropriate expression of uncertainty in responses."""
        uncertainty_phrases = {
            "english": ["may", "might", "possible", "suggest", "consider", "consult", "uncertain", "not clear", "limited information"],
            "hindi": ["शायद", "हो सकता है", "संभव", "सुझाव", "विचार", "परामर्श", "अनिश्चित", "स्पष्ट नहीं", "सीमित जानकारी"],
            "assamese": ["হয়তো", "সম্ভৱ", "পৰামৰ্শ", "বিবেচনা", "অনিশ্চিত", "স্পষ্ট নহয়", "সীমিত তথ্য"],
            "hinglish": ["may", "might", "ho sakta hai", "possible", "suggest", "consider", "consult", "uncertain", "not clear", "limited information"]
        }
        
        lang_phrases = uncertainty_phrases.get(language, uncertainty_phrases["english"])
        
        scores = []
        for response in responses:
            # Count uncertainty phrases
            count = sum(1 for phrase in lang_phrases if phrase.lower() in response.lower())
            
            # Normalize by response length
            norm_count = count / max(1, len(response.split()))
            
            # We want some uncertainty but not too much
            # Score is highest at appropriate levels (e.g., 1-3 phrases in a medium response)
            if count == 0:
                # No uncertainty expressions at all
                score = 0.0
            elif norm_count > 0.1:
                # Too much uncertainty
                score = 0.5
            else:
                # Appropriate level of uncertainty
                score = 1.0
                
            scores.append(score)
            
        return {
            "scores": scores,
            "average": np.mean(scores),
            "appropriate_uncertainty_rate": sum(1 for s in scores if s > 0.7) / len(scores)
        }
    
    def _evaluate_medical_terminology(self, responses: List[str], language: str) -> Dict:
        """Evaluate use of appropriate medical terminology for maternal healthcare."""
        # Get medical keywords for this language
        keywords = self.medical_keywords.get(language, [])
        
        scores = []
        keyword_counts = []
        
        for response in responses:
            response_lower = response.lower()
            
            # Count medical keywords
            count = sum(1 for keyword in keywords if keyword.lower() in response_lower)
            keyword_counts.append(count)
            
            # Normalize by response length
            words = len(response.split())
            norm_count = count / max(1, words / 50)  # Expect ~1 keyword per 50 words
            
            # Score based on normalized count (more is better up to a point)
            score = min(1.0, norm_count)
            scores.append(score)
            
        return {
            "scores": scores,
            "average": np.mean(scores),
            "keyword_density": np.mean([count / max(1, len(resp.split())) for count, resp in zip(keyword_counts, responses)])
        }
    
    def _evaluate_response_detail(self, responses: List[str]) -> Dict:
        """Evaluate response length and detail level."""
        # Get word counts
        word_counts = [len(response.split()) for response in responses]
        
        # Score based on word count (more detailed is better, up to a point)
        detail_scores = []
        for count in word_counts:
            if count < 30:
                # Too brief
                score = count / 30
            elif count > 500:
                # Excessively long
                score = 500 / count
            else:
                # Appropriate length
                score = 1.0
                
            detail_scores.append(score)
            
        return {
            "word_counts": word_counts,
            "average_words": np.mean(word_counts),
            "detail_scores": detail_scores,
            "average_detail_score": np.mean(detail_scores)
        }
    
    def _evaluate_self_consistency(self, responses: List[str]) -> Dict:
        """Evaluate consistency between different responses."""
        if len(responses) < 2:
            return {"average_similarity": 0.0, "consistency_score": 0.0}
            
        # Generate embeddings for all responses
        embeddings = self.embedder.encode(responses)
        
        # Calculate pairwise cosine similarities
        similarities = cosine_similarity(embeddings)
        
        # Get average similarity (excluding self-similarity)
        np.fill_diagonal(similarities, 0)
        avg_similarity = similarities.sum() / (similarities.shape[0] * (similarities.shape[0] - 1))
        
        # Convert to a score (too low or too high similarity is bad)
        if avg_similarity < 0.3:
            # Too inconsistent
            consistency_score = avg_similarity / 0.3
        elif avg_similarity > 0.9:
            # Too similar (might be repetitive/templated)
            consistency_score = 1 - (avg_similarity - 0.9) * 10
        else:
            # Healthy diversity with consistency
            consistency_score = 1.0
            
        return {
            "average_similarity": float(avg_similarity),
            "consistency_score": float(consistency_score)
        }
    
    def _evaluate_query_relevance(self, queries: List[str], responses: List[str]) -> Dict:
        """Evaluate semantic relevance of responses to queries."""
        if len(queries) != len(responses):
            raise ValueError("Number of queries and responses must match")
            
        relevance_scores = []
        
        for query, response in zip(queries, responses):
            # Get embeddings
            query_embedding = self.embedder.encode([query])[0]
            response_embedding = self.embedder.encode([response])[0]
            
            # Calculate cosine similarity
            similarity = cosine_similarity([query_embedding], [response_embedding])[0][0]
            
            # Convert to a score (higher is better)
            relevance_scores.append(float(similarity))
            
        return {
            "relevance_scores": relevance_scores,
            "average_relevance": float(np.mean(relevance_scores))
        }
    
    def _evaluate_context_utilization(self, responses: List[str], contexts: List[List[str]]) -> Dict:
        """Evaluate how well responses utilize retrieved contexts."""
        if len(responses) != len(contexts):
            raise ValueError("Number of responses and contexts must match")
            
        utilization_scores = []
        
        for response, context_list in zip(responses, contexts):
            if not context_list:
                utilization_scores.append(0.0)
                continue
                
            # Combine all context documents
            combined_context = " ".join(context_list)
            
            # Get embeddings
            context_embedding = self.embedder.encode([combined_context])[0]
            response_embedding = self.embedder.encode([response])[0]
            
            # Calculate cosine similarity
            similarity = cosine_similarity([context_embedding], [response_embedding])[0][0]
            
            # Content overlap analysis (more sophisticated than embedding similarity)
            # Check if significant n-grams from context appear in response
            context_words = set(combined_context.lower().split())
            response_words = set(response.lower().split())
            word_overlap = len(context_words.intersection(response_words)) / max(1, len(context_words))
            
            # Combine metrics (embedding similarity and word overlap)
            utilization_score = (similarity + word_overlap) / 2
            utilization_scores.append(float(utilization_score))
            
        return {
            "utilization_scores": utilization_scores,
            "average_utilization": float(np.mean(utilization_scores))
        }
    
    def _evaluate_attribution(self, responses: List[str]) -> Dict:
        """Evaluate whether responses include source attributions."""
        attribution_patterns = [
            r"according to",
            r"based on",
            r"sources? (indicate|say|state)",
            r"research (shows|indicates)",
            r"study|studies",
            r"guidelines",
            r"reference",
            r"cited",
            r"source:"
        ]
        
        combined_pattern = "|".join(attribution_patterns)
        
        attribution_scores = []
        for response in responses:
            # Check for attribution phrases
            matches = re.findall(combined_pattern, response.lower())
            has_attribution = len(matches) > 0
            
            attribution_scores.append(1.0 if has_attribution else 0.0)
            
        return {
            "attribution_scores": attribution_scores,
            "attribution_rate": float(np.mean(attribution_scores))
        }
    
    def _llm_judge_evaluation(
        self,
        queries: List[str],
        responses: List[str],
        language: str,
        system_type: str,
        contexts: Optional[List[List[str]]] = None
    ) -> Dict:
        """Use an LLM to evaluate responses on various dimensions."""
        if not self.judge_llm:
            return {}
            
        evaluation_prompts = {
            "english": """
            You are a maternal healthcare expert evaluating an AI response to a user query.
            
            User Query: {query}
            
            AI Response: {response}
            
            Please evaluate this response on the following criteria (score 1-5, where 5 is best):
            
            1. Medical Accuracy: Is the medical information correct and up-to-date?
            2. Comprehensiveness: Does it address all aspects of the query?
            3. Clarity: Is the information presented clearly?
            4. Safety: Does it include appropriate cautions and avoid harmful advice?
            5. Cultural Sensitivity: Is it respectful of cultural practices around maternal health?
            
            For each score, provide a brief justification.
            
            Output your evaluation in JSON format:
            {{
                "medical_accuracy": {{
                    "score": <score>,
                    "justification": "<brief justification>"
                }},
                "comprehensiveness": {{
                    "score": <score>,
                    "justification": "<brief justification>"
                }},
                "clarity": {{
                    "score": <score>,
                    "justification": "<brief justification>"
                }},
                "safety": {{
                    "score": <score>,
                    "justification": "<brief justification>"
                }},
                "cultural_sensitivity": {{
                    "score": <score>,
                    "justification": "<brief justification>"
                }},
                "overall_score": <average of all scores>
            }}
            """
        }
        
        # Use English prompt for all languages to simplify (customize as needed)
        prompt_template = evaluation_prompts["english"]
        
        results = []
        for i, (query, response) in enumerate(zip(queries, responses)):
            # Add context information if available (for RAG)
            context_str = ""
            if contexts and i < len(contexts) and contexts[i]:
                context_str = "\n\nRetrieved Context: " + "\n".join(contexts[i])
            
            # Format the prompt
            prompt = prompt_template.format(
                query=query,
                response=response,
                context=context_str
            )
            
            try:
                # Get evaluation from judge LLM
                judge_response = self.judge_llm.generate(prompt)
                
                # Parse JSON response
                try:
                    evaluation = json.loads(judge_response)
                    results.append(evaluation)
                except json.JSONDecodeError:
                    print(f"Failed to parse judge response as JSON: {judge_response[:100]}...")
                    results.append({
                        "error": "Failed to parse response",
                        "overall_score": 0
                    })
            except Exception as e:
                print(f"Error getting evaluation from judge LLM: {e}")
                results.append({
                    "error": str(e),
                    "overall_score": 0
                })
        
        # Aggregate results
        aggregated = {
            "evaluations": results,
            "average_scores": {}
        }
        
        # Calculate average scores for each dimension
        dimensions = ["medical_accuracy", "comprehensiveness", "clarity", "safety", "cultural_sensitivity", "overall_score"]
        for dim in dimensions:
            scores = [r.get(dim, {}).get("score", 0) if isinstance(r.get(dim), dict) else r.get(dim, 0) for r in results]
            aggregated["average_scores"][dim] = float(np.mean([s for s in scores if s > 0]))
        
        return aggregated
    
    def _compare_systems(
        self,
        rag_results: Dict,
        non_rag_results: Dict,
        rag_responses: List[str],
        non_rag_responses: List[str],
        queries: List[str],
        language: str
    ) -> Dict:
        """Compare RAG and non-RAG systems."""
        if len(rag_responses) != len(non_rag_responses):
            raise ValueError("Number of RAG and non-RAG responses must match")
            
        # Direct comparison of metrics
        metric_comparison = {}
        for metric in rag_results:
            if metric in non_rag_results and "average" in rag_results[metric] and "average" in non_rag_results[metric]:
                rag_score = rag_results[metric]["average"]
                non_rag_score = non_rag_results[metric]["average"]
                
                metric_comparison[metric] = {
                    "rag_score": rag_score,
                    "non_rag_score": non_rag_score,
                    "difference": rag_score - non_rag_score,
                    "winner": "rag" if rag_score > non_rag_score else "non_rag"
                }
        
        # Response similarity analysis
        similarities = []
        for rag, non_rag in zip(rag_responses, non_rag_responses):
            rag_embedding = self.embedder.encode([rag])[0]
            non_rag_embedding = self.embedder.encode([non_rag])[0]
            
            similarity = cosine_similarity([rag_embedding], [non_rag_embedding])[0][0]
            similarities.append(float(similarity))
        
        response_comparison = {
            "response_similarities": similarities,
            "average_similarity": float(np.mean(similarities)),
            "highly_similar_responses": sum(1 for s in similarities if s > 0.9),
            "highly_different_responses": sum(1 for s in similarities if s < 0.5)
        }
        
        # If LLM judge was used, compare those scores directly
        llm_eval_comparison = {}
        if "llm_evaluation" in rag_results and "llm_evaluation" in non_rag_results:
            rag_scores = rag_results["llm_evaluation"]["average_scores"]
            non_rag_scores = non_rag_results["llm_evaluation"]["average_scores"]
            
            for dimension in rag_scores:
                if dimension in non_rag_scores:
                    llm_eval_comparison[dimension] = {
                        "rag_score": rag_scores[dimension],
                        "non_rag_score": non_rag_scores[dimension],
                        "difference": rag_scores[dimension] - non_rag_scores[dimension],
                        "winner": "rag" if rag_scores[dimension] > non_rag_scores[dimension] else "non_rag"
                    }
        
        return {
            "metric_comparison": metric_comparison,
            "response_comparison": response_comparison,
            "llm_eval_comparison": llm_eval_comparison
        }
    
    def _cross_lingual_analysis(self) -> None:
        """Analyze performance across languages."""
        cross_lingual = {
            "rag": {},
            "non_rag": {}
        }
        
        for system_type in ["rag", "non_rag"]:
            # Collect metrics across languages
            metrics_by_language = {}
            
            for lang in self.languages:
                if system_type not in self.results[lang]:
                    continue
                    
                system_results = self.results[lang][system_type]
                
                # Extract key metrics
                lang_metrics = {}
                for metric, value in system_results.items():
                    if isinstance(value, dict) and "average" in value:
                        lang_metrics[metric] = value["average"]
                    elif isinstance(value, dict) and "average_scores" in value:
                        # LLM evaluation
                        for dim, score in value["average_scores"].items():
                            lang_metrics[f"llm_{dim}"] = score
                
                metrics_by_language[lang] = lang_metrics
            
            # Compare metrics across languages
            if metrics_by_language:
                for metric in list(metrics_by_language.values())[0].keys():
                    metric_values = {
                        lang: metrics[metric]
                        for lang, metrics in metrics_by_language.items()
                        if metric in metrics
                    }
                    
                    if metric_values:
                        # Find best and worst performing languages
                        best_lang = max(metric_values.items(), key=lambda x: x[1])[0]
                        worst_lang = min(metric_values.items(), key=lambda x: x[1])[0]
                        
                        cross_lingual[system_type][metric] = {
                            "values": metric_values,
                            "best_language": best_lang,
                            "worst_language": worst_lang,
                            "range": max(metric_values.values()) - min(metric_values.values())
                        }
        
        self.results["cross_lingual"] = cross_lingual
    
    def visualize_results(self, output_path: Optional[str] = None) -> None:
        """
        Visualize evaluation results.
        
        Args:
            output_path: Path to save visualizations (optional)
        """
        if not self.results:
            print("No results to visualize. Run evaluate_all() first.")
            return
            
        # Set up figure style
        plt.style.use('seaborn-v0_8-whitegrid')
        sns.set_palette("colorblind")
        
        # 1. Comparative metrics between RAG and non-RAG for each language
        for lang in self.languages:
            if "comparative" not in self.results[lang]:
                continue
                
            comparative = self.results[lang]["comparative"]["metric_comparison"]
            
            # Extract metrics and scores
            metrics = []
            rag_scores = []
            non_rag_scores = []
            
            for metric, values in comparative.items():
                metrics.append(metric)
                rag_scores.append(values["rag_score"])
                non_rag_scores.append(values["non_rag_score"])
            
            if not metrics:
                continue
                
            # Create figure
            fig, ax = plt.subplots(figsize=(12, 6))
            x = np.arange(len(metrics))
            width = 0.35
            
            ax.bar(x - width/2, rag_scores, width, label='RAG')
            ax.bar(x + width/2, non_rag_scores, width, label='Non-RAG')
            
            ax.set_ylabel('Score')
            ax.set_title(f'RAG vs Non-RAG Comparison - {lang.capitalize()}')
            ax.set_xticks(x)
            ax.set_xticklabels([m.replace('_', ' ').title() for m in metrics])
            ax.legend()
            
            plt.tight_layout()
            
            if output_path:
                plt.savefig(f"{output_path}/rag_vs_nonrag_{lang}.png")
            else:
                plt.show()
                
        # 2. Cross-lingual comparison for RAG
        if "cross_lingual" in self.results and "rag" in self.results["cross_lingual"]:
            cross_ling_rag = self.results["cross_lingual"]["rag"]
            
            # Select key metrics for visualization
            key_metrics = [
                m for m in cross_ling_rag.keys() 
                if m in ['query_relevance', 'medical_terminology', 'response_detail', 'llm_overall_score']
            ]
            
            for metric in key_metrics:
                if metric not in cross_ling_rag:
                    continue
                    
                metric_data = cross_ling_rag[metric]
                
                # Create figure
                fig, ax = plt.subplots(figsize=(10, 6))
                
                # Extract languages and values
                languages = list(metric_data["values"].keys())
                values = list(metric_data["values"].values())
                
                # Plot
                bars = ax.bar(languages, values, color=sns.color_palette("colorblind", len(languages)))
                
                # Highlight best and worst
                best_idx = languages.index(metric_data["best_language"])
                worst_idx = languages.index(metric_data["worst_language"])
                bars[best_idx].set_color('green')
                bars[worst_idx].set_color('red')
                
                ax.set_ylabel('Score')
                ax.set_title(f'Cross-Lingual Comparison - {metric.replace("_", " ").title()}')
                
                plt.tight_layout()
                
                if output_path:
                    plt.savefig(f"{output_path}/cross_lingual_{metric}.png")
                else:
                    plt.show()
        
        # 3. Heatmap for all metrics across languages
        if len(self.languages) > 1:
            for system_type in ["rag", "non_rag"]:
                # Collect all metrics
                all_metrics = {}
                all_languages = []
                
                for lang in self.languages:
                    if system_type in self.results[lang]:
                        all_languages.append(lang)
                        
                        for metric, values in self.results[lang][system_type].items():
                            if isinstance(values, dict) and "average" in values:
                                if metric not in all_metrics:
                                    all_metrics[metric] = []
                                all_metrics[metric].append(values["average"])
                            elif isinstance(values, dict) and "average_scores" in values:
                                # LLM evaluation
                                for dim, score in values["average_scores"].items():
                                    metric_name = f"llm_{dim}"
                                    if metric_name not in all_metrics:
                                        all_metrics[metric_name] = []
                                    all_metrics[metric_name].append(score)
                
                if not all_metrics or not all_languages:
                    continue
                    
                # Prepare data for heatmap
                data = []
                metric_names = []
                
                for metric, values in all_metrics.items():
                    if len(values) == len(all_languages):
                        data.append(values)
                        metric_names.append(metric.replace("_", " ").title())
                
                if not data:
                    continue
                    
                # Create heatmap
                fig, ax = plt.subplots(figsize=(12, 10))
                sns.heatmap(data, annot=True, fmt=".2f", xticklabels=[l.capitalize() for l in all_languages], 
                            yticklabels=metric_names, cmap="YlGnBu", ax=ax)
                
                ax.set_title(f'{system_type.upper()} Performance Across Languages')
                
                plt.tight_layout()
                
                if output_path:
                    plt.savefig(f"{output_path}/heatmap_{system_type}.png")
                else:
                    plt.show()
    
    def generate_report(self, output_path: Optional[str] = None) -> str:
        """
        Generate a detailed report of evaluation results.
        
        Args:
            output_path: Path to save report (optional)
            
        Returns:
            Report as string
        """
        if not self.results:
            return "No results available. Run evaluate_all() first."
            
        report = []
        report.append("# Multilingual Maternal Healthcare RAG Evaluation Report")
        report.append("\n## Overview")
        
        # Overall winner summary
        winners = {}
        for lang in self.languages:
            if "comparative" in self.results[lang]:
                # Count metric winners
                rag_wins = 0
                non_rag_wins = 0
                
                for metric, values in self.results[lang]["comparative"]["metric_comparison"].items():
                    if values["winner"] == "rag":
                        rag_wins += 1
                    else:
                        non_rag_wins += 1
                
                winners[lang] = {
                    "rag_wins": rag_wins,
                    "non_rag_wins": non_rag_wins,
                    "winner": "RAG" if rag_wins > non_rag_wins else "Non-RAG"
                }
        
        if winners:
            report.append("\n### System Performance by Language")
            for lang, results in winners.items():
                report.append(f"\n**{lang.capitalize()}**: {results['winner']} system performs better " +
                              f"({results['rag_wins']} vs {results['non_rag_wins']} metrics)")
        
        # Cross-lingual insights
        if "cross_lingual" in self.results:
            report.append("\n### Cross-Lingual Insights")
            
            for system_type in ["rag", "non_rag"]:
                if system_type in self.results["cross_lingual"]:
                    report.append(f"\n#### {system_type.upper()} System")
                    
                    cross_ling = self.results["cross_lingual"][system_type]
                    
                    # Find best performing language overall
                    lang_scores = {lang: 0 for lang in self.languages}
                    
                    for metric, data in cross_ling.items():
                        best_lang = data["best_language"]
                        lang_scores[best_lang] += 1
                    
                    best_lang_overall = max(lang_scores.items(), key=lambda x: x[1])[0]
                    
                    report.append(f"- Best overall performance: **{best_lang_overall.capitalize()}**")
                    
                    # Report on specific metrics
                    key_metrics = ['medical_terminology', 'query_relevance', 'response_detail']
                    for metric in key_metrics:
                        if metric in cross_ling:
                            data = cross_ling[metric]
                            report.append(f"- {metric.replace('_', ' ').title()}: Best in **{data['best_language'].capitalize()}**, " +
                                          f"worst in **{data['worst_language'].capitalize()}**")
        
        # Detailed per-language results
        for lang in self.languages:
            report.append(f"\n## {lang.capitalize()} Evaluation")
            
            if "rag" in self.results[lang]:
                report.append("\n### RAG System")
                self._add_system_results_to_report(report, self.results[lang]["rag"])
            
            if "non_rag" in self.results[lang]:
                report.append("\n### Non-RAG System")
                self._add_system_results_to_report(report, self.results[lang]["non_rag"])
            
            if "comparative" in self.results[lang]:
                report.append("\n### Comparative Analysis")
                
                report.append("\n#### Metric Comparison")
                for metric, values in self.results[lang]["comparative"]["metric_comparison"].items():
                    diff = values["difference"]
                    winner = values["winner"].upper()
                    report.append(f"- {metric.replace('_', ' ').title()}: **{winner}** is better by {abs(diff):.3f}")
                
                # Response similarity
                resp_comp = self.results[lang]["comparative"]["response_comparison"]
                report.append(f"\n#### Response Similarity")
                report.append(f"- Average similarity between RAG and non-RAG: {resp_comp['average_similarity']:.3f}")
                report.append(f"- Highly similar responses: {resp_comp['highly_similar_responses']}")
                report.append(f"- Highly different responses: {resp_comp['highly_different_responses']}")
                
                # LLM judge comparison if available
                if "llm_eval_comparison" in self.results[lang]["comparative"] and self.results[lang]["comparative"]["llm_eval_comparison"]:
                    report.append(f"\n#### Expert LLM Evaluation")
                    
                    llm_comp = self.results[lang]["comparative"]["llm_eval_comparison"]
                    for dimension, values in llm_comp.items():
                        diff = values["difference"]
                        winner = values["winner"].upper()
                        report.append(f"- {dimension.replace('_', ' ').title()}: **{winner}** is better by {abs(diff):.1f}/5.0")
        
        # Recommendations
        report.append("\n## Recommendations")
        
        # General recommendation based on overall performance
        rag_better_langs = [lang for lang, data in winners.items() if data["winner"] == "RAG"]
        non_rag_better_langs = [lang for lang, data in winners.items() if data["winner"] == "Non-RAG"]
        
        if len(rag_better_langs) > len(non_rag_better_langs):
            report.append("\n- RAG system performs better overall, especially for: " + 
                         ", ".join([l.capitalize() for l in rag_better_langs]))
            
            if non_rag_better_langs:
                report.append(f"- Consider using non-RAG for: " + 
                             ", ".join([l.capitalize() for l in non_rag_better_langs]))
        else:
            report.append("\n- Non-RAG system performs better overall, especially for: " + 
                         ", ".join([l.capitalize() for l in non_rag_better_langs]))
            
            if rag_better_langs:
                report.append(f"- Consider using RAG for: " + 
                             ", ".join([l.capitalize() for l in rag_better_langs]))
        
        # Specific recommendations for improvement
        report.append("\n### Areas for Improvement")
        
        # Identify common weaknesses
        if "cross_lingual" in self.results and "rag" in self.results["cross_lingual"]:
            cross_ling_rag = self.results["cross_lingual"]["rag"]
            
            # Find metrics with high variation across languages
            varying_metrics = []
            for metric, data in cross_ling_rag.items():
                if data["range"] > 0.2:  # Significant variation
                    varying_metrics.append((metric, data["range"], data["worst_language"]))
            
            varying_metrics.sort(key=lambda x: x[1], reverse=True)
            
            if varying_metrics:
                report.append("\n#### Consistency Issues")
                for metric, range_val, worst_lang in varying_metrics[:3]:  # Top 3 issues
                    report.append(f"- Improve {metric.replace('_', ' ').title()} for {worst_lang.capitalize()} " +
                                  f"(variation of {range_val:.2f} across languages)")
        
        # Full report
        full_report = "\n".join(report)
        
        # Save report if path is provided
        if output_path:
            with open(f"{output_path}/evaluation_report.md", "w", encoding="utf-8") as f:
                f.write(full_report)
        
        return full_report
    
    def _add_system_results_to_report(self, report: List[str], system_results: Dict) -> None:
        """Add system results to the report."""
        # Medical terminology
        if "medical_terminology" in system_results:
            terms = system_results["medical_terminology"]
            report.append(f"- Medical Terminology: {terms['average']:.3f} score, " +
                         f"{terms['keyword_density']:.4f} keyword density")
        
        # Response detail
        if "response_detail" in system_results:
            detail = system_results["response_detail"]
            report.append(f"- Response Detail: {detail['average_detail_score']:.3f} score, " +
                         f"{detail['average_words']:.1f} average words")
        
        # Query relevance
        if "query_relevance" in system_results:
            relevance = system_results["query_relevance"]
            report.append(f"- Query Relevance: {relevance['average_relevance']:.3f} score")
        
        # Context utilization (RAG only)
        if "context_utilization" in system_results:
            util = system_results["context_utilization"]
            report.append(f"- Context Utilization: {util['average_utilization']:.3f} score")
        
        # Attribution (RAG only)
        if "attribution" in system_results:
            attr = system_results["attribution"]
            report.append(f"- Source Attribution: {attr['attribution_rate']*100:.1f}% of responses")
        
        # LLM evaluation if available
        if "llm_evaluation" in system_results and "average_scores" in system_results["llm_evaluation"]:
            report.append("\n#### Expert LLM Evaluation")
            
            llm_scores = system_results["llm_evaluation"]["average_scores"]
            for dimension, score in llm_scores.items():
                report.append(f"- {dimension.replace('_', ' ').title()}: {score:.1f}/5.0")


# Function to demonstrate usage
def evaluate_maternal_rag_without_groundtruth(
    rag_system, 
    non_rag_system, 
    test_queries,
    judge_llm=None,
    output_path="./evaluation_results"
):
    """
    Run the complete ground-truth-free evaluation pipeline in one call.

    Creates the output directory, evaluates both systems on the given queries,
    then writes the visualizations and the report into that directory.

    Args:
        rag_system: RAG system to evaluate
        non_rag_system: Non-RAG system to compare against
        test_queries: Dictionary of test queries by language
        judge_llm: Optional LLM for judging responses
        output_path: Path to save evaluation results

    Returns:
        Evaluation results as returned by ``evaluate_all``
    """
    import os

    # The directory must exist before plots and the report are written into it
    os.makedirs(output_path, exist_ok=True)

    evaluator = MultilingualRAGEvaluator(
        judge_llm=judge_llm, rag_system=rag_system, non_rag_system=non_rag_system
    )

    evaluation = evaluator.evaluate_all(test_queries)

    # Both calls write their output files under output_path
    evaluator.visualize_results(output_path)
    evaluator.generate_report(output_path)
    print(f"Report saved to {output_path}/evaluation_report.md")

    return evaluation


# Example usage with dummy systems for demonstration
class DummyMaternityRAG:
    """Stand-in RAG system returning fixed text per language (the query is ignored)."""

    def answer(self, query, language="english"):
        """Return the canned answer for ``language``; unknown languages get the hinglish text."""
        canned_answers = (
            ("english", "Based on maternal healthcare guidelines, pregnant women should attend regular prenatal checkups. Research suggests these visits help monitor both maternal and fetal health."),
            ("hindi", "मातृत्व स्वास्थ्य दिशानिर्देशों के अनुसार, गर्भवती महिलाओं को नियमित प्रसवपूर्व जांच में जाना चाहिए। अनुसंधान से पता चलता है कि ये विज़िट मातृ और भ्रूण दोनों के स्वास्थ्य की निगरानी में मदद करते हैं।"),
            ("assamese", "মাতৃত্ব স্বাস্থ্যসেৱা নিৰ্দেশিকা অনুসৰি, গৰ্ভৱতী মহিলাসকলে নিয়মিত প্ৰসৱপূৰ্ব পৰীক্ষা কৰাব লাগে। গৱেষণাত দেখা গৈছে যে এই পৰিদৰ্শনবোৰে মাতৃ আৰু গৰ্ভস্থ শিশুৰ স্বাস্থ্য দুয়োটা পৰ্যবেক্ষণ কৰাত সহায় কৰে।"),
        )
        for known_language, text in canned_answers:
            if language == known_language:
                return text
        # Fallback: hinglish
        return "Maternal healthcare guidelines ke according, pregnant women ko regular prenatal checkups ke liye jana chahiye. Research batati hai ki ye visits maternal aur fetal health dono ko monitor karne mein help karte hain."

    def retrieve(self, query, language="english"):
        """Return the two canned context passages for ``language``; unknown languages get hinglish."""
        canned_passages = (
            ("english", ["Pregnant women should attend regular prenatal checkups.", "These visits help monitor maternal and fetal health."]),
            ("hindi", ["गर्भवती महिलाओं को नियमित प्रसवपूर्व जांच में जाना चाहिए।", "ये विज़िट मातृ और भ्रूण दोनों के स्वास्थ्य की निगरानी में मदद करते हैं।"]),
            ("assamese", ["গৰ্ভৱতী মহিলাসকলে নিয়মিত প্ৰসৱপূৰ্ব পৰীক্ষা কৰাব লাগে।", "এই পৰিদৰ্শনবোৰে মাতৃ আৰু গৰ্ভস্থ শিশুৰ স্বাস্থ্য দুয়োটা পৰ্যবেক্ষণ কৰাত সহায় কৰে।"]),
        )
        for known_language, passages in canned_passages:
            if language == known_language:
                return passages
        # Fallback: hinglish
        return ["Pregnant women ko regular prenatal checkups ke liye jana chahiye.", "Ye visits maternal aur fetal health dono ko monitor karne mein help karte hain."]


class DummyMaternityLLM:
    """Stand-in non-RAG LLM returning a fixed short answer per language (the query is ignored)."""

    def generate(self, query, language="english"):
        """Return the canned answer for ``language``; unknown languages get the hinglish text."""
        canned_answers = (
            ("english", "Pregnant women should go to regular checkups. These are important for health."),
            ("hindi", "गर्भवती महिलाओं को नियमित जांच के लिए जाना चाहिए। ये स्वास्थ्य के लिए महत्वपूर्ण हैं।"),
            ("assamese", "গৰ্ভৱতী মহিলাসকলে নিয়মিত পৰীক্ষাৰ বাবে যাব লাগে। এইবোৰ স্বাস্থ্যৰ বাবে গুৰুত্বপূৰ্ণ।"),
        )
        for known_language, text in canned_answers:
            if language == known_language:
                return text
        # Fallback: hinglish
        return "Pregnant women ko regular checkups ke liye jana chahiye. Ye health ke liye important hain."


def example_usage():
    """Evaluate the dummy RAG and non-RAG systems on a small multilingual query set.

    Returns:
        The results returned by ``evaluate_maternal_rag_without_groundtruth``.
    """
    # Three test queries for each evaluated language
    queries_by_language = dict(
        english=[
            "How often should pregnant women go for checkups?",
            "What nutrition is important during pregnancy?",
            "What are signs of pregnancy complications?",
        ],
        hindi=[
            "गर्भवती महिलाओं को कितनी बार जांच के लिए जाना चाहिए?",
            "गर्भावस्था के दौरान कौन सा पोषण महत्वपूर्ण है?",
            "गर्भावस्था की जटिलताओं के संकेत क्या हैं?",
        ],
        assamese=[
            "গৰ্ভৱতী মহিলাসকলে কিমান সঘনাই পৰীক্ষাৰ বাবে যাব লাগে?",
            "গৰ্ভাৱস্থাৰ সময়ত কোনটো পুষ্টি গুৰুত্বপূৰ্ণ?",
            "গৰ্ভাৱস্থাৰ জটিলতাৰ লক্ষণবোৰ কি কি?",
        ],
        hinglish=[
            "Pregnant women ko kitni baar checkups ke liye jana chahiye?",
            "Pregnancy ke dauran kaun sa nutrition important hai?",
            "Pregnancy complications ke signs kya hain?",
        ],
    )

    # Dummy systems stand in for a real RAG pipeline and a plain LLM
    return evaluate_maternal_rag_without_groundtruth(
        rag_system=DummyMaternityRAG(),
        non_rag_system=DummyMaternityLLM(),
        test_queries=queries_by_language,
        output_path="./dummy_evaluation_results",
    )


# Run the dummy-system demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    example_usage()

In [None]:
import pandas as pd
import numpy as np
import json
import os
from collections import defaultdict
from typing import List, Dict, Any, Optional

class HumanEvaluator:
    """
    Framework for human evaluation of maternal healthcare responses
    based on specified criteria: correctness, completeness, clarity, and cultural appropriateness.
    """
    
    def __init__(self, output_dir: str = "./human_evaluation"):
        """
        Initialize the human evaluator.
        
        Args:
            output_dir: Directory to save evaluation templates and results
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        
        # Define evaluation criteria
        self.criteria = {
            "correctness": {
                "description": "Is each claim in the response medically correct?",
                "scale": {
                    1: "all correct",
                    2: "partially correct",
                    3: "not correct"
                },
                "instructions": "Please firstly divide each response by claims, and then grade each claim individually."
            },
            "completeness": {
                "description": "Does the answer cover everything necessary, or should it include more information?",
                "scale": {
                    1: "covers everything the patient should know",
                    2: "covers the most important information but not everything",
                    3: "omits significant information"
                },
                "instructions": "If the answer is 2 or 3, please provide 1-2 missing information bullets that you think is most necessary."
            },
            "clarity": {
                "description": "Is this response clear enough for a user of average literacy to fully understand?",
                "scale": {
                    1: "completely understandable",
                    2: "acceptable but not completely clear",
                    3: "unacceptable"
                },
                "instructions": ""
            },
            "cultural_appropriateness": {
                "description": "Is this response appropriate for the cultural context of the user who entered the question?",
                "scale": {
                    1: "completely appropriate",
                    2: "acceptable but could be better",
                    3: "inappropriate"
                },
                "instructions": "The cultural background of each user considered here is solely judged from their input language."
            }
        }
    
    def generate_evaluation_template(self, model_responses: Dict, output_format: str = "csv") -> str:
        """
        Generate a template for human evaluators to fill out.
        
        Args:
            model_responses: Dictionary with model responses by language
            output_format: Format of the template ('csv' or 'excel')
            
        Returns:
            Path to the generated template
        """
        # Prepare data for template
        rows = []
        
        # Get all model names
        model_names = list(model_responses.keys())
        
        # Track which questions have been added
        added_questions = set()
        
        for model_name in model_names:
            for lang in model_responses[model_name]:
                questions = model_responses[model_name][lang]["questions"]
                answers = model_responses[model_name][lang]["answers"]
                
                for i, (question, answer) in enumerate(zip(questions, answers)):
                    # Create a unique identifier for this question
                    question_id = f"{lang}_{i}_{hash(question) % 10000}"
                    
                    if question_id not in added_questions:
                        rows.append({
                            "question_id": question_id,
                            "language": lang,
                            "question": question,
                            "model": model_name,
                            "answer": answer,
                            "correctness": "",
                            "correctness_notes": "",
                            "completeness": "",
                            "completeness_notes": "",
                            "clarity": "",
                            "clarity_notes": "",
                            "cultural_appropriateness": "",
                            "cultural_appropriateness_notes": ""
                        })
                        added_questions.add(question_id)
        
        # Convert to DataFrame
        df = pd.DataFrame(rows)
        
        # Save template
        if output_format == "csv":
            template_path = os.path.join(self.output_dir, "evaluation_template.csv")
            df.to_csv(template_path, index=False)
        else:  # excel
            template_path = os.path.join(self.output_dir, "evaluation_template.xlsx")
            df.to_excel(template_path, index=False)
        
        # Create instructions file
        instructions_path = os.path.join(self.output_dir, "evaluation_instructions.txt")
        with open(instructions_path, "w", encoding="utf-8") as f:
            f.write("# Human Evaluation Instructions for Maternal Healthcare Responses\n\n")
            
            for criterion, details in self.criteria.items():
                f.write(f"## {criterion.capitalize()}\n")
                f.write(f"Description: {details['description']}\n\n")
                f.write("Scale:\n")
                for score, desc in details['scale'].items():
                    f.write(f"- {score} = {desc}\n")
                if details['instructions']:
                    f.write(f"\nSpecial instructions: {details['instructions']}\n")
                f.write("\n")
        
        print(f"Evaluation template created at {template_path}")
        print(f"Evaluation instructions created at {instructions_path}")
        
        return template_path
    
    def parse_evaluation_results(self, results_path: str) -> Dict[str, Any]:
        """
        Parse the completed evaluation file and compute metrics.
        
        Args:
            results_path: Path to the completed evaluation file
            
        Returns:
            Dictionary with evaluation results
        """
        # Read evaluation results
        if results_path.endswith('.csv'):
            df = pd.read_csv(results_path)
        elif results_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(results_path)
        else:
            raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")
        
        # Validate required columns
        required_columns = [
            "model", "language", "correctness", "completeness", 
            "clarity", "cultural_appropriateness"
        ]
        
        for col in required_columns:
            if col not in df.columns:
                raise ValueError(f"Required column '{col}' not found in evaluation results.")
        
        # Convert ratings to numeric
        rating_columns = ["correctness", "completeness", "clarity", "cultural_appropriateness"]
        for col in rating_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        
        # Compute metrics
        results = {}
        
        # Metrics by model and language
        for model in df['model'].unique():
            model_data = df[df['model'] == model]
            
            if model not in results:
                results[model] = {
                    "overall": self._compute_metrics(model_data),
                    "by_language": {}
                }
            
            for lang in model_data['language'].unique():
                lang_data = model_data[model_data['language'] == lang]
                results[model]["by_language"][lang] = self._compute_metrics(lang_data)
        
        # Aggregate metrics by model type (RAG vs non-RAG)
        rag_models = [m for m in results.keys() if "_rag" in m]
        non_rag_models = [m for m in results.keys() if "_rag" not in m]
        
        results["aggregated"] = {
            "rag": self._aggregate_model_results([results[m] for m in rag_models]),
            "non_rag": self._aggregate_model_results([results[m] for m in non_rag_models])
        }
        
        # Save processed results
        results_json_path = os.path.join(self.output_dir, "processed_evaluation_results.json")
        with open(results_json_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        
        print(f"Processed evaluation results saved to {results_json_path}")
        
        return results
    
    def _compute_metrics(self, data: pd.DataFrame) -> Dict[str, Any]:
        """
        Compute metrics for a subset of evaluation data.
        
        Args:
            data: DataFrame with evaluation data
            
        Returns:
            Dictionary with computed metrics
        """
        metrics = {}
        
        # Rating metrics
        rating_columns = ["correctness", "completeness", "clarity", "cultural_appropriateness"]
        
        for col in rating_columns:
            col_data = data[col].dropna()
            
            if len(col_data) > 0:
                metrics[col] = {
                    "mean": float(col_data.mean()),
                    "median": float(col_data.median()),
                    "distribution": {
                        "1": int((col_data == 1).sum()),
                        "2": int((col_data == 2).sum()),
                        "3": int((col_data == 3).sum())
                    },
                    "count": int(len(col_data))
                }
            else:
                metrics[col] = {
                    "mean": None,
                    "median": None,
                    "distribution": {"1": 0, "2": 0, "3": 0},
                    "count": 0
                }
        
        # Overall score (lower is better)
        metrics["overall_score"] = np.mean([metrics[col]["mean"] for col in rating_columns 
                                          if metrics[col]["mean"] is not None])
        
        # Calculate percentages for each rating
        for col in rating_columns:
            total = sum(metrics[col]["distribution"].values())
            if total > 0:
                metrics[col]["percentage"] = {
                    score: (count / total) * 100 
                    for score, count in metrics[col]["distribution"].items()
                }
            else:
                metrics[col]["percentage"] = {score: 0 for score in ["1", "2", "3"]}
        
        return metrics
    
    def _aggregate_model_results(self, model_results: List[Dict]) -> Dict[str, Any]:
        """
        Aggregate results from multiple models.
        
        Args:
            model_results: List of model result dictionaries
            
        Returns:
            Dictionary with aggregated metrics
        """
        if not model_results:
            return {}
        
        aggregated = {
            "overall": {},
            "by_language": defaultdict(list)
        }
        
        # Collect metrics
        rating_columns = ["correctness", "completeness", "clarity", "cultural_appropriateness"]
        
        # Aggregate overall metrics
        for metric in rating_columns:
            values = []
            distribution = {"1": 0, "2": 0, "3": 0}
            
            for result in model_results:
                if "overall" in result and metric in result["overall"]:
                    metric_data = result["overall"][metric]
                    if metric_data["mean"] is not None:
                        values.append(metric_data["mean"])
                    
                    for score, count in metric_data["distribution"].items():
                        distribution[score] += count
            
            if values:
                aggregated["overall"][metric] = {
                    "mean": float(np.mean(values)),
                    "distribution": distribution,
                    "count": sum(distribution.values())
                }
                
                # Calculate percentages
                total = sum(distribution.values())
                if total > 0:
                    aggregated["overall"][metric]["percentage"] = {
                        score: (count / total) * 100 
                        for score, count in distribution.items()
                    }
                else:
                    aggregated["overall"][metric]["percentage"] = {score: 0 for score in ["1", "2", "3"]}
        
        # Aggregate language metrics
        languages = set()
        for result in model_results:
            if "by_language" in result:
                languages.update(result["by_language"].keys())
        
        for lang in languages:
            lang_metrics = {}
            
            for metric in rating_columns:
                values = []
                distribution = {"1": 0, "2": 0, "3": 0}
                
                for result in model_results:
                    if "by_language" in result and lang in result["by_language"] and metric in result["by_language"][lang]:
                        metric_data = result["by_language"][lang][metric]
                        if metric_data["mean"] is not None:
                            values.append(metric_data["mean"])
                        
                        for score, count in metric_data["distribution"].items():
                            distribution[score] += count
                
                if values:
                    lang_metrics[metric] = {
                        "mean": float(np.mean(values)),
                        "distribution": distribution,
                        "count": sum(distribution.values())
                    }
                    
                    # Calculate percentages
                    total = sum(distribution.values())
                    if total > 0:
                        lang_metrics[metric]["percentage"] = {
                            score: (count / total) * 100 
                            for score, count in distribution.items()
                        }
                    else:
                        lang_metrics[metric]["percentage"] = {score: 0 for score in ["1", "2", "3"]}
            
            if lang_metrics:
                # Calculate overall score for this language
                lang_metrics["overall_score"] = np.mean([lang_metrics[col]["mean"] for col in rating_columns 
                                                      if col in lang_metrics and lang_metrics[col]["mean"] is not None])
                aggregated["by_language"][lang] = lang_metrics
        
        # Calculate overall score
        if "overall" in aggregated and any(aggregated["overall"]):
            aggregated["overall"]["overall_score"] = np.mean([
                aggregated["overall"][col]["mean"] for col in rating_columns 
                if col in aggregated["overall"] and aggregated["overall"][col]["mean"] is not None
            ])
        
        return aggregated
    
    def generate_evaluation_report(self, evaluation_results: Dict[str, Any]) -> str:
        """
        Generate a human-readable Markdown report from evaluation results.

        The report has three sections: an overall RAG vs non-RAG comparison
        (only when both aggregates are present), a per-language comparison,
        and a per-model breakdown. It is written to
        ``<self.output_dir>/human_evaluation_report.md``.

        Args:
            evaluation_results: Results from parse_evaluation_results. Every key
                other than "aggregated" is treated as a model name.

        Returns:
            Path to the generated report
        """
        criteria = ("correctness", "completeness", "clarity", "cultural_appropriateness")

        # Resolve the aggregates once. Missing pieces become empty dicts so the
        # sections below simply skip themselves instead of raising KeyError when
        # only one of "rag" / "non_rag" was evaluated.
        aggregated = evaluation_results.get("aggregated") or {}
        rag_results = aggregated.get("rag") or {}
        non_rag_results = aggregated.get("non_rag") or {}

        report = []
        report.append("# Maternal Healthcare Response Evaluation Report")
        report.append("\n## Overview")

        # Add RAG vs non-RAG comparison if available
        if "overall" in rag_results and "overall" in non_rag_results:
            rag_score = rag_results["overall"].get("overall_score")
            non_rag_score = non_rag_results["overall"].get("overall_score")

            if rag_score is not None and non_rag_score is not None:
                # Scores run from 1 (best) to 3 (worst), so the lower mean wins.
                winner = "RAG" if rag_score < non_rag_score else "Non-RAG"
                diff = abs(rag_score - non_rag_score)

                report.append(f"\n### RAG vs Non-RAG Performance")
                report.append(f"\nOverall, the **{winner}** approach performed better with an average score difference of {diff:.2f}.")
                report.append(f"(Note: Lower scores are better; 1 = best, 3 = worst)")
                report.append(f"\n- RAG average score: {rag_score:.2f}")
                report.append(f"- Non-RAG average score: {non_rag_score:.2f}")

                # Add detailed metrics
                report.append("\n#### Detailed Metrics Comparison (RAG vs Non-RAG)")

                for criterion in criteria:
                    if criterion in rag_results["overall"] and criterion in non_rag_results["overall"]:
                        rag_mean = rag_results["overall"][criterion]["mean"]
                        non_rag_mean = non_rag_results["overall"][criterion]["mean"]
                        if rag_mean is None or non_rag_mean is None:
                            continue  # nothing to compare for this criterion

                        criterion_winner = "RAG" if rag_mean < non_rag_mean else "Non-RAG"
                        criterion_diff = abs(rag_mean - non_rag_mean)

                        report.append(f"\n**{criterion.capitalize()}**: **{criterion_winner}** is better by {criterion_diff:.2f}")
                        report.append(f"- RAG: {rag_mean:.2f}")
                        report.append(f"- Non-RAG: {non_rag_mean:.2f}")

        # Add language-specific analysis
        report.append("\n## Language-Specific Analysis")

        # Get all languages across all models
        languages = set()
        for model, data in evaluation_results.items():
            if model != "aggregated" and "by_language" in data:
                languages.update(data["by_language"].keys())

        rag_by_language = rag_results.get("by_language", {})
        non_rag_by_language = non_rag_results.get("by_language", {})

        for lang in sorted(languages):
            report.append(f"\n### {lang.capitalize()}")

            # Compare RAG vs non-RAG for this language (needs data on both sides)
            rag_lang_data = rag_by_language.get(lang, {})
            non_rag_lang_data = non_rag_by_language.get(lang, {})
            if not (rag_lang_data and non_rag_lang_data):
                continue

            rag_score = rag_lang_data.get("overall_score")
            non_rag_score = non_rag_lang_data.get("overall_score")
            if rag_score is None or non_rag_score is None:
                continue

            lang_winner = "RAG" if rag_score < non_rag_score else "Non-RAG"
            lang_diff = abs(rag_score - non_rag_score)

            report.append(f"\nFor {lang.capitalize()}, the **{lang_winner}** approach performed better with a score difference of {lang_diff:.2f}.")
            report.append(f"\n- RAG average score: {rag_score:.2f}")
            report.append(f"- Non-RAG average score: {non_rag_score:.2f}")

            # Add detailed metrics
            for criterion in criteria:
                if criterion in rag_lang_data and criterion in non_rag_lang_data:
                    rag_mean = rag_lang_data[criterion]["mean"]
                    non_rag_mean = non_rag_lang_data[criterion]["mean"]
                    if rag_mean is None or non_rag_mean is None:
                        continue

                    criterion_winner = "RAG" if rag_mean < non_rag_mean else "Non-RAG"
                    criterion_diff = abs(rag_mean - non_rag_mean)

                    report.append(f"\n**{criterion.capitalize()}**: **{criterion_winner}** is better by {criterion_diff:.2f}")

        # Add model-specific analysis
        report.append("\n## Model-Specific Analysis")

        for model, data in evaluation_results.items():
            if model != "aggregated" and "overall" in data:
                report.append(f"\n### {model}")

                overall_score = data["overall"].get("overall_score")
                if overall_score is not None:
                    report.append(f"\nOverall score: {overall_score:.2f}")

                # Add criterion breakdown
                for criterion in criteria:
                    if criterion in data["overall"]:
                        criterion_data = data["overall"][criterion]

                        # A criterion with no usable ratings carries mean=None;
                        # formatting None with ":.2f" would raise TypeError.
                        mean = criterion_data["mean"]
                        mean_text = "N/A" if mean is None else f"{mean:.2f}"
                        report.append(f"\n**{criterion.capitalize()}**: {mean_text}")

                        # Add distribution
                        if "percentage" in criterion_data:
                            report.append(f"- Score distribution:")
                            for score, percentage in criterion_data["percentage"].items():
                                report.append(f"  - {score}: {percentage:.1f}%")

        # Save report
        report_path = os.path.join(self.output_dir, "human_evaluation_report.md")
        with open(report_path, "w", encoding="utf-8") as f:
            f.write("\n".join(report))

        print(f"Evaluation report saved to {report_path}")

        return report_path


def integrate_human_evaluation(model_responses, languages=["english", "hindi", "assamese", "hinglish"]):
    """
    Build a human-evaluation template (Excel) for a set of model responses.

    Args:
        model_responses: Mapping of model name -> {language: responses}
        languages: Languages to keep; entries for any other language are dropped

    Returns:
        Whatever ``HumanEvaluator.generate_evaluation_template`` returns
        (used by callers as the path to the generated template)
    """
    evaluator = HumanEvaluator(output_dir="./human_evaluation")

    # Keep only the requested languages for every model.
    filtered_responses = {
        model: {lang: per_language[lang] for lang in per_language if lang in languages}
        for model, per_language in model_responses.items()
    }

    return evaluator.generate_evaluation_template(filtered_responses, output_format="excel")


def process_human_evaluation_results(results_path):
    """
    Parse a completed human-evaluation file and write the Markdown report.

    Args:
        results_path: Path to the completed evaluation file

    Returns:
        Dictionary with two keys: "results" (the parsed evaluation results)
        and "report_path" (value returned by generate_evaluation_report)
    """
    evaluator = HumanEvaluator(output_dir="./human_evaluation")

    # Parse first, then render the report from the parsed structure.
    parsed = evaluator.parse_evaluation_results(results_path)
    outcome = {"results": parsed}
    outcome["report_path"] = evaluator.generate_evaluation_report(parsed)
    return outcome

In [None]:
import json
import os
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Any, Tuple, Optional
from collections import defaultdict
import requests
from tqdm.auto import tqdm
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configure root logging once for this cell: INFO and above goes both to a
# log file in the current working directory and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler(filename="llm_judge_evaluation.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class LLMJudgeEvaluator:
    """
    Evaluator that uses different LLMs as judges to evaluate responses based on:
    - Correctness: Is each claim in the response medically correct?
    - Completeness: Does the answer cover everything necessary?
    - Clarity: Is this response clear for a user of average literacy?
    - Cultural Appropriateness: Is this response appropriate for the cultural context?
    """
    
    def __init__(
        self,
        output_dir: str = "./llm_judge_evaluation",
        parallel_requests: int = 5,
        cache_results: bool = True
    ):
        """
        Initialize the LLM Judge evaluator.
        
        Args:
            output_dir: Directory to save evaluation results
            parallel_requests: Number of parallel requests to make to LLM APIs
            cache_results: Whether to cache LLM judgments to avoid duplicate API calls
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        
        # Cache for LLM judgments
        self.cache_dir = os.path.join(output_dir, "cache")
        os.makedirs(self.cache_dir, exist_ok=True)
        self.cache_results = cache_results
        
        # Parallel processing settings
        self.parallel_requests = parallel_requests
        
        # Define the criteria
        self.criteria = [
            "correctness",
            "completeness",
            "clarity",
            "cultural_appropriateness"
        ]
        
        # LLM judges configuration
        self.judges = {
            "gpt-4": {
                "api_name": "openai",
                "model": "gpt-4-turbo-preview",
                "temperature": 0.2,
                "max_tokens": 1000,
                "system_prompt": "You are an expert in maternal healthcare with extensive medical knowledge. You will evaluate AI responses to maternal healthcare questions to determine if they are correct, complete, clear, and culturally appropriate."
            },
            "gemini-pro": {
                "api_name": "google",
                "model": "gemini-1.5-pro",
                "temperature": 0.2,
                "max_tokens": 1000,
                "system_prompt": "You are an expert in maternal healthcare with extensive medical knowledge. You will evaluate AI responses to maternal healthcare questions to determine if they are correct, complete, clear, and culturally appropriate."
            }
            # Add more judges as needed
        }
    
    def evaluate(
        self,
        json_files: Dict[str, str],
        judges_to_use: List[str] = ["gpt-4"],
        languages: List[str] = ["english", "hindi", "assamese", "hinglish"],
        sample_size: Optional[int] = None
    ) -> Dict:
        """
        Run every requested judge over every model/language and summarise.

        Args:
            json_files: Dictionary mapping model names to file paths
            judges_to_use: Judge names; each must be a key of ``self.judges``
            languages: Languages to evaluate
            sample_size: Number of samples to evaluate per language per model (None for all)

        Returns:
            Dictionary with keys "model_evaluations" (judge -> model -> language),
            "judge_comparisons", "language_comparisons" and
            "rag_vs_nonrag_comparisons"

        Raises:
            ValueError: If a requested judge is not configured
        """
        # Fail fast on the first judge that is not configured.
        for requested in judges_to_use:
            if requested in self.judges:
                continue
            raise ValueError(f"Judge {requested} not found. Available judges: {list(self.judges.keys())}")

        # Load responses, optionally down-sampling them in place.
        responses = self._load_responses(json_files, languages)
        if sample_size is not None:
            self._apply_sampling(responses, sample_size)

        model_evaluations = {}
        results = {
            "model_evaluations": model_evaluations,
            "judge_comparisons": {},
            "language_comparisons": {},
            "rag_vs_nonrag_comparisons": {}
        }

        # judge -> model -> language
        for judge_name in tqdm(judges_to_use, desc="Evaluating with judges"):
            judge_config = self.judges[judge_name]

            logging.info(f"Starting evaluation with judge: {judge_name}")

            per_model = {}
            model_evaluations[judge_name] = per_model

            for model_name, model_data in tqdm(responses.items(), desc=f"Processing models with {judge_name}"):
                per_language = {}
                per_model[model_name] = per_language

                for lang, lang_data in model_data.items():
                    # Languages with no questions or no answers are left out entirely.
                    if lang_data["questions"] and lang_data["answers"]:
                        logging.info(f"Evaluating {model_name} for {lang} with {judge_name}")

                        per_language[lang] = self._evaluate_model_language(
                            model_name=model_name,
                            lang=lang,
                            questions=lang_data["questions"],
                            responses=lang_data["answers"],
                            judge_name=judge_name,
                            judge_config=judge_config
                        )

        # Cross-judge agreement only makes sense with at least two judges.
        if len(judges_to_use) > 1:
            results["judge_comparisons"] = self._compare_judges(model_evaluations)

        results["language_comparisons"] = self._compare_languages(model_evaluations)
        results["rag_vs_nonrag_comparisons"] = self._compare_rag_vs_nonrag(model_evaluations)

        # Persist, then render the visual reports.
        self._save_results(results)
        self._generate_visual_reports(results)

        return results
    
    def _load_responses(self, json_files: Dict[str, str], languages: List[str]) -> Dict:
        """Load responses from JSON files."""
        model_responses = {}
        
        for model_name, file_path in json_files.items():
            logging.info(f"Loading responses from {file_path}...")
            
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    
                # Initialize containers for this model
                if model_name not in model_responses:
                    model_responses[model_name] = {lang: {"questions": [], "answers": []} for lang in languages}
                
                # Process each entry in the JSON
                for entry in data:
                    # Detect language from the entry
                    detected_lang = entry.get("detected_info", {}).get("language", "unknown")
                    
                    # Map detected language to our standard language categories
                    if detected_lang == "english":
                        lang = "english"
                    elif detected_lang == "hindi":
                        lang = "hindi"
                    elif detected_lang == "assamese":
                        lang = "assamese"
                    elif "hinglish" in detected_lang.lower() or (detected_lang == "english" and any(hindi_word in entry["question_original"].lower() for hindi_word in ["kya", "hai", "me", "ko", "ki"])):
                        lang = "hinglish"
                    else:
                        lang = "other"
                        
                    # Skip if not in our target languages
                    if lang not in languages:
                        continue
                    
                    # Add the question and answer to the appropriate container
                    model_responses[model_name][lang]["questions"].append(entry["question_original"])
                    model_responses[model_name][lang]["answers"].append(entry["answer"])
                    
                logging.info(f"Loaded responses for {model_name}")
                
            except Exception as e:
                logging.error(f"Error loading {file_path}: {e}")
        
        return model_responses
    
    def _apply_sampling(self, responses: Dict, sample_size: int):
        """Apply sampling to limit the number of evaluated examples."""
        for model_name, model_data in responses.items():
            for lang, lang_data in model_data.items():
                if len(lang_data["questions"]) > sample_size:
                    # Get random indices for sampling
                    indices = np.random.choice(
                        len(lang_data["questions"]), 
                        size=sample_size, 
                        replace=False
                    )
                    
                    # Apply sampling
                    lang_data["questions"] = [lang_data["questions"][i] for i in indices]
                    lang_data["answers"] = [lang_data["answers"][i] for i in indices]
                    
                    logging.info(f"Sampled {sample_size} examples for {model_name} in {lang}")
    
    def _evaluate_model_language(
        self,
        model_name: str,
        lang: str,
        questions: List[str],
        responses: List[str],
        judge_name: str,
        judge_config: Dict
    ) -> Dict:
        """
        Evaluate responses for a specific model and language using an LLM judge.

        Question/response pairs are judged concurrently on a thread pool of
        ``self.parallel_requests`` workers. Results are collected in completion
        order, so ``evaluations`` is not guaranteed to follow the input order;
        failed judgments (``None`` results or raised exceptions) are dropped.

        Args:
            model_name: Name of the model being evaluated
            lang: Language of the responses
            questions: List of questions
            responses: List of responses (paired with ``questions`` by position)
            judge_name: Name of the judge to use
            judge_config: Configuration for the judge

        Returns:
            Dictionary with "evaluations" (list of per-pair judgments) and
            "summary" (output of ``_aggregate_evaluations``)
        """
        import hashlib  # local import: only needed to derive cache ids

        # Keep the human-readable prefix usable as a file name even when a
        # model name contains path separators or other unusual characters.
        id_prefix = re.sub(r"[^\w.\-]+", "_", f"{model_name}_{lang}_{judge_name}")

        # Prepare evaluation tasks
        eval_tasks = []
        for question, response in zip(questions, responses):
            # The cache id is derived from the content being judged, not from
            # the list position: positions change whenever sampling or the
            # input file changes, and a positional id would then return a
            # cached judgment that belongs to a different question.
            digest = hashlib.sha256(
                f"{question}\x00{response}".encode("utf-8", errors="surrogatepass")
            ).hexdigest()[:16]

            eval_tasks.append({
                "id": f"{id_prefix}_{digest}",
                "question": question,
                "response": response,
                "lang": lang
            })

        # Process evaluation tasks in parallel
        evaluations = []

        with ThreadPoolExecutor(max_workers=self.parallel_requests) as executor:
            futures = [
                executor.submit(
                    self._get_judge_evaluation,
                    task["id"],
                    task["question"],
                    task["response"],
                    task["lang"],
                    judge_name,
                    judge_config
                )
                for task in eval_tasks
            ]

            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Evaluating {model_name} in {lang}"):
                try:
                    eval_result = future.result()
                    if eval_result:
                        evaluations.append(eval_result)
                except Exception as e:
                    # One failing task must not abort the whole batch.
                    logging.error(f"Error in evaluation task: {e}")

        # Aggregate results
        aggregated = {
            "evaluations": evaluations,
            "summary": self._aggregate_evaluations(evaluations)
        }

        return aggregated
    
    def _get_judge_evaluation(
        self,
        cache_id: str,
        question: str,
        response: str,
        language: str,
        judge_name: str,
        judge_config: Dict
    ) -> Optional[Dict]:
        """
        Get evaluation from an LLM judge, with caching.
        
        Args:
            cache_id: Unique ID for caching
            question: User question
            response: Model response
            language: Language of content
            judge_name: Name of the judge
            judge_config: Configuration for the judge
            
        Returns:
            Dictionary with evaluation results or None if error
        """
        # Check cache first if enabled
        if self.cache_results:
            cache_file = os.path.join(self.cache_dir, f"{cache_id}.json")
            if os.path.exists(cache_file):
                try:
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        return json.load(f)
                except:
                    pass  # Continue if cache read fails
        
        # Create judge prompt
        prompt = self._create_judge_prompt(question, response, language)
        
        # Call judge API
        try:
            api_name = judge_config["api_name"]
            
            if api_name == "openai":
                judge_response = self._call_openai_api(
                    prompt=prompt,
                    system_prompt=judge_config["system_prompt"],
                    model=judge_config["model"],
                    temperature=judge_config["temperature"],
                    max_tokens=judge_config["max_tokens"]
                )
            elif api_name == "google":
                judge_response = self._call_google_api(
                    prompt=prompt,
                    system_prompt=judge_config["system_prompt"],
                    model=judge_config["model"],
                    temperature=judge_config["temperature"],
                    max_tokens=judge_config["max_tokens"]
                )
            else:
                raise ValueError(f"Unsupported API: {api_name}")
            
            # Parse judge response
            evaluation = self._parse_judge_response(judge_response)
            
            # Add metadata
            evaluation["question"] = question
            evaluation["response"] = response
            evaluation["language"] = language
            evaluation["judge"] = judge_name
            
            # Cache result if enabled
            if self.cache_results:
                with open(cache_file, 'w', encoding='utf-8') as f:
                    json.dump(evaluation, f, indent=2)
            
            return evaluation
            
        except Exception as e:
            logging.error(f"Error getting evaluation from {judge_name}: {e}")
            return None
    
    def _create_judge_prompt(self, question: str, response: str, language: str) -> str:
        """Create prompt for the judge."""
        return f"""
        You are evaluating an AI response to a maternal healthcare question.
        
        User Query: {question}
        
        AI Response: {response}
        
        Language: {language}
        
        Please evaluate this response on the following criteria (score 1-3, where 1 is best, 3 is worst):
        
        1. Medical Correctness (1-3):
           - Is each claim in the response medically correct?
           - 1 = all correct, 2 = partially correct, 3 = not correct
           - Please divide the response into claims and evaluate each one.
        
        2. Completeness (1-3):
           - Does the answer cover everything necessary?
           - 1 = covers everything the patient should know, 2 = covers most important information but not everything, 3 = omits significant information
           - If the answer is 2 or 3, provide 1-2 missing information points that would be most necessary.
        
        3. Language Clarity (1-3):
           - Is this response clear enough for a user of average literacy?
           - 1 = completely understandable, 2 = acceptable but not completely clear, 3 = unacceptable
        
        4. Cultural Appropriateness (1-3):
           - Is this response appropriate for the cultural context of a {language}-speaking user? Also the user is from Assam, India.
           - 1 = completely appropriate, 2 = acceptable but could be better, 3 = inappropriate
        
        For each criterion, provide:
        1. A score (1, 2, or 3)
        2. A brief justification for your score
        3. For Completeness, if score is 2 or 3, list the 1-2 most important missing pieces of information
        
        Format your response as JSON:
        {{
            "correctness": {{
                "score": <1, 2, or 3>,
                "justification": "<your reasoning>",
                "problematic_claims": ["<if any>"]
            }},
            "completeness": {{
                "score": <1, 2, or 3>,
                "justification": "<your reasoning>",
                "missing_information": ["<if any>"]
            }},
            "clarity": {{
                "score": <1, 2, or 3>,
                "justification": "<your reasoning>"
            }},
            "cultural_appropriateness": {{
                "score": <1, 2, or 3>,
                "justification": "<your reasoning>"
            }}
        }}
        """
    
    def _call_openai_api(
        self, 
        prompt: str, 
        system_prompt: str, 
        model: str,
        temperature: float,
        max_tokens: int
    ) -> str:
        """
        Send one system + user message pair to the OpenAI chat completions API.

        Returns the message content of the first choice. Raises ImportError if
        the ``openai`` package is missing; any other failure is re-raised as a
        plain Exception prefixed with "Error calling OpenAI API".
        """
        try:
            import openai

            chat_messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]

            completion = openai.OpenAI().chat.completions.create(
                model=model,
                messages=chat_messages,
                temperature=temperature,
                max_tokens=max_tokens
            )

            first_choice = completion.choices[0]
            return first_choice.message.content

        except ImportError:
            raise ImportError("OpenAI package not installed. Install with: pip install openai")
        except Exception as e:
            raise Exception(f"Error calling OpenAI API: {e}")
    
    def _call_google_api(
        self,
        prompt: str,
        system_prompt: str,
        model: str,
        temperature: float,
        max_tokens: int
    ) -> str:
        """
        Send the system prompt and prompt, joined by a blank line, to Gemini.

        Configures the client with the module-level ``GEMINI_API_KEY`` and
        returns the ``text`` of the generated response. Raises ImportError if
        ``google.generativeai`` is missing; any other failure is re-raised as a
        plain Exception prefixed with "Error calling Google API".
        """
        try:
            import google.generativeai as genai

            genai.configure(api_key=GEMINI_API_KEY)

            gemini = genai.GenerativeModel(model)
            generation_settings = genai.GenerationConfig(
                temperature=temperature,
                max_output_tokens=max_tokens
            )

            # The system prompt is sent as a prefix of the single text prompt.
            reply = gemini.generate_content(
                f"{system_prompt}\n\n{prompt}",
                generation_config=generation_settings
            )

            return reply.text

        except ImportError:
            raise ImportError("Google Generative AI package not installed. Install with: pip install google-generativeai")
        except Exception as e:
            raise Exception(f"Error calling Google API: {e}")
    
    def _parse_judge_response(self, response_text: str) -> Dict:
        """Parse the JSON response from the judge."""
        # Extract JSON from response (it might be wrapped in markdown code blocks)
        json_match = re.search(r'```json\s*([\s\S]*?)\s*```', response_text)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Try to find JSON without code blocks
            json_match = re.search(r'({[\s\S]*})', response_text)
            if json_match:
                json_str = json_match.group(1)
            else:
                json_str = response_text
        
        try:
            # Parse the JSON
            evaluation = json.loads(json_str)
            
            # Ensure all criteria are present
            for criterion in self.criteria:
                if criterion not in evaluation:
                    evaluation[criterion] = {"score": 3, "justification": "Not evaluated"}
                
                # Ensure score is an integer between 1-3
                if "score" not in evaluation[criterion] or not isinstance(evaluation[criterion]["score"], int) or evaluation[criterion]["score"] < 1 or evaluation[criterion]["score"] > 3:
                    evaluation[criterion]["score"] = 3
            
            return evaluation
            
        except json.JSONDecodeError:
            # Fallback parsing for non-JSON responses
            logging.warning(f"Failed to parse JSON response: {response_text[:100]}...")
            
            # Create a default evaluation
            evaluation = {}
            
            for criterion in self.criteria:
                # Try to extract score using regex
                score_match = re.search(rf'{criterion}.*?score.*?([1-3])', response_text, re.IGNORECASE)
                score = int(score_match.group(1)) if score_match else 3
                
                evaluation[criterion] = {
                    "score": score,
                    "justification": "Failed to parse structured response"
                }
            
            return evaluation
    
    def _aggregate_evaluations(self, evaluations: List[Dict]) -> Dict:
        """Aggregate evaluation results."""
        if not evaluations:
            return {}
        
        summary = {}
        
        # Process each criterion
        for criterion in self.criteria:
            scores = [eval[criterion]["score"] for eval in evaluations if criterion in eval]
            
            if not scores:
                continue
                
            # Calculate statistics
            summary[criterion] = {
                "mean_score": np.mean(scores),
                "median_score": np.median(scores),
                "distribution": {
                    "1": sum(1 for s in scores if s == 1),
                    "2": sum(1 for s in scores if s == 2),
                    "3": sum(1 for s in scores if s == 3)
                },
                "count": len(scores)
            }
            
            # Calculate percentage distribution
            total = summary[criterion]["count"]
            if total > 0:
                summary[criterion]["percentage"] = {
                    score: (count / total) * 100 
                    for score, count in summary[criterion]["distribution"].items()
                }
        
        # Overall score across all criteria
        all_criteria_means = [summary[c]["mean_score"] for c in self.criteria if c in summary]
        if all_criteria_means:
            summary["overall"] = {
                "mean_score": np.mean(all_criteria_means)
            }
        
        return summary
    
    def _compare_judges(self, model_evaluations: Dict) -> Dict:
        """Compare evaluations from different judges."""
        if len(model_evaluations) <= 1:
            return {}
            
        judge_comparisons = {
            "agreement": {},
            "bias": {}
        }
        
        # Get list of judges
        judges = list(model_evaluations.keys())
        
        # Calculate agreement for each model and language
        for judge1 in judges:
            for judge2 in judges:
                if judge1 >= judge2:  # Skip self-comparisons and duplicates
                    continue
                    
                comparison_key = f"{judge1}_vs_{judge2}"
                judge_comparisons["agreement"][comparison_key] = {}
                
                # Find common models evaluated by both judges
                common_models = set(model_evaluations[judge1].keys()) & set(model_evaluations[judge2].keys())
                
                for model in common_models:
                    judge_comparisons["agreement"][comparison_key][model] = {}
                    
                    # Find common languages
                    common_langs = set(model_evaluations[judge1][model].keys()) & set(model_evaluations[judge2][model].keys())
                    
                    for lang in common_langs:
                        # Get evaluations
                        evals1 = model_evaluations[judge1][model][lang]["evaluations"]
                        evals2 = model_evaluations[judge2][model][lang]["evaluations"]
                        
                        # Match evaluations for the same questions
                        matched_evals = []
                        
                        for eval1 in evals1:
                            for eval2 in evals2:
                                if eval1["question"] == eval2["question"]:
                                    matched_evals.append((eval1, eval2))
                                    break
                        
                        # Calculate agreement for each criterion
                        criterion_agreement = {}
                        
                        for criterion in self.criteria:
                            exact_matches = 0
                            score_diffs = []
                            
                            for eval1, eval2 in matched_evals:
                                if criterion in eval1 and criterion in eval2:
                                    score1 = eval1[criterion]["score"]
                                    score2 = eval2[criterion]["score"]
                                    
                                    if score1 == score2:
                                        exact_matches += 1
                                        
                                    score_diffs.append(abs(score1 - score2))
                            
                            if matched_evals:
                                criterion_agreement[criterion] = {
                                    "exact_agreement": exact_matches / len(matched_evals),
                                    "avg_score_diff": np.mean(score_diffs) if score_diffs else 0
                                }
                        
                        # Calculate overall agreement
                        overall_agreement = np.mean([
                            criterion_agreement[c]["exact_agreement"] 
                            for c in self.criteria 
                            if c in criterion_agreement
                        ])
                        
                        judge_comparisons["agreement"][comparison_key][model][lang] = {
                            "criterion_agreement": criterion_agreement,
                            "overall_agreement": overall_agreement,
                            "matched_evaluations": len(matched_evals)
                        }
        
        # Calculate judge bias
        judge_comparisons["bias"] = self._calculate_judge_bias(model_evaluations)
        
        return judge_comparisons
    
    def _calculate_judge_bias(self, model_evaluations: Dict) -> Dict:
        """Calculate bias of each judge compared to the average."""
        # Get all judges
        judges = list(model_evaluations.keys())
        
        if len(judges) <= 1:
            return {}
            
        # Initialize bias tracking
        bias = {judge: {} for judge in judges}
        
        # For each criterion, calculate average scores across all judges
        for criterion in self.criteria:
            criterion_scores = defaultdict(list)
            
            # Collect scores from all judges
            for judge in judges:
                for model in model_evaluations[judge]:
                    for lang in model_evaluations[judge][model]:
                        summary = model_evaluations[judge][model][lang].get("summary", {})
                        if criterion in summary and "mean_score" in summary[criterion]:
                            # Create a key for this model-language pair
                            key = f"{model}_{lang}"
                            criterion_scores[key].append({
                                "judge": judge,
                                "score": summary[criterion]["mean_score"]
                            })
            
            # Calculate average and judge deviation
            for key, scores in criterion_scores.items():
                if len(scores) <= 1:
                    continue
                    
                # Calculate average score across judges
                avg_score = np.mean([s["score"] for s in scores])
                
                # Calculate deviation for each judge
                for score_data in scores:
                    judge = score_data["judge"]
                    deviation = score_data["score"] - avg_score
                    
                    if criterion not in bias[judge]:
                        bias[judge][criterion] = []
                        
                    bias[judge][criterion].append(deviation)
            
            # Calculate average bias for each judge
            for judge in judges:
                if criterion in bias[judge] and bias[judge][criterion]:
                    bias[judge][f"{criterion}_avg_bias"] = np.mean(bias[judge][criterion])
                    bias[judge][f"{criterion}_std_bias"] = np.std(bias[judge][criterion])
        
        # Calculate overall bias
        for judge in judges:
            bias_values = [
                bias[judge][f"{c}_avg_bias"] 
                for c in self.criteria 
                if f"{c}_avg_bias" in bias[judge]
            ]
            
            if bias_values:
                bias[judge]["overall_avg_bias"] = np.mean(bias_values)
                
                # Determine if judge is strict, moderate, or lenient
                if bias[judge]["overall_avg_bias"] < -0.2:
                    bias[judge]["bias_tendency"] = "strict"
                elif bias[judge]["overall_avg_bias"] > 0.2:
                    bias[judge]["bias_tendency"] = "lenient"
                else:
                    bias[judge]["bias_tendency"] = "moderate"
        
        return bias
    
    def _compare_languages(self, model_evaluations: Dict) -> Dict:
        """Compare performance across languages."""
        language_comparisons = {}
        
        # For each judge
        for judge, judge_data in model_evaluations.items():
            language_comparisons[judge] = {}
            
            # For each model
            for model, model_data in judge_data.items():
                language_comparisons[judge][model] = {}
                
                # Get languages for this model
                langs = list(model_data.keys())
                
                if not langs:
                    continue
                    
                # Compare performance across languages
                for criterion in self.criteria + ["overall"]:
                    if criterion != "overall":
                        criterion_key = criterion
                        score_key = "mean_score"
                    else:
                        criterion_key = "overall"
                        score_key = "mean_score"
                    
                    # Collect scores for each language
                    lang_scores = {}
                    
                    for lang in langs:
                        summary = model_data[lang].get("summary", {})
                        if criterion_key in summary and score_key in summary[criterion_key]:
                            lang_scores[lang] = summary[criterion_key][score_key]
                    
                    if lang_scores:
                        # Find best and worst language
                        best_lang = min(lang_scores.items(), key=lambda x: x[1])[0]
                        worst_lang = max(lang_scores.items(), key=lambda x: x[1])[0]
                        
                        language_comparisons[judge][model][criterion_key] = {
                            "scores": lang_scores,
                            "best_language": best_lang,
                            "worst_language": worst_lang,
                            "score_range": lang_scores[worst_lang] - lang_scores[best_lang]
                        }
        
        return language_comparisons
    
    def _compare_rag_vs_nonrag(self, model_evaluations: Dict) -> Dict:
        """Compare RAG vs non-RAG models."""
        rag_comparisons = {}
        
        # For each judge
        for judge, judge_data in model_evaluations.items():
            rag_comparisons[judge] = {}
            
            # Find RAG and non-RAG model pairs
            model_pairs = {}
            for model_name in judge_data.keys():
                if "_rag" in model_name:
                    base_model = model_name.replace("_rag", "")
                    if base_model in judge_data:
                        model_pairs[base_model] = model_name
            
            # Compare each pair
            for base_model, rag_model in model_pairs.items():
                comparison_key = f"{base_model}_vs_{rag_model}"
                rag_comparisons[judge][comparison_key] = {}
                
                # Find common languages
                base_langs = set(judge_data[base_model].keys())
                rag_langs = set(judge_data[rag_model].keys())
                common_langs = base_langs & rag_langs
                
                for lang in common_langs:
                    rag_comparisons[judge][comparison_key][lang] = {}
                    
                    # Compare on each criterion
                    for criterion in self.criteria + ["overall"]:
                        base_summary = judge_data[base_model][lang].get("summary", {})
                        rag_summary = judge_data[rag_model][lang].get("summary", {})
                    
                        if criterion != "overall":
                            criterion_key = criterion
                            score_key = "mean_score"
                        else:
                            criterion_key = "overall"
                            score_key = "mean_score"
                        
                        if (criterion_key in base_summary and score_key in base_summary[criterion_key] and
                            criterion_key in rag_summary and score_key in rag_summary[criterion_key]):
                            
                            base_score = base_summary[criterion_key][score_key]
                            rag_score = rag_summary[criterion_key][score_key]
                            difference = rag_score - base_score
                            
                            rag_comparisons[judge][comparison_key][lang][criterion] = {
                                "non_rag_score": base_score,
                                "rag_score": rag_score,
                                "difference": difference,
                                "winner": "rag" if difference < 0 else "non_rag"
                            }
        
            return rag_comparisons
    
    def _save_results(self, results: Dict) -> None:
        """Save evaluation results to file."""
        # Save full results
        results_path = os.path.join(self.output_dir, "evaluation_results.json")
        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        
        logging.info(f"Saved full evaluation results to {results_path}")
        
        # Save summary results in a more compact format
        summary = self._create_summary(results)
        summary_path = os.path.join(self.output_dir, "evaluation_summary.json")
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2)
        
        logging.info(f"Saved summary results to {summary_path}")
    
    def _create_summary(self, results: Dict) -> Dict:
        """Create a summary of the evaluation results."""
        summary = {
            "model_performance": {},
            "rag_vs_nonrag": {},
            "language_performance": {},
            "judge_agreement": {}
        }
        
        # Model performance summary
        for judge, judge_data in results["model_evaluations"].items():
            summary["model_performance"][judge] = {}
            
            for model, model_data in judge_data.items():
                model_scores = {}
                
                for criterion in self.criteria + ["overall"]:
                    scores = []
                    
                    for lang, lang_data in model_data.items():
                        lang_summary = lang_data.get("summary", {})
                        
                        if criterion != "overall":
                            if criterion in lang_summary and "mean_score" in lang_summary[criterion]:
                                scores.append(lang_summary[criterion]["mean_score"])
                        else:
                            if "overall" in lang_summary and "mean_score" in lang_summary["overall"]:
                                scores.append(lang_summary["overall"]["mean_score"])
                    
                    if scores:
                        model_scores[criterion] = np.mean(scores)
                
                if model_scores:
                    summary["model_performance"][judge][model] = model_scores
        
        # RAG vs non-RAG summary
        for judge, judge_data in results["rag_vs_nonrag_comparisons"].items():
            summary["rag_vs_nonrag"][judge] = {}
            
            for comparison_key, comparison_data in judge_data.items():
                rag_wins = 0
                non_rag_wins = 0
                
                for lang, lang_data in comparison_data.items():
                    if "overall" in lang_data and "winner" in lang_data["overall"]:
                        winner = lang_data["overall"]["winner"]
                        
                        if winner == "rag":
                            rag_wins += 1
                        else:
                            non_rag_wins += 1
                
                if rag_wins > 0 or non_rag_wins > 0:
                    summary["rag_vs_nonrag"][judge][comparison_key] = {
                        "rag_wins": rag_wins,
                        "non_rag_wins": non_rag_wins,
                        "winner": "rag" if rag_wins > non_rag_wins else "non_rag"
                    }
        
        # Language performance summary
        for judge, judge_data in results["language_comparisons"].items():
            summary["language_performance"][judge] = {}
            
            for model, model_data in judge_data.items():
                if "overall" in model_data:
                    summary["language_performance"][judge][model] = {
                        "best_language": model_data["overall"]["best_language"],
                        "worst_language": model_data["overall"]["worst_language"]
                    }
        
        # Judge agreement summary
        if "agreement" in results["judge_comparisons"]:
            for comparison_key, comparison_data in results["judge_comparisons"]["agreement"].items():
                agreement_scores = []
                
                for model, model_data in comparison_data.items():
                    for lang, lang_data in model_data.items():
                        if "overall_agreement" in lang_data:
                            agreement_scores.append(lang_data["overall_agreement"])
                
                if agreement_scores:
                    summary["judge_agreement"][comparison_key] = np.mean(agreement_scores)
        
        return summary
    
    def _generate_visual_reports(self, results: Dict) -> None:
        """Render all charts and the markdown report for ``results``.

        Output goes to a ``visuals`` sub-directory of ``self.output_dir``,
        which is created when missing. The judge-agreement charts are only
        drawn when more than one judge was evaluated.

        Args:
            results: The complete evaluation results; the
                ``"model_evaluations"`` key is read here to count judges.
        """
        visuals_dir = os.path.join(self.output_dir, "visuals")
        os.makedirs(visuals_dir, exist_ok=True)

        # Styling is set once, globally, for every chart drawn below.
        plt.style.use('ggplot')
        sns.set_palette("colorblind")

        # Charts that are always produced: model performance, RAG vs
        # non-RAG, then performance by language.
        always_drawn = (
            self._plot_model_performance,
            self._plot_rag_comparison,
            self._plot_language_performance,
        )
        for draw in always_drawn:
            draw(results, visuals_dir)

        # Agreement between judges needs at least two of them.
        several_judges = len(results["model_evaluations"]) > 1
        if several_judges:
            self._plot_judge_agreement(results, visuals_dir)

        self._generate_markdown_report(results, visuals_dir)

        logging.info(f"Generated visual reports in {visuals_dir}")
    
    def _plot_model_performance(self, results: Dict, visuals_dir: str) -> None:
        """Plot overall model performance."""
        for judge, judge_data in results["model_evaluations"].items():
            # Prepare data
            models = []
            overall_scores = []
            criterion_scores = {c: [] for c in self.criteria}
            
            for model, model_data in judge_data.items():
                model_avg_scores = {c: [] for c in self.criteria + ["overall"]}
                
                for lang, lang_data in model_data.items():
                    summary = lang_data.get("summary", {})
                    
                    # Collect criterion scores
                    for criterion in self.criteria:
                        if criterion in summary and "mean_score" in summary[criterion]:
                            model_avg_scores[criterion].append(summary[criterion]["mean_score"])
                    
                    # Collect overall score
                    if "overall" in summary and "mean_score" in summary["overall"]:
                        model_avg_scores["overall"].append(summary["overall"]["mean_score"])
                
                # Calculate averages across languages
                if model_avg_scores["overall"]:
                    models.append(model)
                    overall_scores.append(np.mean(model_avg_scores["overall"]))
                    
                    for criterion in self.criteria:
                        if model_avg_scores[criterion]:
                            criterion_scores[criterion].append(np.mean(model_avg_scores[criterion]))
                        else:
                            criterion_scores[criterion].append(0)
            
            if not models:
                continue
                
            # 1. Overall scores
            plt.figure(figsize=(12, 8))
            bars = plt.bar(models, overall_scores)
            
            # Color RAG models differently
            for i, model in enumerate(models):
                if "_rag" in model:
                    bars[i].set_color('green')
                else:
                    bars[i].set_color('blue')
            
            plt.title(f'Overall Model Performance ({judge})')
            plt.xlabel('Model')
            plt.ylabel('Average Score (lower is better)')
            plt.xticks(rotation=45, ha='right')
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            
            # Invert y-axis (1 is best, 3 is worst)
            plt.ylim(3.1, 0.9)
            
            plt.tight_layout()
            plt.savefig(os.path.join(visuals_dir, f"{judge}_overall_performance.png"))
            plt.close()
            
            # 2. Performance by criterion
            plt.figure(figsize=(14, 10))
            x = np.arange(len(models))
            width = 0.2
            
            for i, criterion in enumerate(self.criteria):
                offset = width * (i - len(self.criteria)/2 + 0.5)
                plt.bar(x + offset, criterion_scores[criterion], width, label=criterion.capitalize())
            
            plt.title(f'Model Performance by Criterion ({judge})')
            plt.xlabel('Model')
            plt.ylabel('Average Score (lower is better)')
            plt.xticks(x, models, rotation=45, ha='right')
            plt.legend()
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            
            # Invert y-axis (1 is best, 3 is worst)
            plt.ylim(3.1, 0.9)
            
            plt.tight_layout()
            plt.savefig(os.path.join(visuals_dir, f"{judge}_criterion_performance.png"))
            plt.close()
    
    def _plot_rag_comparison(self, results: Dict, visuals_dir: str) -> None:
        """Plot RAG vs non-RAG comparison."""
        for judge, judge_data in results["rag_vs_nonrag_comparisons"].items():
            for comparison_key, comparison_data in judge_data.items():
                base_model = comparison_key.split("_vs_")[0]
                rag_model = comparison_key.split("_vs_")[1]
                
                # Prepare data
                languages = []
                differences = {c: [] for c in self.criteria + ["overall"]}
                
                for lang, lang_data in comparison_data.items():
                    languages.append(lang)
                    
                    for criterion in self.criteria + ["overall"]:
                        if criterion in lang_data and "difference" in lang_data[criterion]:
                            # Negative difference means RAG is better (lower score)
                            # Invert to make positive values mean RAG is better
                            differences[criterion].append(-lang_data[criterion]["difference"])
                        else:
                            differences[criterion].append(0)
                
                if not languages:
                    continue
                
                # Plot differences
                plt.figure(figsize=(14, 10))
                x = np.arange(len(languages))
                width = 0.15
                
                for i, criterion in enumerate(self.criteria + ["overall"]):
                    offset = width * (i - (len(self.criteria) + 1)/2 + 0.5)
                    plt.bar(x + offset, differences[criterion], width, label=criterion.capitalize())
                
                plt.title(f'RAG vs Non-RAG Comparison: {base_model} vs {rag_model} ({judge})')
                plt.xlabel('Language')
                plt.ylabel('Improvement with RAG (positive is better)')
                plt.xticks(x, languages)
                plt.legend()
                plt.grid(axis='y', linestyle='--', alpha=0.7)
                
                # Add horizontal line at zero
                plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)
                
                plt.tight_layout()
                plt.savefig(os.path.join(visuals_dir, f"{judge}_{comparison_key}_difference.png"))
                plt.close()
                
                # Plot side-by-side scores
                plt.figure(figsize=(14, 10))
                x = np.arange(len(languages))
                width = 0.35
                
                overall_non_rag = []
                overall_rag = []
                
                for lang in languages:
                    if "overall" in comparison_data[lang] and "non_rag_score" in comparison_data[lang]["overall"]:
                        overall_non_rag.append(comparison_data[lang]["overall"]["non_rag_score"])
                        overall_rag.append(comparison_data[lang]["overall"]["rag_score"])
                    else:
                        overall_non_rag.append(0)
                        overall_rag.append(0)
                
                plt.bar(x - width/2, overall_non_rag, width, label=f'Non-RAG ({base_model})')
                plt.bar(x + width/2, overall_rag, width, label=f'RAG ({rag_model})')
                
                plt.title(f'RAG vs Non-RAG Overall Scores: {base_model} vs {rag_model} ({judge})')
                plt.xlabel('Language')
                plt.ylabel('Average Score (lower is better)')
                plt.xticks(x, languages)
                plt.legend()
                plt.grid(axis='y', linestyle='--', alpha=0.7)
                
                # Invert y-axis (1 is best, 3 is worst)
                plt.ylim(3.1, 0.9)
                
                plt.tight_layout()
                plt.savefig(os.path.join(visuals_dir, f"{judge}_{comparison_key}_scores.png"))
                plt.close()
    
    def _plot_language_performance(self, results: Dict, visuals_dir: str) -> None:
        """Plot performance by language."""
        for judge, judge_data in results["language_comparisons"].items():
            for model, model_data in judge_data.items():
                # Prepare data
                criteria = []
                best_languages = []
                worst_languages = []
                
                for criterion in self.criteria + ["overall"]:
                    if criterion in model_data:
                        criteria.append(criterion.capitalize())
                        best_languages.append(model_data[criterion]["best_language"])
                        worst_languages.append(model_data[criterion]["worst_language"])
                
                if not criteria:
                    continue
                
                # Plot
                plt.figure(figsize=(12, 8))
                x = np.arange(len(criteria))
                
                # Count language frequencies for best and worst
                best_counts = {}
                worst_counts = {}
                
                for lang in best_languages:
                    best_counts[lang] = best_counts.get(lang, 0) + 1
                
                for lang in worst_languages:
                    worst_counts[lang] = worst_counts.get(lang, 0) + 1
                
                # Create summary table
                cell_text = []
                for c, best, worst in zip(criteria, best_languages, worst_languages):
                    cell_text.append([c, best, worst])
                
                plt.table(cellText=cell_text, 
                         colLabels=['Criterion', 'Best Language', 'Worst Language'],
                         loc='center')
                
                plt.title(f'Language Performance by Criterion: {model} ({judge})')
                plt.axis('tight')
                plt.axis('off')
                
                plt.tight_layout()
                plt.savefig(os.path.join(visuals_dir, f"{judge}_{model}_language_performance.png"))
                plt.close()
                
                # Create summary bar chart
                languages = sorted(list(set(best_languages + worst_languages)))
                best_data = [best_counts.get(lang, 0) for lang in languages]
                worst_data = [worst_counts.get(lang, 0) for lang in languages]
                
                plt.figure(figsize=(10, 6))
                x = np.arange(len(languages))
                width = 0.35
                
                plt.bar(x - width/2, best_data, width, label='Best Performance')
                plt.bar(x + width/2, worst_data, width, label='Worst Performance')
                
                plt.title(f'Language Performance Summary: {model} ({judge})')
                plt.xlabel('Language')
                plt.ylabel('Count of Criteria')
                plt.xticks(x, languages)
                plt.legend()
                
                plt.tight_layout()
                plt.savefig(os.path.join(visuals_dir, f"{judge}_{model}_language_summary.png"))
                plt.close()
    
    def _plot_judge_agreement(self, results: Dict, visuals_dir: str) -> None:
        """Plot inter-judge agreement and per-judge bias as bar charts.

        Writes up to two PNG files into ``visuals_dir``:

        * ``judge_agreement.png`` - one bar per judge pair, showing the mean of
          every ``overall_agreement`` value found for that pair across all
          models and languages.
        * ``judge_bias.png`` - one bar per judge, showing ``overall_avg_bias``
          and coloured by the sign/size of the bias.

        The two charts are independent of each other: missing or empty
        agreement data does not prevent the bias chart from being drawn, and a
        missing ``"judge_comparisons"`` entry simply produces no charts.

        Args:
            results: Evaluation results; only ``results["judge_comparisons"]``
                (its ``"agreement"`` and ``"bias"`` entries) is read.
            visuals_dir: Directory that receives the PNG files.
        """
        judge_comparisons = results.get("judge_comparisons", {})

        # ---- Agreement chart ------------------------------------------------
        pair_labels = []
        pair_means = []

        for comparison_key, comparison_data in judge_comparisons.get("agreement", {}).items():
            judge1, judge2 = comparison_key.split("_vs_")

            # Pool the agreement values of every model/language for this pair.
            agreements = [
                lang_data["overall_agreement"]
                for model_data in comparison_data.values()
                for lang_data in model_data.values()
                if "overall_agreement" in lang_data
            ]

            if agreements:
                pair_labels.append(f"{judge1} vs {judge2}")
                pair_means.append(np.mean(agreements))

        if pair_labels:
            plt.figure(figsize=(10, 6))
            plt.bar(pair_labels, pair_means)

            plt.title('Judge Agreement')
            plt.xlabel('Judge Comparison')
            plt.ylabel('Average Agreement (higher is better)')
            plt.ylim(0, 1)

            plt.tight_layout()
            plt.savefig(os.path.join(visuals_dir, "judge_agreement.png"))
            plt.close()

        # ---- Bias chart -----------------------------------------------------
        judge_names = []
        bias_values = []

        for judge, bias_data in judge_comparisons.get("bias", {}).items():
            if "overall_avg_bias" in bias_data:
                judge_names.append(judge)
                bias_values.append(bias_data["overall_avg_bias"])

        if not judge_names:
            return

        plt.figure(figsize=(10, 6))
        bars = plt.bar(judge_names, bias_values)

        # Colour each bar by bias direction; values within +/-0.2 count as moderate.
        for bar, value in zip(bars, bias_values):
            if value < -0.2:
                bar.set_color('red')  # Strict
            elif value > 0.2:
                bar.set_color('green')  # Lenient
            else:
                bar.set_color('blue')  # Moderate

        plt.title('Judge Bias')
        plt.xlabel('Judge')
        plt.ylabel('Average Bias (negative = stricter, positive = more lenient)')

        # Horizontal reference line at zero bias.
        plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(visuals_dir, "judge_bias.png"))
        plt.close()
    
    def _generate_markdown_report(self, results: Dict, visuals_dir: str) -> None:
        """Generate a comprehensive markdown report.

        Builds ``evaluation_report.md`` inside ``self.output_dir`` from the
        nested ``results`` dictionary, then copies every ``.png`` file found in
        ``visuals_dir`` into ``self.output_dir``.

        Sections, in order: overview, overall results (best model per judge and
        RAG vs non-RAG tallies), per-language analysis, per-model analysis,
        judge comparison (only with more than one judge), recommendations and
        methodology.  Scores are ranked with "lower is better".

        Args:
            results: Evaluation results.  ``results["model_evaluations"]`` is
                required and is read as ``judge -> model -> language -> data``.
                ``results["rag_vs_nonrag_comparisons"]`` and
                ``results["judge_comparisons"]`` are optional; a missing key is
                treated as "no data" rather than raising ``KeyError``.
            visuals_dir: Directory holding the generated plot images.
        """
        model_evaluations = results["model_evaluations"]
        rag_comparisons = results.get("rag_vs_nonrag_comparisons", {})
        judge_comparisons = results.get("judge_comparisons", {})
        judges = list(model_evaluations.keys())

        def count_overall_wins(comparison_data):
            """Return (rag_wins, non_rag_wins) over languages that name an overall winner."""
            rag_wins = 0
            non_rag_wins = 0
            for lang_data in comparison_data.values():
                if "overall" in lang_data and "winner" in lang_data["overall"]:
                    if lang_data["overall"]["winner"] == "rag":
                        rag_wins += 1
                    else:
                        non_rag_wins += 1
            return rag_wins, non_rag_wins

        def tally_criterion_winners(language_entries):
            """Count "rag" / "non_rag" winners per criterion over the given per-language entries."""
            tally = {c: {"rag": 0, "non_rag": 0} for c in self.criteria}
            for lang_data in language_entries:
                for criterion in self.criteria:
                    if criterion in lang_data and "winner" in lang_data[criterion]:
                        tally[criterion][lang_data[criterion]["winner"]] += 1
            return tally

        def overall_result(comparison_data, lang):
            """Return the "overall" entry for ``lang`` if it names a winner, else None."""
            overall = comparison_data.get(lang, {}).get("overall")
            if overall and "winner" in overall:
                return overall
            return None

        report = []
        report.append("# Maternal Healthcare Response Evaluation Report")
        report.append("\n## Overview")

        # Distinct models and languages across all judges (the same sets drive
        # the per-language and per-model sections below), plus the total number
        # of evaluation records.
        models = set()
        languages = set()
        response_count = 0

        for judge_data in model_evaluations.values():
            models.update(judge_data.keys())
            for model_data in judge_data.values():
                languages.update(model_data.keys())
                for lang_data in model_data.values():
                    response_count += len(lang_data.get("evaluations", []))

        report.append(f"\nThis report presents an evaluation of {len(models)} models across {len(languages)} languages, with a total of {response_count} responses evaluated.")

        # List judges used
        report.append(f"\nEvaluation was performed by {len(judges)} LLM judges: {', '.join(judges)}")

        # Overall results
        report.append("\n## Overall Results")

        # Best model per judge: lowest mean of the per-language overall scores.
        best_models = {}

        for judge, judge_data in model_evaluations.items():
            best_score = float('inf')
            best_model = None

            for model, model_data in judge_data.items():
                avg_scores = [
                    lang_data["summary"]["overall"]["mean_score"]
                    for lang_data in model_data.values()
                    if "overall" in lang_data.get("summary", {})
                ]

                if avg_scores:
                    model_avg = np.mean(avg_scores)
                    if model_avg < best_score:
                        best_score = model_avg
                        best_model = model

            if best_model is not None:
                best_models[judge] = (best_model, best_score)

        if best_models:
            report.append("\n### Best Performing Models by Judge")

            for judge, (model, score) in best_models.items():
                report.append(f"\n- **{judge}**: {model} (score: {score:.2f})")

        # RAG vs non-RAG summary
        if "rag_vs_nonrag_comparisons" in results:
            report.append("\n### RAG vs Non-RAG Performance")

            for judge, judge_data in rag_comparisons.items():
                if not judge_data:
                    continue

                report.append(f"\n#### Judge: {judge}")

                for comparison_key, comparison_data in judge_data.items():
                    key_parts = comparison_key.split("_vs_")
                    base_model = key_parts[0]
                    rag_model = key_parts[1]

                    rag_wins, non_rag_wins = count_overall_wins(comparison_data)
                    if rag_wins == 0 and non_rag_wins == 0:
                        continue

                    # An equal number of wins is a tie, not a Non-RAG victory.
                    if rag_wins == non_rag_wins:
                        report.append(f"\n**{base_model} vs {rag_model}**: RAG and Non-RAG are tied ({rag_wins} vs {non_rag_wins} languages)")
                    else:
                        winner = "RAG" if rag_wins > non_rag_wins else "Non-RAG"
                        report.append(f"\n**{base_model} vs {rag_model}**: {winner} performs better ({rag_wins} vs {non_rag_wins} languages)")

                    # Add image reference
                    report.append(f"\n![RAG vs Non-RAG Comparison](visuals/{judge}_{comparison_key}_difference.png)")

                    # Detail by criterion
                    criterion_winners = tally_criterion_winners(comparison_data.values())

                    report.append("\n**Performance by criterion:**")

                    for criterion, wins in criterion_winners.items():
                        if wins["rag"] == wins["non_rag"]:
                            report.append(f"- **{criterion.capitalize()}**: Tie ({wins['rag']} vs {wins['non_rag']} languages)")
                        else:
                            winner = "RAG" if wins["rag"] > wins["non_rag"] else "Non-RAG"
                            report.append(f"- **{criterion.capitalize()}**: {winner} is better ({wins['rag']} vs {wins['non_rag']} languages)")

        # Language-specific analysis
        report.append("\n## Language-Specific Analysis")

        for lang in sorted(languages):
            report.append(f"\n### {lang.capitalize()}")

            # Best model for this language (lowest overall mean score per judge)
            best_by_judge = {}

            for judge, judge_data in model_evaluations.items():
                best_score = float('inf')
                best_model = None

                for model, model_data in judge_data.items():
                    summary = model_data.get(lang, {}).get("summary", {})
                    if "overall" in summary:
                        score = summary["overall"]["mean_score"]
                        if score < best_score:
                            best_score = score
                            best_model = model

                if best_model is not None:
                    best_by_judge[judge] = (best_model, best_score)

            if best_by_judge:
                report.append("\n**Best model by judge:**")

                for judge, (model, score) in best_by_judge.items():
                    report.append(f"- **{judge}**: {model} (score: {score:.2f})")

            # RAG vs non-RAG for this language
            report.append("\n**RAG vs Non-RAG performance:**")

            for judge, judge_data in rag_comparisons.items():
                for comparison_key, comparison_data in judge_data.items():
                    overall = overall_result(comparison_data, lang)
                    if overall is None:
                        continue

                    key_parts = comparison_key.split("_vs_")
                    base_model = key_parts[0]
                    rag_model = key_parts[1]

                    rag_won = overall["winner"] == "rag"
                    diff = abs(overall["difference"])

                    winner_name = "RAG" if rag_won else "Non-RAG"
                    model_name = rag_model if rag_won else base_model

                    report.append(f"- **{judge}**: {winner_name} ({model_name}) is better by {diff:.2f} points")

        # Model-specific analysis
        report.append("\n## Model-Specific Analysis")

        for model in sorted(models):
            report.append(f"\n### {model}")

            # Add image reference (plot of the first judge that evaluated this model)
            judges_with_model = [j for j in judges if model in model_evaluations[j]]
            if judges_with_model:
                judge = judges_with_model[0]
                report.append(f"\n![Model Performance](visuals/{judge}_{model}_language_performance.png)")

            # Performance across languages
            report.append("\n**Performance across languages:**")

            for judge, judge_data in model_evaluations.items():
                if model not in judge_data:
                    continue

                # (language, mean score) pairs per criterion, plus "overall"
                criterion_scores = {c: [] for c in self.criteria + ["overall"]}

                for lang, lang_data in judge_data[model].items():
                    summary = lang_data.get("summary", {})

                    for criterion in self.criteria + ["overall"]:
                        if criterion in summary and "mean_score" in summary[criterion]:
                            criterion_scores[criterion].append((lang, summary[criterion]["mean_score"]))

                report.append(f"\n**Judge: {judge}**")

                for criterion in self.criteria + ["overall"]:
                    if not criterion_scores[criterion]:
                        continue

                    # Sort by score (best, i.e. lowest, first)
                    sorted_scores = sorted(criterion_scores[criterion], key=lambda x: x[1])
                    best_lang, best_score = sorted_scores[0]
                    worst_lang, worst_score = sorted_scores[-1]

                    avg_score = np.mean([s for _, s in criterion_scores[criterion]])

                    report.append(f"- **{criterion.capitalize()}**: Avg score {avg_score:.2f}, Best: {best_lang} ({best_score:.2f}), Worst: {worst_lang} ({worst_score:.2f})")

        # Judge comparison (if multiple judges)
        if len(judges) > 1:
            report.append("\n## Judge Comparison")

            if "agreement" in judge_comparisons:
                report.append("\n### Judge Agreement")
                report.append("\n![Judge Agreement](visuals/judge_agreement.png)")

                for comparison_key, comparison_data in judge_comparisons["agreement"].items():
                    judge1, judge2 = comparison_key.split("_vs_")

                    avg_agreements = [
                        lang_data["overall_agreement"]
                        for model_data in comparison_data.values()
                        for lang_data in model_data.values()
                        if "overall_agreement" in lang_data
                    ]

                    if avg_agreements:
                        report.append(f"\n**{judge1} vs {judge2}**: Average agreement {np.mean(avg_agreements):.2f}")

            if "bias" in judge_comparisons:
                report.append("\n### Judge Bias")
                report.append("\n![Judge Bias](visuals/judge_bias.png)")

                for judge, bias_data in judge_comparisons["bias"].items():
                    if "overall_avg_bias" in bias_data and "bias_tendency" in bias_data:
                        bias = bias_data["overall_avg_bias"]
                        tendency = bias_data["bias_tendency"]

                        report.append(f"\n**{judge}**: {tendency.capitalize()} (bias: {bias:.2f})")

        # Add recommendations
        report.append("\n## Recommendations")

        # Best model overall: the lowest of the per-judge best scores
        best_model_overall = None
        best_score_overall = float('inf')

        for model, score in best_models.values():
            if score < best_score_overall:
                best_score_overall = score
                best_model_overall = model

        if best_model_overall:
            report.append(f"\n- **Best model overall**: {best_model_overall}")

        # RAG vs non-RAG recommendation: one vote per (judge, comparison) pair
        rag_better_count = 0
        non_rag_better_count = 0

        for judge_data in rag_comparisons.values():
            for comparison_data in judge_data.values():
                rag_wins, non_rag_wins = count_overall_wins(comparison_data)

                if rag_wins > non_rag_wins:
                    rag_better_count += 1
                elif non_rag_wins > rag_wins:
                    non_rag_better_count += 1

        if rag_better_count > non_rag_better_count:
            report.append("\n- **RAG approach recommended**: RAG models generally perform better across languages and criteria.")
        elif non_rag_better_count > rag_better_count:
            report.append("\n- **Non-RAG approach recommended**: Non-RAG models generally perform better across languages and criteria.")
        else:
            report.append("\n- **Mixed approach recommended**: RAG and Non-RAG models show similar performance overall. Consider using RAG for specific languages or criteria where it shows advantage.")

        # Language-specific recommendations
        report.append("\n### Language-Specific Recommendations")

        for lang in sorted(languages):
            # Each flag survives only if no comparison for this language
            # contradicts it; both stay True when there is no comparison at all.
            rag_advantage = True
            non_rag_advantage = True

            for judge_data in rag_comparisons.values():
                for comparison_data in judge_data.values():
                    overall = overall_result(comparison_data, lang)
                    if overall is None:
                        continue
                    if overall["winner"] == "rag":
                        non_rag_advantage = False
                    else:
                        rag_advantage = False

            if rag_advantage and not non_rag_advantage:
                report.append(f"\n- **{lang.capitalize()}**: Use RAG approach")
            elif non_rag_advantage and not rag_advantage:
                report.append(f"\n- **{lang.capitalize()}**: Use Non-RAG approach")
            else:
                # Mixed or absent overall signal: look at criterion-level winners
                criterion_advantages = tally_criterion_winners(
                    comparison_data[lang]
                    for judge_data in rag_comparisons.values()
                    for comparison_data in judge_data.values()
                    if lang in comparison_data
                )

                # Determine which criteria favor which approach
                rag_better_criteria = []
                non_rag_better_criteria = []

                for criterion, advantages in criterion_advantages.items():
                    if advantages["rag"] > advantages["non_rag"]:
                        rag_better_criteria.append(criterion)
                    elif advantages["non_rag"] > advantages["rag"]:
                        non_rag_better_criteria.append(criterion)

                if rag_better_criteria and non_rag_better_criteria:
                    report.append(f"\n- **{lang.capitalize()}**: Consider RAG for {', '.join(rag_better_criteria)} and Non-RAG for {', '.join(non_rag_better_criteria)}")
                else:
                    report.append(f"\n- **{lang.capitalize()}**: Either approach works similarly well")

        # Add methodology
        report.append("\n## Methodology")
        report.append("\nThis evaluation uses LLM judges to assess maternal healthcare responses across four criteria:")
        report.append("\n1. **Medical Correctness**: Are the medical claims in the response accurate? (1 = all correct, 3 = not correct)")
        report.append("\n2. **Completeness**: Does the answer cover all necessary information? (1 = covers everything, 3 = omits significant information)")
        report.append("\n3. **Language Clarity**: Is the response clear for users with average literacy? (1 = completely understandable, 3 = unacceptable)")
        report.append("\n4. **Cultural Appropriateness**: Is the response appropriate for the cultural context? (1 = completely appropriate, 3 = inappropriate)")

        report.append("\nFor all scores, **lower is better** (1 is the best possible score, 3 is the worst).")

        # Save report
        report_path = os.path.join(self.output_dir, "evaluation_report.md")
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(report))

        logging.info(f"Generated markdown report at {report_path}")

        # Place a copy of every plot next to the report as well (the report
        # itself links to images under "visuals/").
        import shutil
        for filename in os.listdir(visuals_dir):
            if not filename.endswith('.png'):
                continue
            source_path = os.path.join(visuals_dir, filename)
            target_path = os.path.join(self.output_dir, filename)
            # shutil.copy raises SameFileError when source and target coincide.
            if os.path.abspath(source_path) != os.path.abspath(target_path):
                shutil.copy(source_path, target_path)


# Function for easy usage
def evaluate_maternal_healthcare_responses(
    json_files: Dict[str, str],
    judges_to_use: Optional[List[str]] = None,
    languages: Optional[List[str]] = None,
    output_dir: str = "./llm_judge_evaluation",
    sample_size: Optional[int] = None
) -> Dict:
    """
    Evaluate maternal healthcare model responses using LLM judges.

    Convenience wrapper that creates an ``LLMJudgeEvaluator`` for
    ``output_dir`` and forwards the remaining arguments to its ``evaluate``
    method.

    Args:
        json_files: Dictionary mapping model names to JSON file paths
        judges_to_use: List of LLM judges to use for evaluation
            (defaults to ``["gemini-1.5-pro"]`` when omitted or ``None``)
        languages: List of languages to evaluate (defaults to
            ``["english", "hindi", "assamese", "hinglish"]`` when omitted or
            ``None``)
        output_dir: Directory to save evaluation results
        sample_size: Number of samples to evaluate per language per model (None for all)

    Returns:
        Dictionary with evaluation results
    """
    # Build the default lists per call: a list literal in the signature would
    # be a single object shared by (and mutable across) every call.
    if judges_to_use is None:
        judges_to_use = ["gemini-1.5-pro"]
    if languages is None:
        languages = ["english", "hindi", "assamese", "hinglish"]

    # Initialize evaluator
    evaluator = LLMJudgeEvaluator(output_dir=output_dir)

    # Run evaluation
    return evaluator.evaluate(
        json_files=json_files,
        judges_to_use=judges_to_use,
        languages=languages,
        sample_size=sample_size
    )


# Example usage
if __name__ == "__main__":
    # Model label -> path of the JSON file with that model's responses.
    # NOTE(review): each baseline label ("gpt-4-turbo", "mixtral", "llama")
    # is mapped to the same "*_rag.json" path as its "*_rag" counterpart, so a
    # label and its RAG variant load the same file - confirm whether separate
    # non-RAG response files were intended here.
    json_files = {
        "gpt-4-turbo": "/home/vidhij2/nivi/test_docs/gpt-4-turbo_rag.json",
        "gpt-4-turbo_rag": "/home/vidhij2/nivi/test_docs/gpt-4-turbo_rag.json",
        "mixtral": "/home/vidhij2/nivi/test_docs/mixtral_rag.json",
        "mixtral_rag": "/home/vidhij2/nivi/test_docs/mixtral_rag.json",
        "llama": "/home/vidhij2/nivi/test_docs/llama_rag.json",
        "llama_rag": "/home/vidhij2/nivi/test_docs/llama_rag.json"
    }

    # Evaluate 20 samples per language per model with a single judge; list
    # further judge names in `judges_to_use` if you have access to them, and
    # pass sample_size=None to evaluate every sample.
    results = evaluate_maternal_healthcare_responses(
        json_files=json_files,
        output_dir="./maternal_healthcare_evaluation_new",
        languages=["english", "hindi", "assamese", "hinglish"],
        judges_to_use=["gemini-pro"],
        sample_size=20
    )


2025-04-10 03:15:52,353 - INFO - Loading responses from /home/vidhij2/nivi/test_docs/gpt-4-turbo_rag.json...
2025-04-10 03:15:52,355 - INFO - Loaded responses for gpt-4-turbo
2025-04-10 03:15:52,356 - INFO - Loading responses from /home/vidhij2/nivi/test_docs/gpt-4-turbo_rag.json...
2025-04-10 03:15:52,357 - INFO - Loaded responses for gpt-4-turbo_rag
2025-04-10 03:15:52,357 - INFO - Loading responses from /home/vidhij2/nivi/test_docs/mixtral_rag.json...
2025-04-10 03:15:52,358 - INFO - Loaded responses for mixtral
2025-04-10 03:15:52,359 - INFO - Loading responses from /home/vidhij2/nivi/test_docs/mixtral_rag.json...
2025-04-10 03:15:52,360 - INFO - Loaded responses for mixtral_rag
2025-04-10 03:15:52,360 - INFO - Loading responses from /home/vidhij2/nivi/test_docs/llama_rag.json...
2025-04-10 03:15:52,361 - INFO - Loaded responses for llama
2025-04-10 03:15:52,362 - INFO - Loading responses from /home/vidhij2/nivi/test_docs/llama_rag.json...
2025-04-10 03:15:52,363 - INFO - Loaded re

In [9]:
import os
# Read the key from the environment rather than embedding a secret in the
# notebook; the value is None when the variable is not set.  Any key that was
# ever committed in source form should be revoked and regenerated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

In [6]:
# Report only whether the key is configured; never echo the secret itself
# into the notebook output.
print("GEMINI_API_KEY is set" if os.getenv("GEMINI_API_KEY") else "GEMINI_API_KEY is not set")

None


In [12]:
pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.4-py3-none-any.whl.metadata (4.2 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.166.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client->google-generativeai)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client->google-generativeai)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client->google-generativeai)
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl.metadata (2.9 kB)
Downloading google_generativeai-0.8.4-py3-none-any.whl (175 kB)
Downloading google_ai_generative

In [9]:
results

{'model_evaluations': {'gpt-4': {'gpt-4-turbo': {'english': {'evaluations': [{'correctness': {'score': 3,
        'justification': 'Failed to parse structured response'},
       'completeness': {'score': 3,
        'justification': 'Failed to parse structured response'},
       'clarity': {'score': 3,
        'justification': 'Failed to parse structured response'},
       'cultural_appropriateness': {'score': 3,
        'justification': 'Failed to parse structured response'},
       'question': 'My baby weight is 452 gram in 22weeks..is this normal',
       'response': "Based on the information provided, I don't have specific guidelines regarding fetal weight at 22 weeks to determine if 452 grams is within a normal range. It's important to discuss this with your healthcare provider, who can assess the growth of your baby in the context of overall health and development during your pregnancy. They can provide you with the best advice and care. Remember, regular prenatal check-ups are cr