<a href="https://colab.research.google.com/github/wjleece/rag-experimentation-framework/blob/main/RAG_Experimentation_Framework_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

If you use this code, please cite:

@misc{leece2024rag,
  title = {RAG Experimentation Framework},

  author = {Bill Leece},

  year = {2024}
}

# Setup

In [None]:
# Install the notebook's dependencies into the current runtime.
# Every command upgrades (-U) a single package to its latest release; no versions are pinned,
# so results may change when upstream packages release new versions.

# Hugging Face / PyTorch stack (transformers and torch are imported directly below).
!pip install -U transformers --quiet
#!pip install -U optimum --quiet
!pip install -U accelerate  --quiet
!pip install -U bitsandbytes  --quiet
!pip install -U torch --quiet
!pip install -U sentencepiece --quiet
# LlamaIndex packages (document loading, semantic chunking, Mistral embeddings).
!pip install -U llama-index --quiet
!pip install -U llama-index-llms-mistralai --quiet
!pip install -U llama-index-embeddings-mistralai --quiet
!pip install -U llama-index-llms-langchain --quiet
# LangChain packages (prompt templates and LLM wrappers).
!pip install -U langchain --quiet
!pip install -U langchain-community --quiet
!pip install -U langchain-mistralai --quiet
!pip install -U langchain_huggingface --quiet
# FAISS provides the vector index used for cosine search.
# NOTE(review): confirm that a faiss-gpu wheel is still available for the runtime's Python version.
!pip install -U faiss-gpu --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.4/906.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import json
import numpy as np
import faiss
import transformers
import torch
import gc
from google.colab import drive, userdata
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_mistralai.chat_models import ChatMistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SemanticSplitterNodeParser
import time
from typing import List, Dict, Tuple
from contextlib import contextmanager
from langchain.schema.runnable import RunnableSequence
from langchain.schema.output_parser import StrOutputParser


In [None]:
# Copy the Hugging Face and Mistral credentials from Colab's secret store
# (google.colab.userdata) into environment variables for this session.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
os.environ["MISTRAL_API_KEY"] = userdata.get('MISTRAL_API_KEY')

In [None]:
# Prefer a CUDA GPU when one is attached (quantized models need it); otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Show the attached GPU, the driver/CUDA versions and the current GPU memory usage.
!nvidia-smi

Wed Nov 13 20:01:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   34C    P8              12W /  72W |      4MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Experiment Configurations

In [None]:
# Experiment configuration: which LLMs to compare, which semantic-chunking thresholds to
# sweep, which questions to ask, and where results are saved.
# Commented-out model entries are alternative candidates; uncomment an entry to include it.
MODEL_CONFIGS = {
    "models": [
        # {
        #     "name": "open-mixtral-8x7b",
        #     "type": "mistral_api",
        #     "tokenizer": None,  # Not needed for API models
        # },
        {
            "name": "open-mistral-nemo",
            "type": "mistral_api",
            "tokenizer": None,  # Not needed for API models
        },
        # {
        #     "name": "ministral-8b-latest",
        #     "type": "mistral_api",
        #     "tokenizer": None,  # Not needed for API models
        # },
        # {
        #     "name": "wjleece/quantized-mistral-7b",
        #     "type": "huggingface_quantized",
        #     "tokenizer": "mistralai/Mixtral-8x7B-v0.1",  # The same tokenizer that works on the base model will work on the quantized model - there is no 'quantized tokenizer'
        #     "quantization_config": {  # Quantization config left here as a reference, but not used in the code (as we're using an already quantized model from HuggingFace)
        #         "load_in_4bit": True,
        #         "bnb_4bit_compute_dtype": "float16",
        #         "bnb_4bit_quant_type": "nf4",
        #         "bnb_4bit_use_double_quant": False
        #     }
        # },
        {
            "name": "wjleece/quantized-mistral-nemo-12b",
            "type": "huggingface_quantized",
            "tokenizer": "mistralai/Mistral-Nemo-Instruct-2407",  # The same tokenizer that works on the base model will work on the quantized model - there is no 'quantized tokenizer'
            "quantization_config": {  # Quantization config left here as a reference, but not used in the code (as we're using an already quantized model from HuggingFace)
                "load_in_4bit": True,
                "bnb_4bit_compute_dtype": "float16",
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_use_double_quant": False
            }
        },
        # {
        #     "name": "wjleece/quantized-mistral-8b",
        #     "type": "huggingface_quantized",
        #     "tokenizer": "mistralai/Ministral-8B-Instruct-2410",  # The same tokenizer that works on the base model will work on the quantized model - there is no 'quantized tokenizer'
        #     "quantization_config": {  # Quantization config left here as a reference, but not used in the code (as we're using an already quantized model from HuggingFace)
        #         "load_in_4bit": True,
        #         "bnb_4bit_compute_dtype": "float16",
        #         "bnb_4bit_quant_type": "nf4",
        #         "bnb_4bit_use_double_quant": False
        #     }
        # },
    ],
    # RAG semantic chunking thresholds (higher thresholds --> fewer RAG chunks created)
    "thresholds": [95],
}

# These questions should relate to the RAG document --> these are your 'business use cases'
QUESTION_CONFIGS = {
    "questions": [
        "What were cloud revenues in Q2 2024?",
        "What were the main drivers of revenue growth in Q2?",
        "How much did YouTube ad revenues grow in Q2 in APAC?",
        "Can you summarize recent key antitrust matters?",
        "What were YouTube ad revenues in Q2?"
    ]
}

# Directory (on the mounted Google Drive) where analysis output is saved.
FILE_CONFIGS = {
    "save_directory": '/content/drive/My Drive/AI/Model_Analysis'
}

# Load RAG Document

In [None]:
# Mount Google Drive so the source PDF is reachable from the Colab filesystem.
drive.mount('/content/drive')
# Read the PDF with LlamaIndex's SimpleDirectoryReader. `documents` is a notebook-level
# global: MemoryOptimizedExperimentConfig.run_experiment reads it when chunking.
documents = SimpleDirectoryReader(input_files=["/content/drive/My Drive/AI/Datasets/Google-10-q/goog-10-q-q2-2024.pdf"]).load_data()

Mounted at /content/drive


# RAG Pipeline Class

In [None]:
class RAGPipeline:
    def __init__(self):
        self.chunk_cache = {}
        self.embedding_cache = {}
        self.embedding_model = None

    def initialize_embedding_model(self):
        """Initialize the embedding model if not already initialized"""
        if self.embedding_model is None:
            mistral_api_key = userdata.get('MISTRAL_API_KEY')
            self.embedding_model = MistralAIEmbedding(
                model_name="mistral-embed",
                api_key=mistral_api_key
            )
        return self.embedding_model

    def create_semantic_chunks(self, documents: List, threshold: int) -> Dict:
        """Create or retrieve semantic chunks with memory optimization"""
        if self.embedding_model is None:
            self.initialize_embedding_model()

        if threshold not in self.chunk_cache:
            print(f"Creating new semantic chunks for threshold {threshold}")

            # Clear other thresholds from cache if memory pressure is high
            if len(self.chunk_cache) > 2:  # Keep only 2 thresholds in memory
                oldest_threshold = min(self.chunk_cache.keys())
                if oldest_threshold != threshold:
                    del self.chunk_cache[oldest_threshold]
                    if oldest_threshold in self.embedding_cache:
                        del self.embedding_cache[oldest_threshold]
                    gc.collect()

            splitter = SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=threshold,
                embed_model=self.embedding_model
            )

            nodes = splitter.get_nodes_from_documents(documents)
            texts = [node.text for node in nodes]

            self.chunk_cache[threshold] = {
                'texts': texts
            }

        return self.chunk_cache[threshold]

    def run_cosine_search(self, query: str, threshold: int, k=5) -> List[Dict]:
        """Run cosine similarity search with memory optimization"""
        if self.embedding_model is None:
            self.initialize_embedding_model()

        if threshold not in self.embedding_cache:
            texts = self.chunk_cache[threshold]['texts']

            # Generate embeddings in batches to reduce memory usage
            batch_size = 32
            embeddings = []

            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]
                batch_embeddings = [self.embedding_model.get_text_embedding(text)
                                  for text in batch_texts]
                embeddings.extend(batch_embeddings)

                # Optional: Clear some memory after each batch
                if i % (batch_size * 4) == 0:
                    gc.collect()

            embeddings_array = np.array(embeddings).astype('float32')
            normalized_embeddings = embeddings_array / np.linalg.norm(embeddings_array, axis=1)[:, np.newaxis]

            dimension = embeddings_array.shape[1]
            cosine_index = faiss.IndexFlatIP(dimension)
            cosine_index.add(normalized_embeddings)

            self.embedding_cache[threshold] = {
                'embeddings': embeddings_array,
                'cosine_index': cosine_index
            }

        # Perform search
        query_vector = self.embedding_model.get_text_embedding(query)
        query_vector = np.array([query_vector]).astype('float32')
        query_normalized = query_vector / np.linalg.norm(query_vector)

        distances, indices = self.embedding_cache[threshold]['cosine_index'].search(
            query_normalized.reshape(1, -1).astype('float32'), k
        )

        return [
            {
                'text': self.chunk_cache[threshold]['texts'][idx],
                'distance': float(score)
            }
            for score, idx in zip(distances[0], indices[0])
        ]

    def generate_response(self, query: str, context_rag: list, model: Dict) -> dict:
        """Generate response using provided context"""
        try:
            context_texts = [doc['text'] for doc in context_rag]
            if not context_texts:
                return {"response_text": "No relevant context found.", "sources": []}

            context = "\n\n".join(context_texts)

            prompt = PromptTemplate(template="""
            Instructions:

            You are a helpful assistant who answers questions from context that has been provided to you.
            Given the context information, provide a direct and concise answer to the question: {query}

            Focus only on information present in the context. If you don't know the answer, say "I don't know."
            You must format your response as a JSON string object, starting with the word "LLM_Response:"

            Your answer to {query} will be a JSON string object that starts with "LLM_Response:" as shown below:

            LLM_Response:
            {{
                "response_text": "Your detailed answer here",
                "sources": [
                    "Copy and paste here the exact text segments from the context that you used to generate your answer. Include all relevant segments, verbatim."
                ]
            }}

            Important: In your response, the "sources" field must contain the exact text passages from the provided context that you used to formulate your answer. Copy these passages word-for-word.

            Do not include a hypothetical example in your answer, only include your final answer after "LLM_Response:"

            The context information that you will use for your answer is below:

            ---------------
            {context}
            ---------------
            """)

            model_type = model['type']

            llm = model['llm']

            # Create the chain using the | operator
            chain = prompt | llm | StrOutputParser()

            # Use invoke() with the input dictionary
            response = chain.invoke({
                "query": query,
                "context": context
               })

            # Extract the JSON part
            response_text = response.split("LLM_Response:")[-1].strip()

            # Try to parse as JSON
            try:
                if '{' in response_text and '}' in response_text:
                    json_str = response_text[response_text.find('{'):response_text.rfind('}')+1]
                    parsed_response = json.loads(json_str)
                    return {
                        "response_text": parsed_response.get("response_text", response_text),
                        "sources": parsed_response.get("sources", [])
                    }
                else:
                    return {
                        "response_text": response_text,
                        "sources": []
                    }
            except json.JSONDecodeError:
                return {
                    "response_text": response_text,
                    "sources": []
                }

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            return {"response_text": "An error occurred while generating the response.", "sources": []}


# Module-level slot for the single RAGPipeline shared by every experiment config.
# It starts empty: MemoryOptimizedExperimentConfig.__init__ creates the pipeline the first
# time it runs and reuses it afterwards. Re-running this cell resets the slot to None.
_GLOBAL_RAG_PIPELINE = None

# Experiment Class

In [None]:
class MemoryOptimizedExperimentConfig:
    def __init__(self,
                 models: List[Dict],
                 thresholds: List[int],
                 questions: List[str],
                 temperature: float):
        self.models = models
        self.thresholds = thresholds
        self.questions = questions
        self.temperature = temperature

        # Use global RAG pipeline
        global _GLOBAL_RAG_PIPELINE
        if _GLOBAL_RAG_PIPELINE is None:
            print("Initializing global RAG pipeline")
            _GLOBAL_RAG_PIPELINE = RAGPipeline()
        else:
            print("Using existing global RAG pipeline")
        self.rag_pipeline = _GLOBAL_RAG_PIPELINE

        self.current_model = None
        self.current_model_name = None

    @contextmanager
    def load_model(self, model_config: Dict):
        """Context manager for lazy loading and proper cleanup of models"""
        try:
            model_name = model_config["name"]
            model_type = model_config["type"]

            # Clear any existing model
            self.cleanup_current_model()

            if model_type == "mistral_api":
                mistral_api_key = userdata.get('MISTRAL_API_KEY')
                self.current_model = {
                    'llm': ChatMistralAI(
                        model=model_name,
                        temperature=self.temperature,
                        api_key=mistral_api_key
                    ),
                    'type': 'mistral_api'
                }
            else:  # huggingface_quantized
                print(f"Loading quantized model: {model_name}")

                # Empty CUDA cache before loading new model
                torch.cuda.empty_cache()
                gc.collect()

                tokenizer = AutoTokenizer.from_pretrained(
                    pretrained_model_name_or_path=model_config["tokenizer"],
                    trust_remote_code=True,
                    use_fast=True,
                    padding_side="left"
                )

                model = AutoModelForCausalLM.from_pretrained(
                    pretrained_model_name_or_path=model_name,
                    device_map="auto",
                    trust_remote_code=True,
                    torch_dtype=torch.float16,
                    use_cache=True,
                    low_cpu_mem_usage=True,
                )

                pipe = pipeline(
                    "text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=512,
                    temperature=self.temperature,
                    top_p=0.95,
                    top_k=50,
                    do_sample=True,
                    device_map="auto"
                )

                self.current_model = {
                    'llm': HuggingFacePipeline(pipeline=pipe),
                    'type': 'huggingface_quantized',
                    'model': model,  # Keep reference for cleanup
                    'pipe': pipe     # Keep reference for cleanup
                }

            self.current_model_name = model_name
            yield self.current_model

        finally:
            # Cleanup will happen in cleanup_current_model()
            pass

    def cleanup_current_model(self):
        """Clean up the current model and free memory"""
        if self.current_model is not None:
            if self.current_model['type'] == 'huggingface_quantized':
                # Delete model components explicitly
                del self.current_model['llm']
                del self.current_model['model']
                del self.current_model['pipe']

                # Clear CUDA cache
                torch.cuda.empty_cache()

                # Run garbage collection
                gc.collect()

            self.current_model = None
            self.current_model_name = None

    def run_experiment(self):
        """Run experiments with optimized memory management"""
        results = {
            "metadata": {
                "timestamp": time.strftime("%Y%m%d-%H%M%S"),
                "models_tested": [model["name"] for model in self.models],
                "thresholds_tested": self.thresholds,
                "temperature": self.temperature
            },
            "results": []
        }

        # Process each threshold
        for threshold in self.thresholds:
            print(f"\nProcessing threshold: {threshold}")
            self.rag_pipeline.create_semantic_chunks(documents, threshold)

            # Process each model one at a time
            for model_config in self.models:
                model_name = model_config["name"]
                print(f"\nTesting model: {model_name}")

                # Use context manager to handle model lifecycle
                with self.load_model(model_config) as model:
                    # Process all questions for this model and threshold
                    for question in self.questions:
                        print(f"Processing question: {question}")

                        context = self.rag_pipeline.run_cosine_search(
                            query=question,
                            threshold=threshold
                        )

                        answer = self.rag_pipeline.generate_response(
                            query=question,
                            context_rag=context,
                            model=model
                        )

                        results["results"].append({
                            "model": model_name,
                            "threshold": threshold,
                            "question": question,
                            "response": answer
                        })

        return results


# Evaluator Class

In [None]:
import openai
import json
import tiktoken
import textwrap
import time
from datetime import datetime
from typing import Dict, List, Any

class ExperimentEvaluator:
    def __init__(self, api_key: str):
        self.client = openai.OpenAI(api_key=api_key)
        self.encoder = tiktoken.encoding_for_model("gpt-4o")

    def _get_baseline_answers(self, questions: List[str], source_doc: str) -> Dict[str, str]:
        """Get GPT-4o's own answers to the questions as baseline"""
        baseline_prompt = f"""Source Document:
        {source_doc}

        Using only the information from the source document above, answer these questions.
        Format your response as a valid JSON object with questions as keys and answers as values.
        Keep answers concise and factual.

        Questions to answer:
        {json.dumps(questions, indent=2)}

        Response format example:
        {{
            "Question 1": "Answer 1",
            "Question 2": "Answer 2"
        }}"""

        try:
            print("\n--- Getting Baseline Answers ---")
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that provides JSON-formatted answers based on source documents."},
                    {"role": "user", "content": baseline_prompt}
                ],
                temperature=0.1
            )

            content = response.choices[0].message.content
            print("Baseline response received:", content[:200] + "...")

            if '{' in content and '}' in content:
                json_str = content[content.find('{'):content.rfind('}')+1]
                return json.loads(json_str)
            else:
                print("Warning: No JSON structure found in GPT-4o's baseline response")
                return {"error": "No JSON structure found", "questions": questions}

        except json.JSONDecodeError as e:
            print(f"Warning: Could not parse GPT-4o's baseline answers as JSON: {str(e)}")
            return {"error": "JSON parsing failed", "questions": questions}
        except Exception as e:
            print(f"Warning: Error getting baseline answers: {str(e)}")
            return {"error": str(e), "questions": questions}

    def evaluate_experiments(self, experiment_results: Dict, source_doc: str) -> Dict:
        """Evaluate experiment results using GPT-4o"""
        try:
            print("\n=== Starting Evaluation Process ===")
            questions = list(set(result["question"] for result in experiment_results["results"]))
            print(f"Number of unique questions to evaluate: {len(questions)}")

            # Get unique model/threshold combinations
            model_threshold_pairs = set((result["model"], result["threshold"])
                                     for result in experiment_results["results"])
            print(f"Number of model/threshold combinations: {len(model_threshold_pairs)}")

            # Get baseline answers for comparison
            baseline_answers = self._get_baseline_answers(questions, source_doc)
            print("Baseline answers received")

            # Evaluate each model/threshold/question combination separately
            all_evaluations = []

            for model, threshold in model_threshold_pairs:
                print(f"\nEvaluating model: {model}, threshold: {threshold}")
                relevant_results = [r for r in experiment_results["results"]
                                  if r["model"] == model and r["threshold"] == threshold]

                for result in relevant_results:
                    evaluation_prompt = f"""Evaluate this specific response:

                    Question: {result["question"]}
                    Baseline Answer: {baseline_answers.get(result["question"], "No baseline available")}
                    Model: {result["model"]}
                    Threshold: {result["threshold"]}
                    Response: {json.dumps(result["response"], indent=2)}

                    Score the response on these criteria (0-100):
                    - Accuracy: How well does it match the baseline/source
                    - Conciseness: Clear, direct answer without extra information
                    - Source Attribution: Uses relevant source text as evidence
                    - Reasonableness: Answer is properly contextualized

                    Provide your evaluation in this exact JSON format:
                    {{
                        "model": "{result["model"]}",
                        "threshold": {result["threshold"]},
                        "question": "{result["question"]}",
                        "scores": {{
                            "accuracy": <score>,
                            "conciseness": <score>,
                            "source_attribution": <score>,
                            "reasonableness": <score>
                        }},
                        "composite_score": <average of scores>,
                        "explanation": "detailed explanation"
                    }}"""

                    try:
                        response = self.client.chat.completions.create(
                            model="gpt-4o",
                            messages=[
                                {"role": "system", "content": "You are an expert at evaluating LLM responses for accuracy and quality."},
                                {"role": "user", "content": evaluation_prompt}
                            ],
                            temperature=0.7,
                            max_tokens=1000
                        )

                        content = response.choices[0].message.content
                        print(f"\nEvaluating {model}/{threshold}/{result['question']}")
                        print("Raw response:", content[:200] + "...")

                        if '{' in content and '}' in content:
                            json_str = content[content.find('{'):content.rfind('}')+1]
                            evaluation = json.loads(json_str)
                            all_evaluations.append(evaluation)
                        else:
                            print(f"No JSON found in response for {model}/{threshold}/{result['question']}")

                    except Exception as e:
                        print(f"Error evaluating {model}/{threshold}/{result['question']}: {str(e)}")

            # Create final evaluation structure
            final_evaluation = {
                "metadata": {
                    "timestamp": datetime.now().isoformat(),
                    "model_used": "gpt-4o",
                    "num_permutations_evaluated": len(experiment_results["results"]),
                    "num_questions_evaluated": len(questions),
                    "evaluation_status": "success" if all_evaluations else "failed"
                },
                "evaluations": all_evaluations,
                "summary": self._generate_summary(all_evaluations)
            }

            return final_evaluation

        except Exception as e:
            print(f"\nCritical error in evaluation process: {str(e)}")
            return self._create_default_evaluation(experiment_results)

    def _generate_summary(self, evaluations: List[Dict]) -> Dict:
        """Generate summary statistics from evaluations"""
        if not evaluations:
            return {
                "overall_performance": "No evaluations available",
                "optimal_permutation": "Not available",
                "performance_analysis": "Evaluation process failed"
            }

        # Calculate average scores by model/threshold
        model_scores = {}
        for eval in evaluations:
            key = (eval["model"], eval["threshold"])
            if key not in model_scores:
                model_scores[key] = {
                    "count": 0,
                    "total_accuracy": 0,
                    "total_conciseness": 0,
                    "total_source_attribution": 0,
                    "total_reasonableness": 0,
                    "total_composite": 0
                }

            scores = model_scores[key]
            scores["count"] += 1
            scores["total_accuracy"] += eval["scores"]["accuracy"]
            scores["total_conciseness"] += eval["scores"]["conciseness"]
            scores["total_source_attribution"] += eval["scores"]["source_attribution"]
            scores["total_reasonableness"] += eval["scores"]["reasonableness"]
            scores["total_composite"] += eval["composite_score"]

        # Find best performing model/threshold
        best_score = 0
        best_model = None
        best_threshold = None

        for (model, threshold), scores in model_scores.items():
            avg_composite = scores["total_composite"] / scores["count"]
            if avg_composite > best_score:
                best_score = avg_composite
                best_model = model
                best_threshold = threshold

        return {
            "overall_performance": f"Average composite score across all evaluations: {sum(e['composite_score'] for e in evaluations)/len(evaluations):.2f}/100",
            "optimal_permutation": f"Best performance: {best_model} with threshold {best_threshold} (score: {best_score:.2f}/100)",
            "performance_analysis": f"Evaluated {len(evaluations)} responses across {len(model_scores)} model/threshold combinations."
        }

    def _create_default_evaluation(self, experiment_results: Dict) -> Dict:
        """Create a default evaluation structure when parsing fails"""
        print("\n--- Creating Default Evaluation Due to Failure ---")
        default_eval = {
            "metadata": {
                "timestamp": datetime.now().isoformat(),
                "model_used": "gpt-4o",
                "num_permutations_evaluated": len(experiment_results["results"]),
                "num_questions_evaluated": len(set(r["question"] for r in experiment_results["results"])),
                "evaluation_status": "failed"
            },
            "evaluations": [],
            "summary": {
                "overall_performance": "Evaluation failed - using default structure",
                "optimal_permutation": "Not available",
                "performance_analysis": "Evaluation process encountered errors"
            }
        }

        for result in experiment_results["results"]:
            default_eval["evaluations"].append({
                "model": result["model"],
                "threshold": result["threshold"],
                "question": result["question"],
                "scores": {
                    "accuracy": 0,
                    "conciseness": 0,
                    "source_attribution": 0,
                    "reasonableness": 0
                },
                "composite_score": 0,
                "explanation": "Evaluation failed - default scores assigned"
            })

        print("Created default evaluation with", len(default_eval["evaluations"]), "empty evaluations")
        return default_eval

    def format_and_save_results(self, experiment_results: Dict, evaluation_results: Dict, save_dir: str):
        """Reshape experiment and evaluation results and write both to JSON files.

        The experiment results are flattened to model/threshold/question/response
        records. The evaluations are grouped by model, then by threshold, and
        per-threshold average scores are computed. Two files are written to
        ``save_dir``: ``experiment_results_<timestamp>.json`` and
        ``evaluation_results_<timestamp>.json``. ``save_dir`` is created if it
        does not exist.

        Args:
            experiment_results: Dict with a ``"results"`` list (each item having
                ``"model"``, ``"threshold"``, ``"question"`` and a ``"response"``
                dict) and optionally ``"metadata"``.
            evaluation_results: Dict with ``"metadata"`` and optionally
                ``"evaluations"`` and ``"summary"``. If it is not a dict or lacks
                ``"metadata"``, a default (all-zero) evaluation is substituted.
            save_dir: Directory the two JSON files are written to.

        Returns:
            ``(formatted_experiment, formatted_evaluation)`` on success. If
            anything raises, the error is printed and
            ``(experiment_results, default_evaluation)`` is returned instead,
            i.e. the experiment results come back unformatted.
        """
        import os  # function-scope import: keeps this method independent of the notebook's import cell

        print("\n=== Starting Results Formatting ===")
        try:
            # Check the type BEFORE calling dict methods; otherwise a non-dict
            # input raises and this recovery branch is never reached.
            if not isinstance(evaluation_results, dict):
                print("Warning: evaluation_results is not a dictionary")
                evaluation_results = self._create_default_evaluation(experiment_results)

            print("Input evaluation_results keys:", list(evaluation_results.keys()))

            if "metadata" not in evaluation_results:
                print("Warning: metadata missing from evaluation_results")
                evaluation_results = self._create_default_evaluation(experiment_results)

            # Format experiment results
            formatted_experiment = {
                "metadata": experiment_results.get("metadata", {}),
                "results": [
                    {
                        "model": result["model"],
                        "threshold": result["threshold"],
                        "question": result["question"],
                        "response": {
                            "answer": result["response"].get("response_text", ""),
                            "sources": result["response"].get("sources", [])
                        }
                    }
                    for result in experiment_results["results"]
                ]
            }

            # Group evaluations by model, then by threshold.
            model_evaluations = {}
            for entry in evaluation_results.get("evaluations", []):
                thresholds = model_evaluations.setdefault(entry["model"], {"thresholds": {}})["thresholds"]
                bucket = thresholds.setdefault(entry["threshold"], {"questions": [], "average_scores": {}})
                bucket["questions"].append({
                    "question": entry["question"],
                    "scores": entry["scores"],
                    "composite_score": entry.get("composite_score", 0),
                    "explanation": entry.get("explanation", "")
                })

            # Compute the averages once per bucket, after all questions are in,
            # instead of recomputing them after every appended evaluation.
            score_names = ("accuracy", "conciseness", "source_attribution", "reasonableness")
            for model_entry in model_evaluations.values():
                for bucket in model_entry["thresholds"].values():
                    questions = bucket["questions"]  # never empty: buckets are created on first append
                    averages = {
                        name: sum(q["scores"][name] for q in questions) / len(questions)
                        for name in score_names
                    }
                    averages["composite"] = sum(q["composite_score"] for q in questions) / len(questions)
                    bucket["average_scores"] = averages

            formatted_evaluation = {
                "metadata": evaluation_results["metadata"],
                "model_evaluations": model_evaluations,
                "overall_summary": evaluation_results.get("summary", {})
            }

            # Save formatted results
            if save_dir:
                # Without this, a missing directory makes open() fail and the
                # broad except below silently discards the formatted output.
                os.makedirs(save_dir, exist_ok=True)

            timestamp = time.strftime("%Y%m%d-%H%M%S")

            experiment_file = f"{save_dir}/experiment_results_{timestamp}.json"
            evaluation_file = f"{save_dir}/evaluation_results_{timestamp}.json"

            with open(experiment_file, 'w', encoding='utf-8') as f:
                json.dump(formatted_experiment, f, indent=2, ensure_ascii=False)

            # Note: json.dump writes non-string dict keys such as numeric
            # thresholds as strings; the returned dict keeps the original keys.
            with open(evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(formatted_evaluation, f, indent=2, ensure_ascii=False)

            print("\n=== Results Formatting Complete ===")
            return formatted_experiment, formatted_evaluation

        except Exception as e:
            # Best-effort fallback: report the problem and return a default
            # evaluation so the caller still receives a two-tuple.
            print(f"\nError in formatting and saving results: {str(e)}")
            default_eval = self._create_default_evaluation(experiment_results)
            return experiment_results, default_eval

    def evaluate_and_format(self, experiment_results: Dict, source_doc: str) -> Dict:
        """Evaluate the experiment results, print a report, and return the evaluation.

        Args:
            experiment_results: Experiment output, passed through to
                ``evaluate_experiments``.
            source_doc: Source document text, passed through to
                ``evaluate_experiments``.

        Returns:
            Whatever ``evaluate_experiments`` returned, after it has been shown
            via ``display_results``.
        """
        for banner in ("\n=== Starting evaluate_and_format ===", "Step 1: Running evaluation"):
            print(banner)
        evaluation_results = self.evaluate_experiments(experiment_results, source_doc)

        print("\nStep 2: Running display_results")
        self.display_results(evaluation_results)

        print("\nStep 3: Returning evaluation")
        return evaluation_results


    def display_results(self, evaluation_results: Dict = None):
        """Print a human-readable report of evaluation results to stdout.

        The report has up to three sections, each printed only when the
        corresponding key is present: per-question scores grouped by model and
        threshold (``"evaluations"``), the overall analysis (``"summary"``) and
        run ``"metadata"``.

        Args:
            evaluation_results: Evaluation dict. ``None`` (the default) or any
                non-dict value prints a short notice instead of a report.

        Returns:
            None. Errors while printing are reported on stdout, not raised.
        """
        # The parameter defaults to None; handle that explicitly instead of
        # letting it surface as a confusing generic error below.
        if not isinstance(evaluation_results, dict):
            print("No evaluation results to display.")
            return

        try:
            results = evaluation_results

            print("\n" + "="*80)
            print("MODEL EVALUATION RESULTS")
            print("="*80 + "\n")

            if "evaluations" in results:
                print("DETAILED MODEL PERFORMANCE")
                print("-"*80)
                current_model = None
                current_threshold = None

                # Sort evaluations by model, threshold, then question
                sorted_evaluations = sorted(
                    results["evaluations"],
                    key=lambda x: (x["model"], x["threshold"], x["question"])
                )

                for entry in sorted_evaluations:
                    # Print model header if it's a new model
                    if entry["model"] != current_model:
                        current_model = entry["model"]
                        print(f"\nModel: {current_model}")
                        # Force the threshold header to reprint under the new model.
                        current_threshold = None

                    # Print threshold header if it's a new threshold
                    if entry["threshold"] != current_threshold:
                        current_threshold = entry["threshold"]
                        print(f"\nThreshold: {current_threshold}")
                        print("─"*40)

                    # Print evaluation details
                    scores = entry["scores"]
                    print(f"\nQuestion: {entry['question']}")
                    print(f"Accuracy Score:          {scores['accuracy']:>3}/100")
                    print(f"Conciseness Score:       {scores['conciseness']:>3}/100")
                    print(f"Source Attribution:      {scores['source_attribution']:>3}/100")
                    print(f"Reasonableness Score:    {scores['reasonableness']:>3}/100")
                    # composite_score and explanation are treated as optional
                    # (same defaults as format_and_save_results) so a single
                    # incomplete entry does not abort the rest of the report.
                    print(f"Final Composite Score:   {entry.get('composite_score', 0):>3}/100")
                    print("\nExplanation:")
                    print(textwrap.fill(entry.get('explanation', ''), width=80))

            # Print summary section
            print("\n" + "="*80)
            print("OVERALL ANALYSIS")
            print("="*80)

            if "summary" in results:
                print("\nPerformance Summary:")
                print("-"*80)
                print(textwrap.fill(results["summary"]["overall_performance"], width=80))

                print("\nOptimal Configuration:")
                print("-"*80)
                print(textwrap.fill(results["summary"]["optimal_permutation"], width=80))

                print("\nPerformance Analysis:")
                print("-"*80)
                print(textwrap.fill(results["summary"]["performance_analysis"], width=80))

            # Print metadata
            if "metadata" in results:
                print("\n" + "="*80)
                print("METADATA")
                print("="*80)
                print(f"Timestamp:           {results['metadata']['timestamp']}")
                print(f"Model Used:          {results['metadata']['model_used']}")
                print(f"Permutations:        {results['metadata']['num_permutations_evaluated']}")
                print(f"Questions Evaluated: {results['metadata']['num_questions_evaluated']}")
                print(f"Evaluation Status:   {results['metadata']['evaluation_status']}")

        # No JSON is parsed in the block above, so only these two handlers
        # can ever fire.
        except KeyError as e:
            print("Error accessing result data:", e)
        except Exception as e:
            print(f"Error displaying results: {str(e)}")

#Main

In [None]:
def main():
    """Run the experiment, evaluate the responses, and save the formatted results.

    Relies on names defined in earlier notebook cells: ``_GLOBAL_RAG_PIPELINE``,
    ``MemoryOptimizedExperimentConfig``, ``MODEL_CONFIGS``, ``QUESTION_CONFIGS``,
    ``FILE_CONFIGS``, ``documents``, ``userdata`` and ``ExperimentEvaluator``.

    Returns:
        The two-tuple returned by ``ExperimentEvaluator.format_and_save_results``:
        the formatted experiment results and the formatted evaluation.
    """
    # Read-only access to the module-level pipeline, so no `global` statement
    # is needed.
    if _GLOBAL_RAG_PIPELINE is not None:
        print("Existing RAG pipeline and associated document chunks found. Preserving cached chunks...")

    # Single temperature setting for all models
    GLOBAL_TEMPERATURE = 0.3

    config = MemoryOptimizedExperimentConfig(
        models=MODEL_CONFIGS["models"],
        thresholds=MODEL_CONFIGS["thresholds"],
        questions=QUESTION_CONFIGS["questions"],
        temperature=GLOBAL_TEMPERATURE
    )

    print("Starting experiment with configurations:")
    print(f"Global temperature: {GLOBAL_TEMPERATURE}")
    print(f"Models: {[model['name'] for model in config.models]}")
    print(f"Thresholds: {config.thresholds}")
    print(f"Number of questions: {len(config.questions)}")

    # Run the experiment
    results = config.run_experiment()

    # Get source document text from the global documents variable.
    # Only the first loaded document is used as the evaluation reference.
    source_doc = documents[0].text  # documents is loaded at the start of this script

    # Initialize the evaluator
    print("\nInitializing GPT-4 evaluation...")
    # NOTE(review): `userdata` is presumably Colab's secrets store - confirm.
    openai_api_key = userdata.get('OPENAI_API_KEY')
    evaluator = ExperimentEvaluator(openai_api_key)

    # Run evaluation and get intermediate formatted results
    evaluation = evaluator.evaluate_and_format(results, source_doc)

    # Format and save final results
    formatted_results, formatted_evaluation = evaluator.format_and_save_results(
        results,
        evaluation,
        FILE_CONFIGS['save_directory']
    )

    # The evaluator swallows its own errors and returns a default evaluation
    # flagged "failed", so check that flag rather than always claiming success.
    metadata = formatted_evaluation.get("metadata") if isinstance(formatted_evaluation, dict) else None
    status = metadata.get("evaluation_status") if isinstance(metadata, dict) else None
    if status == "failed":
        print("\nExperiment completed, but the evaluation failed - default scores were returned")
    else:
        print("\nExperiment and evaluation completed successfully")
    return formatted_results, formatted_evaluation


# Entry point: run the full pipeline and keep both return values of main()
# bound to the names `results` and `evaluation` at this scope.
if __name__ == "__main__":
    results, evaluation = main()

Initializing global RAG pipeline
Starting experiment with configurations:
Global temperature: 0.3
Models: ['open-mistral-nemo', 'wjleece/quantized-mistral-nemo-12b']
Thresholds: [95]
Number of questions: 5

Processing threshold: 95
Creating new semantic chunks for threshold 95

Testing model: open-mistral-nemo
Processing question: What were cloud revenues in Q2 2024?
Processing question: What were the main drivers of revenue growth in Q2?
Processing question: How much did YouTube ad revenues grow in Q2 in APAC?
Processing question: Can you summarize recent key antitrust matters?
Processing question: What were YouTube ad revenues in Q2?

Testing model: wjleece/quantized-mistral-nemo-12b
Loading quantized model: wjleece/quantized-mistral-nemo-12b


tokenizer_config.json:   0%|          | 0.00/181k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors.index.json:   0%|          | 0.00/111k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.84G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Processing question: What were cloud revenues in Q2 2024?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing question: What were the main drivers of revenue growth in Q2?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing question: How much did YouTube ad revenues grow in Q2 in APAC?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing question: Can you summarize recent key antitrust matters?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing question: What were YouTube ad revenues in Q2?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Initializing GPT-4 evaluation...

=== Starting evaluate_and_format ===
Step 1: Running evaluation

=== Starting Evaluation Process ===
Number of unique questions to evaluate: 5
Number of model/threshold combinations: 2

--- Getting Baseline Answers ---
Baseline response received: ```json
{
    "How much did YouTube ad revenues grow in Q2 in APAC?": "Information not provided in the document.",
    "What were the main drivers of revenue growth in Q2?": "Information not provided ...
Baseline answers received

Evaluating model: open-mistral-nemo, threshold: 95

Evaluating open-mistral-nemo/95/What were cloud revenues in Q2 2024?
Raw response: ```json
{
    "model": "open-mistral-nemo",
    "threshold": 95,
    "question": "What were cloud revenues in Q2 2024?",
    "scores": {
        "accuracy": 100,
        "conciseness": 100,
        "s...

Evaluating open-mistral-nemo/95/What were the main drivers of revenue growth in Q2?
Raw response: ```json
{
    "model": "open-mistral-nemo",
    "