<a href="https://colab.research.google.com/github/wjleece/rag-experimentation-framework/blob/main/rag_experimentation_framework_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

If you use this code, please cite:

@misc{leece2024rag,
  title = {RAG Experimentation Framework},

  author = {Bill Leece},

  year = {2024}
}

# Setup

In [None]:
# Install/upgrade the packages imported by the cells below (notebook shell magics).
!pip install -U transformers --quiet
#!pip install -U optimum --quiet
!pip install -U accelerate  --quiet
!pip install -U bitsandbytes  --quiet
!pip install -U torch --quiet
!pip install -U sentencepiece --quiet
!pip install -U llama-index --quiet
!pip install -U llama-index-llms-mistralai --quiet
!pip install -U llama-index-embeddings-mistralai --quiet
!pip install -U llama-index-llms-langchain --quiet
!pip install -U langchain --quiet
!pip install -U langchain-community --quiet
!pip install -U langchain-mistralai --quiet
!pip install -U langchain_huggingface --quiet
!pip install -U faiss-gpu --quiet

In [None]:
import os
import json
import numpy as np
import faiss
import transformers
import torch
import gc
import openai
import json
import tiktoken
import textwrap
import time
from google.colab import drive, userdata
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_mistralai.chat_models import ChatMistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SemanticSplitterNodeParser
import time
from typing import List, Dict, Tuple
from contextlib import contextmanager
from langchain.schema.runnable import RunnableSequence
from langchain.schema.output_parser import StrOutputParser
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datetime import datetime
from typing import Dict, List, Any

In [None]:
# Read API credentials from the Colab secret store (`userdata`); the Hugging Face
# and Mistral tokens are also exported as environment variables.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
os.environ["MISTRAL_API_KEY"] = userdata.get('MISTRAL_API_KEY')
# NOTE(review): `api_key` (OpenAI) is not referenced by the code visible in this
# section - confirm it is consumed further down.
api_key = userdata.get('OPENAI_API_KEY')

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # 'cuda' when torch reports a usable GPU, otherwise 'cpu'

# Experiment Configurations

In [None]:
# Setup configurations
#
# Models to evaluate. Each entry provides:
#   "name"      - model identifier (Mistral API model name or Hugging Face repo id)
#   "type"      - "mistral_api" or "huggingface"; selects the loader in ModelConfig.load_model
#   "tokenizer" - Hugging Face tokenizer repo id (None for API models)
# Commented-out entries are alternative models kept for reference; uncomment to include them.
MODEL_CONFIGS = {
    "models": [
    #    {
    #        "name": "open-mixtral-8x7b",
    #        "type": "mistral_api",
    #        "tokenizer": None,  # Not needed for API models
    #    },

          {
            "name": "mistral-large-latest",
            "type": "mistral_api",
            "tokenizer": None,  # Not needed for API models
         },

         {
            "name": "open-mistral-nemo",
            "type": "mistral_api",
            "tokenizer": None,  # Not needed for API models
        },
#        {
#            "name": "ministral-8b-latest",
#            "type": "mistral_api",
#            "tokenizer": None,  # Not needed for API models
#        },
 #       {
 #             "name": "meta-llama/Llama-3.1-8B-Instruct",
 #             "type": "huggingface",
 #             "tokenizer": "meta-llama/Llama-3.1-8B-Instruct"
 #       },

   #   {
   #         "name": "wjleece/quantized-mistral-7b",
   #         "type": "huggingface",
   #         "tokenizer": "mistralai/Mixtral-8x7B-v0.1",  # The same tokenizer that works on the base model will work on the quantized model - there is no 'quantized tokenizer'
   #          "quantization_config": {                    #Quantization config left here as a reference, but not used in the code (as we're using an already quantized model from HuggingFace)
   #             "load_in_4bit": True,
   #             "bnb_4bit_compute_dtype": "float16",
   #             "bnb_4bit_quant_type": "nf4",
   #             "bnb_4bit_use_double_quant": False
   #         }
   #     },
      {
              "name": "wjleece/quantized-mistral-nemo-12b",
              "type": "huggingface",
              "tokenizer": "mistralai/Mistral-Nemo-Instruct-2407",  # The same tokenizer that works on the base model will work on the quantized model - there is no 'quantized tokenizer'
              "quantization_config": {                    #Quantization config left here as a reference, but not used in the code (as we're using an already quantized model from HuggingFace)
                  "load_in_4bit": True,
                  "bnb_4bit_compute_dtype": "float16",
                  "bnb_4bit_quant_type": "nf4",
                  "bnb_4bit_use_double_quant": False
             }
          },
   #    {
   #           "name": "wjleece/quantized-mistral-8b",
   #           "type": "huggingface",
   #           "tokenizer": "mistralai/Ministral-8B-Instruct-2410",  # The same tokenizer that works on the base model will work on the quantized model - there is no 'quantized tokenizer'
   #           "quantization_config": {                    #Quantization config left here as a reference, but not used in the code (as we're using an already quantized model from HuggingFace)
   #               "load_in_4bit": True,
   #               "bnb_4bit_compute_dtype": "float16",
   #               "bnb_4bit_quant_type": "nf4",
   #              "bnb_4bit_use_double_quant": False
   #           }
   #       },
   #     {
   #           "name": "wjleece/quantized-llama-3.1-8b",
   #           "type": "huggingface",
   #           "tokenizer": "meta-llama/Llama-3.1-8B-Instruct",  # The same tokenizer that works on the base model will work on the quantized model - there is no 'quantized tokenizer'
   #           "quantization_config": {                    #Quantization config left here as a reference, but not used in the code (as we're using an already quantized model from HuggingFace)
   #               "load_in_4bit": True,
   #               "bnb_4bit_compute_dtype": "float16",
   #               "bnb_4bit_quant_type": "nf4",
   #               "bnb_4bit_use_double_quant": False
   #           }
   #       }
       ]
}


# Single switch for semantic chunking, so the "enabled" flag and the threshold
# list below are always derived from the same value.
_SEMANTIC_CHUNKING_ENABLED = True

CHUNKING_CONFIGS = {
    # Non-semantic strategies handled by RAGPipeline.create_chunks.
    "strategies": ["paragraph", "header"],
    "semantic_config": {
        "enabled": _SEMANTIC_CHUNKING_ENABLED,
        # Breakpoint percentile thresholds for the semantic splitter;
        # empty when semantic chunking is switched off.
        "thresholds": [85, 95] if _SEMANTIC_CHUNKING_ENABLED else [],
    },
    "max_chunk_size": 2048,  # measured with len(), i.e. characters
    "chunk_overlap": 100,
    "min_chunk_size": 35,  # we'll ignore any chunk ~5 words or less
}

# These questions should relate to the RAG document --> they are your
# 'business use cases'.
QUESTION_CONFIGS = dict(
    questions=[
        "What were cloud revenues in the most recent quarter?",
        "What were the main drivers of revenue growth in the most recent quarter?",
        "How much did YouTube ad revenues grow in the most recent quarter in APAC?",
        "Can you summarize recent key antitrust matters?",
        "Compare the revenue growth across all geographic regions and explain the main factors for each region.",
        "Summarize all mentioned risk factors related to international operations.",
        "What were the major changes in operating expenses across all categories and their stated reasons?",
    ],
)

# Folder on the mounted Drive; presumably where analysis output is written
# (the consumer is outside this section - confirm).
FILE_CONFIGS = dict(save_directory='/content/drive/My Drive/AI/Model_Analysis')

# Load RAG Document

In [None]:
# Mount Google Drive so the dataset path below resolves inside the Colab VM.
drive.mount('/content/drive')
# Read the source PDF into `documents`; presumably this is what gets passed to
# RAGPipeline.create_chunks later on (the call site is outside this section).
documents = SimpleDirectoryReader(input_files=["/content/drive/My Drive/AI/Datasets/Google-10-q/goog-10-q-q3-2024.pdf"]).load_data()

# RAG Pipeline Class

In [None]:
# Global singleton instance: starts empty and is filled lazily by
# RAGPipeline.get_instance() or by ExperimentRunner.__init__.
_GLOBAL_RAG_PIPELINE = None

class RAGPipeline:
    def __init__(self):
        self.chunk_cache = {}
        self.embedding_cache = {}
        self.embedding_model = None

    @classmethod
    def get_instance(cls):
        """Get or create singleton instance"""
        global _GLOBAL_RAG_PIPELINE
        if _GLOBAL_RAG_PIPELINE is None:
            _GLOBAL_RAG_PIPELINE = cls()
        return _GLOBAL_RAG_PIPELINE


    def initialize_embedding_model(self):
        """Initialize the embedding model if not already initialized"""
        if self.embedding_model is None:
            mistral_api_key = userdata.get('MISTRAL_API_KEY')
            self.embedding_model = MistralAIEmbedding(
                model_name="mistral-embed",
                api_key=mistral_api_key
            )
        return self.embedding_model

    def convert_to_markdown_headers(self, text):
        """Convert document section titles to markdown headers"""
        import re

        patterns = [
            (r'^(?:ITEM|Section)\s+\d+[.:]\s*(.+)$', '# '),
            (r'^\d+\.\d+\s+(.+)$', '## '),
            (r'^\([a-z]\)\s+(.+)$', '### ')
        ]

        lines = text.split('\n')
        markdown_lines = []

        for line in lines:
            line = line.strip()
            converted = False

            for pattern, header_mark in patterns:
                if re.match(pattern, line, re.IGNORECASE):
                    markdown_lines.append(f"{header_mark}{line}")
                    converted = True
                    break

            if not converted:
                markdown_lines.append(line)

        return '\n'.join(markdown_lines)


    def create_chunks(self, documents: List, threshold: int, chunk_strategy: str = "semantic") -> Dict:
        """Create or retrieve chunks based on specified strategy

        Results are cached in ``self.chunk_cache``. The cache key is
        ``"semantic_<threshold>"`` for the semantic strategy and
        ``"<strategy>_<max_chunk_size>"`` otherwise, so ``threshold`` only
        matters for semantic chunking.
        NOTE(review): the key does not depend on ``documents``; a second call
        with different documents and the same key returns the cached chunks.

        Args:
            documents: Loaded documents; each item is read through ``.text``
                (and handed as-is to the semantic splitter).
            threshold: Breakpoint percentile passed to the semantic splitter.
            chunk_strategy: "semantic", "paragraph" or "header". Any other
                value produces no chunks and ends in the empty result.

        Returns:
            Dict with 'texts' (list of chunk strings), 'strategy' and
            'chunk_stats' (num/avg/min/max chunk size in characters).
            When no chunk survives filtering, or when an exception is raised,
            an empty result of the same shape is returned and NOT cached.
        """

        MAX_CHUNK_SIZE = CHUNKING_CONFIGS['max_chunk_size']
        CHUNK_OVERLAP = CHUNKING_CONFIGS['chunk_overlap']
        MIN_CHUNK_SIZE = CHUNKING_CONFIGS['min_chunk_size']


        # run_cosine_search rebuilds the key with the same scheme to find these chunks.
        if chunk_strategy == "semantic":
            cache_key = f"{chunk_strategy}_{threshold}"
            print(f"Using semantic cache key: {cache_key} with threshold: {threshold}")
        else:
            cache_key = f"{chunk_strategy}_{MAX_CHUNK_SIZE}"
            print(f"Using non-semantic cache key: {cache_key}")


        if cache_key not in self.chunk_cache:
            print("\nStarting new chunk creation:")
            texts = []

            try:
                if chunk_strategy == "semantic":
                    print("Processing semantic chunking...")
                    if self.embedding_model is None:
                        print("Initializing embedding model")
                        self.initialize_embedding_model()

                    splitter = SemanticSplitterNodeParser(
                        buffer_size=1,
                        breakpoint_percentile_threshold=threshold,
                        embed_model=self.embedding_model
                    )
                    nodes = splitter.get_nodes_from_documents(documents)
                    texts = [node.text for node in nodes]
                    print(f"Generated {len(texts)} semantic chunks")

                elif chunk_strategy == "paragraph":
                    print("Processing paragraph chunking...")
                    # Separators are tried in order: paragraph, line, sentence, word, character.
                    text_splitter = RecursiveCharacterTextSplitter(
                        separators=["\n\n", "\n", ". ", " ", ""],
                        chunk_size=MAX_CHUNK_SIZE,
                        chunk_overlap=CHUNK_OVERLAP,
                        length_function=len
                    )

                    for idx, doc in enumerate(documents):
                        print(f"\nProcessing document {idx + 1}/{len(documents)}")
                        print(f"Document length: {len(doc.text)} characters")
                        doc_chunks = text_splitter.split_text(doc.text)
                        print(f"Initial chunks from document: {len(doc_chunks)}")
                        if doc_chunks:
                            print(f"Sample chunk lengths: {[len(c) for c in doc_chunks[:3]]}")
                        texts.extend(doc_chunks)

                elif chunk_strategy == "header":
                    print("Processing header chunking...")
                    # Used only to break up header sections longer than MAX_CHUNK_SIZE.
                    text_splitter = RecursiveCharacterTextSplitter(
                        separators=["\n\n", "\n", ". ", " ", ""],
                        chunk_size=MAX_CHUNK_SIZE,
                        chunk_overlap=CHUNK_OVERLAP,
                        length_function=len
                    )

                    for idx, doc in enumerate(documents):
                        print(f"\nProcessing document {idx + 1}/{len(documents)}")
                        # Section titles are first rewritten as markdown headers so the
                        # markdown splitter has something to split on.
                        md_text = self.convert_to_markdown_headers(doc.text)
                        print("Headers identified. First 100 chars of markdown text:")
                        print(md_text[:100] + "...")

                        headers_to_split_on = [
                            ("#", "Header 1"),
                            ("##", "Header 2"),
                            ("###", "Header 3"),
                        ]

                        header_splitter = MarkdownHeaderTextSplitter(
                            headers_to_split_on=headers_to_split_on
                        )

                        splits = header_splitter.split_text(md_text)
                        print(f"Generated {len(splits)} header sections")
                        if splits:
                            print("Sample section lengths:", [len(s.page_content) for s in splits[:3]])

                        for split in splits:
                            if len(split.page_content) > MAX_CHUNK_SIZE:
                                print(f"Splitting large section: {len(split.page_content)} chars")
                                subsections = text_splitter.split_text(split.page_content)
                                print(f"Created {len(subsections)} subsections")
                                texts.extend(subsections)
                            else:
                                texts.append(split.page_content)

                # Common post-processing for every strategy: strip whitespace and
                # drop non-string entries and chunks shorter than MIN_CHUNK_SIZE.
                print("\nCleaning and filtering chunks...")
                initial_count = len(texts)
                cleaned_texts = []
                for idx, text in enumerate(texts):
                    if not isinstance(text, str):
                        print(f"Warning: Non-string chunk found at index {idx}")
                        continue

                    cleaned_text = text.strip()
                    if len(cleaned_text) >= MIN_CHUNK_SIZE:
                        cleaned_texts.append(cleaned_text)
                    else:
                        print(f"Filtered out small chunk: {len(cleaned_text)} chars")

                texts = cleaned_texts
                print(f"Chunks after cleaning: {len(texts)} (removed {initial_count - len(texts)})")

                if not texts:
                    # Empty results are returned without being cached, so the
                    # next call with the same key retries chunk creation.
                    print("WARNING: No valid chunks generated!")
                    return {
                        'texts': [],
                        'strategy': chunk_strategy,
                        'chunk_stats': {
                            'num_chunks': 0,
                            'avg_chunk_size': 0,
                            'min_chunk_size': 0,
                            'max_chunk_size': 0
                        }
                    }

                # Calculate chunk statistics
                chunk_lengths = [len(t) for t in texts]
                chunk_stats = {
                    'num_chunks': len(texts),
                    'avg_chunk_size': sum(chunk_lengths)/len(texts),
                    'min_chunk_size': min(chunk_lengths),
                    'max_chunk_size': max(chunk_lengths)
                }

                print("\nFinal Chunk Statistics:")
                print(f"Total chunks: {chunk_stats['num_chunks']}")
                print(f"Average chunk size: {chunk_stats['avg_chunk_size']:.2f} chars")
                print(f"Minimum chunk size: {chunk_stats['min_chunk_size']} chars")
                print(f"Maximum chunk size: {chunk_stats['max_chunk_size']} chars")

                print("\nSample of first chunk:")
                if texts:
                    print(texts[0][:200] + "...")

                # Store in cache
                self.chunk_cache[cache_key] = {
                    'texts': texts,
                    'strategy': chunk_strategy,
                    'chunk_stats': chunk_stats
                }
                print(f"\nStored chunks in cache with key: {cache_key}")

            except Exception as e:
                # Best-effort: log the failure and hand back an empty, uncached result
                # instead of propagating the error to the caller.
                print("\nERROR in chunk creation:")
                print(f"Error type: {type(e).__name__}")
                print(f"Error message: {str(e)}")
                import traceback
                print("\nTraceback:")
                print(traceback.format_exc())
                return {
                    'texts': [],
                    'strategy': chunk_strategy,
                    'chunk_stats': {
                        'num_chunks': 0,
                        'avg_chunk_size': 0,
                        'min_chunk_size': 0,
                        'max_chunk_size': 0
                    }
                }
        else:
            print(f"\nRetrieving {len(self.chunk_cache[cache_key]['texts'])} existing chunks from cache")

        # Reached on a cache hit or after a successful build; the returned dict is
        # the cached object itself, not a copy.
        result = self.chunk_cache[cache_key]
        print(f"\nFinal Output:")
        print(f"Number of chunks: {len(result['texts'])}")
        print(f"Strategy: {result['strategy']}")
        print("="*50)
        return result

    def run_cosine_search(self, query: str, threshold: int, chunk_strategy: str = "semantic", k: int = 5) -> List[Dict]:
        """Run cosine similarity search with enhanced error handling and debugging"""
        print("\n" + "="*50)
        print("COSINE SEARCH DEBUG LOG")
        print("="*50)
        print(f"Query: {query}")
        print(f"Strategy: {chunk_strategy}")
        print(f"Threshold: {threshold}")
        print(f"Requested k: {k}")

        if chunk_strategy == "semantic":
            cache_key = f"{chunk_strategy}_{threshold}"
        else:
            cache_key = f"{chunk_strategy}_{CHUNKING_CONFIGS['max_chunk_size']}"

        print("\nCache Status:")
        print(f"Cache key: {cache_key}")
        print(f"Available cache keys: {list(self.chunk_cache.keys())}")
        print(f"Chunks cache hit: {cache_key in self.chunk_cache}")
        print(f"Embeddings cache hit: {cache_key in self.embedding_cache}")

        # First, ensure we have chunks
        if cache_key not in self.chunk_cache:
            print(f"\nERROR: No chunks found in cache for {cache_key}")
            print("This suggests chunk creation failed or wasn't called")
            return []

        chunks_data = self.chunk_cache[cache_key]
        if not chunks_data['texts']:
            print("\nERROR: Chunks list is empty")
            print("This suggests chunk creation succeeded but produced no chunks")
            return []

        print(f"\nFound {len(chunks_data['texts'])} chunks to search")
        print(f"Sample chunk (first 100 chars): {chunks_data['texts'][0][:100]}...")

        try:
            if self.embedding_model is None:
                print("\nInitializing embedding model")
                self.initialize_embedding_model()

            if cache_key not in self.embedding_cache:
                print("\nGenerating embeddings for chunks...")
                chunk_embeddings = []

                # Process in batches
                batch_size = 32
                total_batches = (len(chunks_data['texts']) + batch_size - 1) // batch_size

                for i in range(0, len(chunks_data['texts']), batch_size):
                    batch = chunks_data['texts'][i:i + batch_size]
                    print(f"\nProcessing batch {i//batch_size + 1}/{total_batches}")
                    print(f"Batch size: {len(batch)} chunks")

                    batch_embeddings = [self.embedding_model.get_text_embedding(text) for text in batch]
                    chunk_embeddings.extend(batch_embeddings)
                    print(f"Total embeddings so far: {len(chunk_embeddings)}")

                print("\nConverting to numpy array...")
                embeddings_array = np.array(chunk_embeddings).astype('float32')
                print(f"Embeddings shape: {embeddings_array.shape}")

                print("Normalizing embeddings...")
                norms = np.linalg.norm(embeddings_array, axis=1)[:, np.newaxis]
                norms[norms == 0] = 1  # Prevent division by zero
                normalized_embeddings = embeddings_array / norms

                print("Creating FAISS index...")
                dimension = embeddings_array.shape[1]
                index = faiss.IndexFlatIP(dimension)
                index.add(normalized_embeddings)

                self.embedding_cache[cache_key] = {
                    'embeddings': embeddings_array,
                    'index': index
                }
                print("Embeddings cached successfully")

            print("\nProcessing query...")
            query_embedding = self.embedding_model.get_text_embedding(query)
            query_embedding = np.array([query_embedding]).astype('float32')

            print("Normalizing query embedding...")
            query_norm = np.linalg.norm(query_embedding)
            if query_norm == 0:
                print("ERROR: Zero query vector")
                return []
            query_normalized = query_embedding / query_norm

            print(f"\nSearching for top {k} matches...")
            distances, indices = self.embedding_cache[cache_key]['index'].search(
                query_normalized, k
            )

            print("\nFormatting results...")
            results = []
            for score, idx in zip(distances[0], indices[0]):
                if idx >= 0 and idx < len(chunks_data['texts']):
                    results.append({
                        'text': chunks_data['texts'][idx],
                        'distance': float(score),
                        'strategy': chunk_strategy
                    })
                    print(f"\nMatch {len(results)}:")
                    print(f"Score: {float(score):.4f}")
                    print(f"Text preview: {chunks_data['texts'][idx][:100]}...")

            print(f"\nTotal matches found: {len(results)}")
            print("="*50)
            return results

        except Exception as e:
            print("\nERROR in cosine search:")
            print(f"Error type: {type(e).__name__}")
            print(f"Error message: {str(e)}")
            import traceback
            print("\nTraceback:")
            print(traceback.format_exc())
            print("="*50)
            return []

    def generate_response(self, query: str, context_rag: list, model: Dict) -> dict:
        """Generate response using provided context with source tracking"""
        try:
            if not context_rag:
                return {
                    "response_text": "No relevant context found.",
                    "sources": [],
                    "source_tracking": {
                        "num_sources_provided": 0,
                        "source_ids": [],
                        "verification_status": "no_context"
                    },
                    "strategy": None
                }

            print("\n=== DEBUG: Context Chunks Passed to LLM ===")
            print(f"Query: {query}")
            print(f"Number of chunks: {len(context_rag)}")

            # Generate unique IDs for each source chunk
            context_with_ids = []
            for idx, doc in enumerate(context_rag):
                source_id = f"src_{idx}"
                context_with_ids.append({
                    "text": doc['text'],
                    "id": source_id,
                    "distance": doc.get('distance', 0)
                })
                print(f"\nChunk {source_id}:")
                print(f"Distance: {doc.get('distance', 'N/A')}")
                print("Text:", doc['text'])
            print("="*50)

            # Format context with source IDs
            formatted_context = "\n\n".join([
                f"[{doc['id']}] {doc['text']}"
                for doc in context_with_ids
            ])

            prompt = PromptTemplate(template="""
            Instructions:

            You are a helpful assistant who answers questions strictly from the provided context.
            Given the context information, provide a direct and concise answer to the question: {query}

            Important rules:
            1. Only use information present in the context
            2. If you don't know or can't find the information, say "I don't know"
            3. You must cite the source IDs [src_X] for every piece of information you use
            4. Do not make assumptions or use external knowledge

            You must format your response as a JSON string object, starting with "LLM_Response:"

            Your answer must follow this exact format:

            LLM_Response:
            {{
                "response_text": "Your detailed answer here with [src_X] citations inline",
                "sources": [
                    "Copy and paste here the exact text segments you used, with their source IDs"
                ],
                "source_ids_used": ["List of all source IDs referenced in your answer"]
            }}

            Context (with source IDs):
            ---------------
            {context}
            ---------------
            """)

            model_type = model['type']
            llm = model['llm']

            chain = prompt | llm | StrOutputParser()

            response = chain.invoke({
                "query": query,
                "context": formatted_context
            })

            response_text = response.split("LLM_Response:")[-1].strip()

            try:
                if '{' in response_text and '}' in response_text:
                    json_str = response_text[response_text.find('{'):response_text.rfind('}')+1]
                    parsed_response = json.loads(json_str)

                    # Verify source usage
                    claimed_sources = set(parsed_response.get("source_ids_used", []))
                    available_sources = {doc["id"] for doc in context_with_ids}

                    verification_status = {
                        "status": "verified" if claimed_sources.issubset(available_sources) else "source_mismatch",
                        "claimed_sources": list(claimed_sources),
                        "available_sources": list(available_sources),
                        "unauthorized_sources": list(claimed_sources - available_sources)
                    }

                    return {
                        "response_text": parsed_response.get("response_text", response_text),
                        "sources": parsed_response.get("sources", []),
                        "source_tracking": {
                            "num_sources_provided": len(context_with_ids),
                            "source_ids": [doc["id"] for doc in context_with_ids],
                            "verification_status": verification_status
                        },
                        "strategy": context_rag[0]['strategy'] if context_rag else None
                    }
                else:
                    return {
                        "response_text": response_text,
                        "sources": [],
                        "source_tracking": {
                            "num_sources_provided": len(context_with_ids),
                            "source_ids": [doc["id"] for doc in context_with_ids],
                            "verification_status": {
                                "status": "parsing_failed",
                                "error": "Response not in JSON format"
                            }
                        },
                        "strategy": context_rag[0]['strategy'] if context_rag else None
                    }

            except json.JSONDecodeError:
                return {
                    "response_text": response_text,
                    "sources": [],
                    "source_tracking": {
                        "num_sources_provided": len(context_with_ids),
                        "source_ids": [doc["id"] for doc in context_with_ids],
                        "verification_status": {
                            "status": "parsing_failed",
                            "error": "JSON decode error"
                        }
                    },
                    "strategy": context_rag[0]['strategy'] if context_rag else None
                }

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            return {
                "response_text": "An error occurred while generating the response.",
                "sources": [],
                "source_tracking": {
                    "num_sources_provided": 0,
                    "source_ids": [],
                    "verification_status": {
                        "status": "error",
                        "error": str(e)
                    }
                },
                "strategy": None
            }

# ModelConfig Class

In [None]:
class ModelConfig:
    """Handles model configuration and management.

    Holds the list of model configs to evaluate and at most one loaded model
    at a time (`current_model`). Models are loaded through the `load_model`
    context manager and released again when its `with` block exits.
    """
    def __init__(self,
                 models: List[Dict],
                 temperature: float = 0.3):
        """
        Args:
            models: Model config dicts; each needs "name" and "type", and
                "tokenizer" for Hugging Face models.
            temperature: Sampling temperature passed to every model.
        """
        self.models = models
        self.temperature = temperature
        self.current_model = None
        self.current_model_name = None

    @contextmanager
    def load_model(self, model_config: Dict):
        """Context manager for lazy loading and proper cleanup of models.

        Any previously loaded model is released first. The yielded dict has an
        'llm' entry and a 'type' entry ('mistral_api' or 'huggingface');
        Hugging Face entries also carry 'model' and 'pipe'.

        The model is released when the `with` block exits, normally or through
        an exception. For Hugging Face models this empties the yielded dict of
        its 'llm', 'model' and 'pipe' entries, so do not use it afterwards.

        Args:
            model_config: One entry of `self.models`.
        """
        try:
            model_name = model_config["name"]
            model_type = model_config["type"]

            # Clear any existing model
            self.cleanup_current_model()

            # The loaders return the finished dict so that this generator frame
            # holds no extra references to the model while it is suspended;
            # otherwise the cleanup below could not actually free it.
            if model_type == "mistral_api":
                self.current_model = self._build_mistral_api_model(model_name)
            else:  # huggingface
                self.current_model = self._build_huggingface_model(model_config)

            self.current_model_name = model_name
            yield self.current_model

        finally:
            # Release on exit too, so the last model of a run does not stay
            # resident until some later load happens to replace it.
            self.cleanup_current_model()

    def _build_mistral_api_model(self, model_name: str) -> Dict:
        """Create the chat client for a Mistral API model."""
        mistral_api_key = userdata.get('MISTRAL_API_KEY')
        return {
            'llm': ChatMistralAI(
                model=model_name,
                temperature=self.temperature,
                api_key=mistral_api_key
            ),
            'type': 'mistral_api'
        }

    def _build_huggingface_model(self, model_config: Dict) -> Dict:
        """Load tokenizer and model from Hugging Face and wrap them in a text-generation pipeline."""
        model_name = model_config["name"]
        print(f"Loading huggingface model: {model_name}")

        # Collect garbage first so that memory held only by unreachable objects
        # can be handed back when the CUDA cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()

        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_config["tokenizer"],
            trust_remote_code=True,
            use_fast=True,
            padding_side="left"
        )

        model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_name,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
            use_cache=True,
            low_cpu_mem_usage=True,
        )

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=self.temperature,
            top_p=0.95,
            top_k=50,
            do_sample=True,
            device_map="auto"
        )

        return {
            'llm': HuggingFacePipeline(pipeline=pipe),
            'type': 'huggingface',
            'model': model,  # Keep reference for cleanup
            'pipe': pipe     # Keep reference for cleanup
        }

    def cleanup_current_model(self):
        """Clean up the current model and free memory.

        Safe to call repeatedly, when nothing is loaded, and on a partially
        populated Hugging Face entry.
        """
        held = self.current_model
        self.current_model = None
        self.current_model_name = None
        if held is None:
            return

        if held.get('type') == 'huggingface':
            # Drop the references stored in the dict (the same dict the caller
            # received), tolerating entries that are already gone.
            for key in ('llm', 'model', 'pipe'):
                held.pop(key, None)

            # Garbage-collect before emptying the CUDA cache: only memory that
            # is no longer referenced can be returned by empty_cache().
            gc.collect()
            torch.cuda.empty_cache()

#ExperimentRunner Class

In [None]:
class ExperimentRunner:
    """Run every model / chunking strategy / question combination.

    For each configured model the runner builds chunks for each strategy
    (and, for the "semantic" strategy, each threshold), retrieves context for
    every question and records the generated answer.
    """
    def __init__(self,
                 model_config: "ModelConfig",
                 questions: List[str],
                 chunk_strategies: List[str],
                 semantic_enabled: bool = False,
                 semantic_thresholds: List[int] = None,
                 rag_pipeline: "RAGPipeline" = None):
        """Store the experiment configuration and resolve the RAG pipeline.

        Args:
            model_config: Object exposing ``models``, ``temperature`` and the
                ``load_model`` context manager.
            questions: Questions asked for every configuration.
            chunk_strategies: Names of the chunking strategies to test.
            semantic_enabled: Whether semantic thresholds are honoured.
            semantic_thresholds: Thresholds tested for the "semantic"
                strategy. Ignored unless ``semantic_enabled`` is true; ``None``
                is treated as "no thresholds".
            rag_pipeline: Pipeline to use. When omitted, the module-level
                ``_GLOBAL_RAG_PIPELINE`` is reused or created.
        """
        self.model_config = model_config
        self.questions = questions
        self.chunk_strategies = chunk_strategies
        self.semantic_enabled = semantic_enabled
        # Always store a list: run_experiments iterates this value, so a None
        # left over from the default argument would raise a TypeError there.
        self.semantic_thresholds = list(semantic_thresholds or []) if semantic_enabled else []

        # Use existing RAG pipeline or create new one
        global _GLOBAL_RAG_PIPELINE
        if rag_pipeline is not None:
            self.rag_pipeline = rag_pipeline
        elif _GLOBAL_RAG_PIPELINE:
            self.rag_pipeline = _GLOBAL_RAG_PIPELINE
        else:
            print("Initializing new RAG pipeline")
            _GLOBAL_RAG_PIPELINE = RAGPipeline()
            self.rag_pipeline = _GLOBAL_RAG_PIPELINE

    def run_experiments(self, source_docs: List = None) -> Dict:
        """Execute all configured experiments and collect the answers.

        Args:
            source_docs: Documents handed to ``create_chunks``. When omitted,
                the module-level ``documents`` list is used.

        Returns:
            A dict with a ``"metadata"`` section describing the run and a
            ``"results"`` list holding one entry per model, strategy,
            threshold and question. The "semantic" strategy contributes no
            entries when it is enabled without any thresholds.
        """
        docs = documents if source_docs is None else source_docs

        results = {
            "metadata": {
                "timestamp": time.strftime("%Y%m%d-%H%M%S"),
                "models_tested": [model["name"] for model in self.model_config.models],
                "semantic_enabled": self.semantic_enabled,
                "semantic_thresholds": self.semantic_thresholds if self.semantic_enabled else [],
                "chunk_strategies": self.chunk_strategies,
                "temperature": self.model_config.temperature
            },
            "results": []
        }

        for model_config in self.model_config.models:
            model_name = model_config["name"]
            print(f"\nTesting model: {model_name}")

            with self.model_config.load_model(model_config) as model:
                for strategy in self.chunk_strategies:
                    # Only the semantic strategy is swept over thresholds;
                    # every other strategy runs once with threshold=None.
                    if strategy == "semantic" and self.semantic_enabled:
                        thresholds_to_test = self.semantic_thresholds
                    else:
                        thresholds_to_test = [None]

                    for threshold in thresholds_to_test:
                        chunks_data = self.rag_pipeline.create_chunks(
                            docs,
                            threshold=threshold,
                            chunk_strategy=strategy
                        )
                        chunk_stats = chunks_data["chunk_stats"]

                        for question in self.questions:
                            print(f"Processing question: {question}")

                            context = self.rag_pipeline.run_cosine_search(
                                query=question,
                                threshold=threshold,
                                chunk_strategy=strategy
                            )

                            answer = self.rag_pipeline.generate_response(
                                query=question,
                                context_rag=context,
                                model=model
                            )

                            results["results"].append({
                                "model": model_name,
                                "threshold": threshold if strategy == "semantic" else None,
                                "chunk_strategy": strategy,
                                "question": question,
                                "response": answer,
                                "chunk_stats": chunk_stats
                            })

        return results

#Evaluator Class

In [None]:
class ExperimentEvaluator:
    """Grade experiment responses with GPT-4o.

    The evaluator asks GPT-4o to answer every question from the full source
    document (the baseline), asks it to score each experiment response
    against that baseline, and aggregates the scores into a summary.
    """
    def __init__(self, api_key: str):
        """Create the OpenAI client and the tiktoken encoder for gpt-4o."""
        self.client = openai.OpenAI(api_key=api_key)
        self.encoder = tiktoken.encoding_for_model("gpt-4o")

    @staticmethod
    def _is_number(value) -> bool:
        """Return True for int/float values; bool is deliberately excluded."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _get_baseline_answers(self, questions: List[str], source_docs: List) -> Dict[str, str]:
        """Ask GPT-4o to answer the questions from the concatenated documents.

        Args:
            questions: Questions to answer.
            source_docs: Objects exposing a ``text`` attribute; their texts
                are joined with blank lines to form the source document.

        Returns:
            The JSON object parsed from the model reply (questions as keys).
            When the reply contains no JSON object or the request fails, a
            dict with an ``"error"`` message and the ``"questions"`` list.
        """
        print("\n=== DEBUG: _get_baseline_answers ===")
        print(f"Questions received: {questions}")
        print(f"Number of document parts: {len(source_docs)}")

        # Concatenate all document parts
        full_document = "\n\n".join([doc.text for doc in source_docs])
        print(f"\nFull document length: {len(full_document)} characters")

        # Print sample from document
        print("\nSampling from document:")
        print("Start:", full_document[:200], "...")
        print("Middle:", full_document[len(full_document)//2:len(full_document)//2 + 200], "...")
        print("End:", full_document[-200:], "...")

        baseline_prompt = f"""Source Document:
        {full_document}

        Using ONLY the information from the source document above, answer these questions.
        - If the exact information is found, provide it with specific numbers
        - If information is not found, explicitly state that
        - If there are metrics, make sure to include appropriate units

        Format your response as a valid JSON object with questions as keys and answers as values.
        Keep answers concise and factual.

        Questions to answer:
        {json.dumps(questions, indent=2)}"""

        try:
            print("\n--- Getting Baseline Answers ---")
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that provides JSON-formatted answers based on source documents."},
                    {"role": "user", "content": baseline_prompt}
                ],
                temperature=0.1
            )

            content = response.choices[0].message.content
            print("\nRaw GPT-4 Response:")
            print(content)

            if '{' in content and '}' in content:
                # Keep only the outermost {...} span so surrounding prose or
                # code fences in the reply do not break JSON parsing.
                json_str = content[content.find('{'):content.rfind('}')+1]
                baseline_answers = json.loads(json_str)
                print("\nParsed Baseline Answers:")
                print(baseline_answers)
                return baseline_answers
            print("\nWarning: No JSON structure found in response")
            return {"error": "No JSON structure found", "questions": questions}

        except Exception as e:
            print(f"\nError in _get_baseline_answers: {str(e)}")
            return {"error": str(e), "questions": questions}

    def evaluate_experiments(self, experiment_results: Dict, *, source_docs: List) -> Dict:  # Updated signature
        """Evaluate every experiment result against the GPT-4o baseline.

        Args:
            experiment_results: Dict with a ``"results"`` list whose entries
                carry ``model``, ``chunk_strategy``, ``threshold``,
                ``question`` and ``response``.
            source_docs: Documents used to produce the baseline answers.

        Returns:
            A dict with ``"metadata"``, ``"evaluations"`` (one per result, in
            the order of the input results) and ``"summary"``. If anything
            fails, the structure from ``_create_default_evaluation`` is
            returned instead of raising.
        """
        # Imported locally so the method does not depend on a module-level
        # import that the notebook's import cell does not provide.
        from datetime import datetime

        try:
            print("\n=== DEBUG: evaluate_experiments ===")
            print("Getting questions...")
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the baseline prompt is the same from run to run.
            questions = list(dict.fromkeys(
                result["question"] for result in experiment_results["results"]
            ))
            print(f"Questions extracted: {questions}")

            print("\nGetting baseline answers...")
            baseline_answers = self._get_baseline_answers(questions, source_docs)  # Pass source_docs
            print(f"Baseline answers received: {baseline_answers}")

            model_strategy_combinations = {
                (result["model"],
                 result["chunk_strategy"],
                 result["threshold"] if result["chunk_strategy"] == "semantic" else None)
                for result in experiment_results["results"]
            }

            # Every result belongs to exactly one combination, so walking the
            # results directly evaluates the same items in a stable order.
            all_evaluations = []
            for result in experiment_results["results"]:
                print(f"\nEvaluating response for: {result['question']}")
                baseline = baseline_answers.get(result["question"], "No baseline available")
                print(f"Using baseline answer: {baseline}")

                all_evaluations.append(self._evaluate_single_response(result, baseline))

            return {
                "metadata": {
                    "timestamp": datetime.now().isoformat(),
                    "model_used": "gpt-4o",
                    "num_combinations_evaluated": len(model_strategy_combinations),
                    "num_questions_evaluated": len(questions),
                    "evaluation_status": "success"
                },
                "evaluations": all_evaluations,
                "summary": self._generate_summary(all_evaluations)
            }

        except Exception as e:
            print(f"\nCritical error in evaluate_experiments: {str(e)}")
            return self._create_default_evaluation(experiment_results)

    def _evaluate_single_response(self, result: Dict, baseline: str) -> Dict:
        """Ask GPT-4o to score one response for accuracy, attribution and conciseness.

        Args:
            result: One entry of the experiment ``"results"`` list.
            baseline: Baseline answer for the result's question.

        Returns:
            The parsed evaluation, normalised by ``_normalize_evaluation``.
            When the reply holds no JSON object or the request/parsing fails,
            the all-zero evaluation from ``_create_default_single_evaluation``.
        """
        evaluation_prompt = f"""Compare and evaluate this response. You must evaluate three separate aspects:

    1. ACCURACY - Compare the model's answer against the baseline (ground truth)
    2. SOURCE ATTRIBUTION - Check if the model's answer matches its cited sources
    3. CONCISESNESS - Check if the model's answer is clear and direct

    Question: {result["question"]}

    Baseline (Ground Truth): {baseline}

    Model Response: {result.get("response", {}).get("response_text", "")}
    Sources Cited: {json.dumps(result.get("response", {}).get("sources", []), indent=2)}

    Scoring Criteria:

    1. ACCURACY (0-100):
      - Compare ONLY the model's answer against the baseline
      - 100: Exact match with baseline (including numbers and units)
      - 50: Partially correct but with some errors
      - 0: Completely different from baseline or wrong

    2. SOURCE ATTRIBUTION (0-100):
      - Compare ONLY the model's answer against its cited sources
      - 100: Answer exactly matches what appears in cited sources INCLUDING UNITS
      - 50: Answer partially matches cited sources
      - 0: Answer doesn't match cited sources or no sources cited

      Note: For large numbers, different formats are acceptable (e.g., $19,000 million = $19 billion)
      BUT the units must match what appears in the source document for full attribution score.
      The units in the source document are authoritative.

    3. CONCISENESS (0-100):
      - 100: Clear, direct answer without extra information
      - 50: Contains some irrelevant information
      - 0: Verbose or unclear

    Note: A response can have perfect source attribution (100) even if the answer is wrong,
    as long as it accurately reflects what's in its cited sources.

    Provide your evaluation in this exact JSON format:
    {{
        "model": "{result["model"]}",
        "chunk_strategy": "{result["chunk_strategy"]}",
        "threshold": {result["threshold"] if result["chunk_strategy"] == "semantic" else "null"},
        "question": "{result["question"]}",
        "baseline_answer": "{baseline}",
        "model_response": {json.dumps(result.get("response", {}), indent=2)},
        "chunk_stats": {json.dumps(result.get("chunk_stats", {}), indent=2)},
        "scores": {{
            "accuracy": <score>,
            "source_attribution": <score>,
            "conciseness": <score>
        }},
        "composite_score": <average of scores>,
        "detailed_analysis": {{
            "accuracy_analysis": "Explain ONLY how the answer compares to baseline. Explicitly state if numbers match or differ.",
            "attribution_analysis": "Explain ONLY how well the answer matches its cited sources, regardless of accuracy.",
            "conciseness_analysis": "Explain how clear and direct the answer is"
        }}
    }}

    Examples:

    Bad Response (Perfect Attribution, Wrong Answer):
    - If baseline is "$10,347M" but model answers "$19,921M [src_2]" and src_2 contains "$19,921M"
    - Accuracy: 0 (completely different from baseline)
    - Attribution: 100 (perfectly matches its cited source)

    Good Response (Perfect Both):
    - If baseline is "$10,347M" and model answers "$10,347M [src_2]" and src_2 contains "$10,347M"
    - Accuracy: 100 (matches baseline)
    - Attribution: 100 (matches source)
    """

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are an expert at evaluating response accuracy against both baseline answers and source data."},
                    {"role": "user", "content": evaluation_prompt}
                ],
                temperature=0.7,
                max_tokens=1000
            )

            content = response.choices[0].message.content
            if '{' in content and '}' in content:
                json_str = content[content.find('{'):content.rfind('}')+1]
                return self._normalize_evaluation(json.loads(json_str), result, baseline)
            return self._create_default_single_evaluation(result, baseline)

        except Exception as e:
            print(f"Error evaluating response: {str(e)}")
            return self._create_default_single_evaluation(result, baseline)

    def _normalize_evaluation(self, evaluation: Dict, result: Dict, baseline: str) -> Dict:
        """Make a parsed GPT-4o evaluation safe to aggregate.

        The fields the model was only asked to echo back are overwritten with
        the values from ``result`` so grouping in ``_generate_summary`` never
        depends on how faithfully the model copied them. A missing or
        non-numeric ``composite_score`` is replaced by the mean of the
        numeric entries in ``scores`` (0 when there are none).
        """
        evaluation.update({
            "model": result["model"],
            "chunk_strategy": result["chunk_strategy"],
            "threshold": result["threshold"] if result["chunk_strategy"] == "semantic" else None,
            "question": result["question"],
            "baseline_answer": baseline,
            "model_response": result.get("response", {}),
            "chunk_stats": result.get("chunk_stats", {}),
        })

        if not isinstance(evaluation.get("scores"), dict):
            evaluation["scores"] = {}

        if not self._is_number(evaluation.get("composite_score")):
            numeric_scores = [
                score for score in evaluation["scores"].values() if self._is_number(score)
            ]
            evaluation["composite_score"] = (
                round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else 0
            )

        evaluation.setdefault("detailed_analysis", {})
        return evaluation

    def _create_default_single_evaluation(self, result: Dict, baseline: str) -> Dict:
        """Build the all-zero evaluation used when scoring a response fails.

        The keys mirror the JSON format requested from GPT-4o so downstream
        code sees the same structure for failed and successful evaluations.
        """
        return {
            "model": result["model"],
            "chunk_strategy": result["chunk_strategy"],
            "threshold": result["threshold"] if result["chunk_strategy"] == "semantic" else None,
            "question": result["question"],
            "baseline_answer": baseline,
            "model_response": result.get("response", {}),
            "chunk_stats": result.get("chunk_stats", {}),
            "scores": {
                "accuracy": 0,
                "source_attribution": 0,
                "conciseness": 0
            },
            "composite_score": 0,
            "detailed_analysis": {
                "accuracy_analysis": "Evaluation failed",
                "attribution_analysis": "Evaluation failed",
                "conciseness_analysis": "Evaluation failed"
            }
        }

    def _generate_summary(self, evaluations: List[Dict]) -> Dict:
        """Aggregate evaluations into per-configuration averages and chunk statistics.

        Configurations are reported in the order given by the module-level
        ``CHUNKING_CONFIGS`` (semantic thresholds first, then the remaining
        strategies); evaluations for configurations not listed there only
        contribute to the overall average. The document name is taken from
        the first entry of the module-level ``documents`` list.

        Returns:
            A dict with ``overall_performance``, ``optimal_permutation``,
            ``performance_analysis`` and ``chunking_statistics``.
        """
        if not evaluations:
            return {
                "overall_performance": "No evaluations available",
                "optimal_permutation": "Not available",
                "performance_analysis": "Evaluation process failed",
                "chunking_statistics": {}
            }

        def composite_of(evaluation: Dict):
            """Return the evaluation's composite score, or 0 when unusable."""
            value = evaluation.get("composite_score")
            return value if self._is_number(value) else 0

        # Create ordered list of expected configurations
        ordered_configs = []
        semantic_config = CHUNKING_CONFIGS["semantic_config"]
        if semantic_config["enabled"]:
            ordered_configs.extend(
                ("semantic", threshold) for threshold in semantic_config["thresholds"]
            )
        ordered_configs.extend(
            (strategy, None)
            for strategy in CHUNKING_CONFIGS["strategies"]
            if strategy != "semantic"
        )

        # Get unique models from evaluations
        unique_models = sorted({evaluation["model"] for evaluation in evaluations})

        # Get document name from the documents list (guarded against an
        # empty list, which would otherwise raise IndexError).
        first_doc_path = (
            documents[0].metadata.get('file_path', 'Unknown Document')
            if documents else 'Unknown Document'
        )
        document_name = os.path.basename(first_doc_path)

        # Initialize tracking for all model-strategy combinations
        performance_scores = {
            (model, strategy, threshold): {"count": 0, "total_composite": 0}
            for model in unique_models
            for strategy, threshold in ordered_configs
        }

        # First pass: accumulate scores and collect chunk statistics
        chunking_statistics = {}
        for evaluation in evaluations:
            strategy = evaluation["chunk_strategy"]
            threshold = evaluation["threshold"] if strategy == "semantic" else None
            key = (evaluation["model"], strategy, threshold)

            if key in performance_scores:
                performance_scores[key]["count"] += 1
                performance_scores[key]["total_composite"] += composite_of(evaluation)

            # Chunk statistics only need one entry per strategy/threshold
            chunk_key = (strategy, threshold)
            chunk_stats = evaluation.get("chunk_stats", {})
            if chunk_key in chunking_statistics or not chunk_stats:
                continue

            if threshold is not None:
                config_str = f"{document_name} with {strategy} chunking (threshold: {threshold})"
            else:
                config_str = f"{document_name} with {strategy} chunking"

            chunking_statistics[chunk_key] = {
                "config_str": config_str,
                "stats": {
                    "number_of_chunks": chunk_stats.get("num_chunks", "N/A"),
                    "average_chunk_size": round(chunk_stats.get("avg_chunk_size", 0), 2),
                    "min_chunk_size": chunk_stats.get("min_chunk_size", "N/A"),
                    "max_chunk_size": chunk_stats.get("max_chunk_size", "N/A")
                }
            }

        # Second pass: emit statistics and averages in configuration order
        ordered_chunking_stats = {}
        ordered_analysis = {}
        best_score = 0
        best_config = None

        for strategy, threshold in ordered_configs:
            chunk_key = (strategy, threshold)
            if chunk_key in chunking_statistics:
                config_str = chunking_statistics[chunk_key]["config_str"]
                ordered_chunking_stats[config_str] = chunking_statistics[chunk_key]["stats"]

            for model in unique_models:
                scores = performance_scores[(model, strategy, threshold)]
                if scores["count"] == 0:
                    continue

                avg_composite = round(scores["total_composite"] / scores["count"], 2)

                if threshold is not None:
                    perf_key = f"{model} with {strategy} chunking (threshold: {threshold})"
                else:
                    perf_key = f"{model} with {strategy} chunking"

                ordered_analysis[perf_key] = avg_composite

                # "best_config is None" lets the first scored configuration
                # win even when every average is 0.
                if best_config is None or avg_composite > best_score:
                    best_score = avg_composite
                    best_config = perf_key

        # Calculate overall average score
        total_score = sum(composite_of(evaluation) for evaluation in evaluations)
        avg_score = round(total_score / len(evaluations), 2)

        return {
            "overall_performance": f"Average composite score across all evaluations: {avg_score:.2f}/100",
            "optimal_permutation": f"Best performance: {best_config} (score: {best_score:.2f}/100)",
            "performance_analysis": ordered_analysis,
            "chunking_statistics": ordered_chunking_stats
        }


    def _create_default_evaluation(self, experiment_results: Dict) -> Dict:
        """Build the result returned when the evaluation process as a whole fails.

        Every experiment result receives an all-zero evaluation and the
        metadata reports ``evaluation_status`` as ``"failed"``.
        """
        # Local import: this is the error path, so it must not itself fail on
        # a missing module-level import.
        from datetime import datetime

        return {
            "metadata": {
                "timestamp": datetime.now().isoformat(),
                "model_used": "gpt-4o",
                "num_combinations_evaluated": 0,
                "num_questions_evaluated": 0,
                "evaluation_status": "failed"
            },
            "evaluations": [
                self._create_default_single_evaluation(result, "Evaluation failed")
                for result in experiment_results.get("results", [])
            ],
            "summary": {
                "overall_performance": "Evaluation failed",
                "optimal_permutation": "Not available",
                "performance_analysis": "Evaluation process failed",
                "chunking_statistics": {}
            }
        }

#Results Manager Class

In [None]:
class ResultsManager:
    """Handles formatting, saving, and displaying evaluation results"""
    def __init__(self, save_directory: str):
        """Remember the output directory and create it if it does not exist."""
        self.save_directory = save_directory
        os.makedirs(save_directory, exist_ok=True)

    @staticmethod
    def _split_response(response, missing_text=""):
        """Return ``(text, sources)`` for a response payload.

        Dict payloads are read through their ``response_text`` and
        ``sources`` keys. Any other payload is treated as the answer text
        itself with no sources, so a plain-string response does not crash
        formatting or display.
        """
        if isinstance(response, dict):
            return response.get("response_text", missing_text), response.get("sources", [])
        if response is None:
            return missing_text, []
        return str(response), []

    def format_results(self, experiment_results: Dict, evaluation_results: Dict) -> Tuple[Dict, Dict]:
        """Format experiment and evaluation results into structured output.

        Args:
            experiment_results: Dict with a ``"results"`` list and optional
                ``"metadata"``.
            evaluation_results: Dict with optional ``"metadata"``,
                ``"evaluations"`` and ``"summary"``.

        Returns:
            ``(formatted_experiment, formatted_evaluation)``. The evaluation
            summary is stored under the key ``"overall_summary"``.
        """
        print("\n=== Starting Results Formatting ===")

        # Format experiment results
        formatted_rows = []
        for result in experiment_results["results"]:
            answer, sources = self._split_response(result["response"])
            formatted_rows.append({
                "model": result["model"],
                "chunk_strategy": result["chunk_strategy"],
                "threshold": result["threshold"],
                "question": result["question"],
                "response": {
                    "answer": answer,
                    "sources": sources
                }
            })

        formatted_experiment = {
            "metadata": experiment_results.get("metadata", {}),
            "results": formatted_rows
        }

        # Format evaluation results with baseline answer
        formatted_evaluation = {
            "metadata": evaluation_results.get("metadata", {}),
            "evaluations": [{
                "model": evaluation.get("model"),
                "chunk_strategy": evaluation.get("chunk_strategy"),
                "threshold": evaluation.get("threshold"),
                "question": evaluation.get("question"),
                "baseline_answer": evaluation.get("baseline_answer", "No baseline available"),  # Include baseline answer
                "model_response": evaluation.get("model_response", {}),
                "scores": evaluation.get("scores", {}),
                "composite_score": evaluation.get("composite_score"),
                "detailed_analysis": evaluation.get("detailed_analysis", {})
            } for evaluation in evaluation_results.get("evaluations", [])],
            "overall_summary": evaluation_results.get("summary", {})
        }

        return formatted_experiment, formatted_evaluation

    def save_results(self, formatted_experiment: Dict, formatted_evaluation: Dict) -> Tuple[str, str]:
        """Write both result dicts to timestamped JSON files.

        Returns:
            ``(experiment_file, evaluation_file)`` paths inside
            ``save_directory``.
        """
        timestamp = time.strftime("%Y%m%d-%H%M%S")

        experiment_file = os.path.join(self.save_directory, f"experiment_results_{timestamp}.json")
        evaluation_file = os.path.join(self.save_directory, f"evaluation_results_{timestamp}.json")

        for filepath, data in [
            (experiment_file, formatted_experiment),
            (evaluation_file, formatted_evaluation)
        ]:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

        return experiment_file, evaluation_file

    def display_results(self, evaluation_results: Dict):
        """Print metadata, per-response evaluations and the overall summary.

        Args:
            evaluation_results: The formatted evaluation dict produced by
                ``format_results`` (the summary is read from
                ``"overall_summary"``).
        """
        print("\n" + "="*80)
        print("EVALUATION RESULTS")
        print("="*80)

        # Display metadata
        metadata = evaluation_results.get("metadata", {})
        print("\nMETADATA:")
        print("-"*80)
        print(f"Timestamp:           {metadata.get('timestamp', 'Not available')}")
        print(f"Model Used:          {metadata.get('model_used', 'Not available')}")
        print(f"Combinations:        {metadata.get('num_combinations_evaluated', 'Not available')}")
        print(f"Questions:           {metadata.get('num_questions_evaluated', 'Not available')}")
        print(f"Evaluation Status:   {metadata.get('evaluation_status', 'Not available')}")

        # Display evaluations
        evaluations = evaluation_results.get("evaluations", [])
        if evaluations:
            print("\nDETAILED EVALUATIONS:")
            print("-"*80)
            for evaluation in evaluations:
                print(f"\nQuestion: {evaluation.get('question', 'No question provided')}")
                print(f"Model: {evaluation.get('model', 'No model specified')}")
                print(f"Strategy: {evaluation.get('chunk_strategy', 'No strategy specified')}")
                # "is not None" so that a threshold of 0 is still shown.
                threshold = evaluation.get('threshold')
                if threshold is not None:
                    print(f"Threshold: {threshold}")

                # Display baseline answer
                print("\nBaseline Answer:")
                baseline = evaluation.get('baseline_answer', 'No baseline answer available')
                print(textwrap.fill(str(baseline), width=80))

                response_text, sources = self._split_response(
                    evaluation.get('model_response', {}),
                    missing_text='No response available'
                )

                print("\nModel Response:")
                if response_text:
                    print(textwrap.fill(str(response_text), width=80))
                else:
                    print("No response available")

                print("\nSource Data:")
                if sources:
                    for source in sources:
                        if source:  # Check if source is not empty
                            print(textwrap.fill(str(source), width=80))
                else:
                    print("No source data available")

                print("\nScores:")
                scores = evaluation.get('scores', {})
                for metric, score in scores.items():
                    print(f"- {metric.replace('_', ' ').capitalize()}: {score}/100")
                print(f"Composite Score: {evaluation.get('composite_score', 0)}/100")

                print("\nDetailed Analysis:")
                analysis = evaluation.get('detailed_analysis', {})
                for aspect, details in analysis.items():
                    if details:  # Check if details is not empty
                        print(f"\n{aspect.replace('_', ' ').capitalize()}:")
                        print(textwrap.fill(str(details), width=80))

        # Display summary
        summary = evaluation_results.get("overall_summary", {})
        if summary:
            print("\nOVERALL SUMMARY:")
            print("-"*80)

            if "overall_performance" in summary:
                print("\nOverall Performance:")
                print(textwrap.fill(str(summary["overall_performance"]), width=80))

            if "optimal_permutation" in summary:
                print("\nOptimal Configuration:")
                print(textwrap.fill(str(summary["optimal_permutation"]), width=80))

            if "chunking_statistics" in summary:
                print("\nChunking Statistics:")
                chunk_stats = summary["chunking_statistics"]
                for config, stats in chunk_stats.items():
                    print(f"\n{config}:")
                    print(f"  Number of Chunks: {stats['number_of_chunks']}")
                    print(f"  Average Chunk Size: {stats['average_chunk_size']}")
                    print(f"  Min Chunk Size: {stats['min_chunk_size']}")
                    print(f"  Max Chunk Size: {stats['max_chunk_size']}")

            if "performance_analysis" in summary:
                print("\nPerformance Analysis:")
                analysis = summary["performance_analysis"]
                # A dict holds per-configuration averages; the failure paths
                # store a plain message string instead.
                if isinstance(analysis, dict):
                    for config, score in analysis.items():
                        print(f"{config}: {score:.2f}")
                else:
                    print(textwrap.fill(str(analysis), width=80))

#Main

In [None]:
def main():
    """Run the experiments, evaluate them with GPT-4o, then save and display the results.

    Reads the module-level ``CHUNKING_CONFIGS``, ``MODEL_CONFIGS``,
    ``QUESTION_CONFIGS``, ``FILE_CONFIGS`` and ``documents``, and the
    ``OPENAI_API_KEY`` secret from Colab ``userdata``.

    Returns:
        ``(formatted_experiment, formatted_evaluation)`` as produced by
        ``ResultsManager.format_results``.
    """
    # Initialize configurations with semantic settings from config
    semantic_enabled = CHUNKING_CONFIGS["semantic_config"]["enabled"]
    semantic_thresholds = CHUNKING_CONFIGS["semantic_config"]["thresholds"]

    # Put "semantic" first when it is enabled. Any "semantic" entry already
    # present in the configured list is dropped first so the strategy is not
    # run (and evaluated) twice.
    strategies = list(CHUNKING_CONFIGS["strategies"])
    if semantic_enabled:
        strategies = ["semantic"] + [s for s in strategies if s != "semantic"]

    model_config = ModelConfig(
        models=MODEL_CONFIGS["models"],
        temperature=0.3
    )

    # Initialize experiment runner with flexible configuration
    experiment_runner = ExperimentRunner(
        model_config=model_config,
        questions=QUESTION_CONFIGS["questions"],
        chunk_strategies=strategies,
        semantic_enabled=semantic_enabled,
        semantic_thresholds=semantic_thresholds
    )

    print("Starting experiment with configurations:")
    print(f"Models: {[model['name'] for model in model_config.models]}")
    if semantic_enabled:
        print(f"Semantic thresholds: {semantic_thresholds}")
    print(f"Chunk strategies: {strategies}")
    print(f"Number of questions: {len(QUESTION_CONFIGS['questions'])}")

    experiment_results = experiment_runner.run_experiments()

    print("\nInitializing GPT-4o evaluation...")
    evaluator = ExperimentEvaluator(api_key=userdata.get('OPENAI_API_KEY'))

    evaluation_results = evaluator.evaluate_experiments(
        experiment_results=experiment_results,
        source_docs=documents
    )

    results_manager = ResultsManager(save_directory=FILE_CONFIGS['save_directory'])

    formatted_experiment, formatted_evaluation = results_manager.format_results(
        experiment_results=experiment_results,
        evaluation_results=evaluation_results
    )

    experiment_file, evaluation_file = results_manager.save_results(
        formatted_experiment=formatted_experiment,
        formatted_evaluation=formatted_evaluation
    )

    # display_results reads the summary from "overall_summary", which only
    # the formatted evaluation carries.
    results_manager.display_results(evaluation_results=formatted_evaluation)

    print("\nExperiment complete!")
    print("Results saved to:")
    print(f"  Experiment results: {experiment_file}")
    print(f"  Evaluation results: {evaluation_file}")

    torch.cuda.empty_cache()
    gc.collect()

    return formatted_experiment, formatted_evaluation


if __name__ == "__main__":
    results, evaluation = main()