In [None]:
import requests
import pandas as pd
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import PromptTemplate
from dotenv import load_dotenv
import os

from langchain.agents import initialize_agent, Tool, AgentType
from langchain.llms import OpenAI
from langchain_community.tools import DuckDuckGoSearchRun
 
from dotenv import load_dotenv
import openai
from datasets import load_dataset
from chromadb import PersistentClient
from chromadb.config import Settings
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings


from concurrent.futures import ThreadPoolExecutor
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from tqdm import tqdm  # Import tqdm for progress bar
import json


In [None]:

# Read variables from a local .env file into the process environment,
# then pick up the OpenAI key (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")


In [None]:
from datasets import load_dataset

# Fetch the two pieces of the benchmark: the passages that get indexed and
# the question/answer pairs used later to score the system.
corpus_dataset = load_dataset(
    path="rag-datasets/rag-mini-wikipedia",
    name="text-corpus",
    split="passages",
)
eval_dataset = load_dataset(
    path="rag-datasets/rag-mini-wikipedia",
    name="question-answer",
    split="test",
)


In [None]:

# Step 2: Wrap each corpus passage in a Document, tagging it with a
# positional source id ("doc_0", "doc_1", ...) in its metadata.
documents = [
    Document(text=row['passage'], metadata={"source": f"doc_{position}"})
    for position, row in enumerate(corpus_dataset)
]

# Report how many passages were wrapped
print(f"Extracted {len(documents)} passages.")


In [None]:
# Step 3: Initialize storage and embedding
# Open (or create) a Chroma database under ./chroma_db, get or create its
# "knowledge_base" collection, and wrap that collection in a LlamaIndex
# vector store / storage context.
# NOTE(review): none of the cells visible here read `vector_store` or
# `storage_context` afterwards (retrieval below goes through FAISS) —
# confirm whether this setup is still needed.
chroma_client = chromadb.PersistentClient(path="./chroma_db")  
chroma_collection = chroma_client.get_or_create_collection(name="knowledge_base")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
 

In [None]:

# Splitter used to cut every document into chunks before embedding
text_splitter = CharacterTextSplitter(
    chunk_size=768,
    chunk_overlap=56,
)

# OpenAI embedding client used for both document chunks and queries
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Starts empty; filled with one vector per chunk once embedding has run
all_embeddings = list()

def get_batch_embeddings(batch_chunks):
    """Embed one batch of text chunks with the shared `embed_model`.

    Args:
        batch_chunks: list of chunk strings to embed in a single request.

    Returns:
        Whatever `embed_model.embed_documents` returns for the batch
        (one embedding per input chunk, in the same order).
    """
    return embed_model.embed_documents(batch_chunks)

def process_document(doc, progress_bar):
    """Split one document into chunks and embed every chunk.

    Args:
        doc: object exposing the document body as `doc.text`.
        progress_bar: progress bar advanced by one step when the document is done.

    Returns:
        A flat list with the embeddings of the document's chunks, in chunk order.
    """
    pieces = text_splitter.split_text(doc.text)

    # Group the chunks into requests of at most 100 chunks each
    per_request = 100
    requests_payload = [
        pieces[start:start + per_request]
        for start in range(0, len(pieces), per_request)
    ]

    # Embed the groups concurrently; map() hands results back in input order
    with ThreadPoolExecutor() as pool:
        embedded_groups = list(pool.map(get_batch_embeddings, requests_payload))

    vectors = [vector for group in embedded_groups for vector in group]

    # One tick per finished document
    progress_bar.update(1)

    return vectors

# Embed every document in parallel, showing a progress bar while doing so
def embed_documents(documents):
    """Embed all chunks of all documents.

    Args:
        documents: sized sequence of objects accepted by `process_document`.

    Returns:
        One flat list of chunk embeddings, ordered by document and then by
        chunk within each document.
    """
    doc_count = len(documents)

    with tqdm(total=doc_count, desc="Processing documents") as bar:
        with ThreadPoolExecutor() as pool:
            # map() yields results in input order, so per_document[i] belongs to documents[i]
            per_document = list(
                pool.map(lambda doc: process_document(doc, bar), documents)
            )

    # Flatten the per-document lists into a single list of vectors
    flattened = []
    for doc_vectors in per_document:
        flattened.extend(doc_vectors)
    return flattened

all_embeddings = embed_documents(documents)

In [None]:
import faiss
import numpy as np

# Step: Create FAISS Index

# Stack the chunk embeddings into a float32 matrix (one row per chunk)
embedding_matrix = np.array(all_embeddings, dtype=np.float32)

# Fail early with a readable message instead of a cryptic error inside FAISS
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
    raise ValueError(
        "all_embeddings must be a non-empty list of equal-length vectors; "
        f"got an array of shape {embedding_matrix.shape}"
    )

# Take the dimension from the data so the index always matches the embedding
# model in use (1536 for text-embedding-ada-002) instead of a hard-coded value
embedding_dim = embedding_matrix.shape[1]

# Initialize FAISS index (using L2 distance) and add every chunk vector
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)

print(f"FAISS index created with {index.ntotal} vectors.")


In [None]:
# Step: Retrieve Relevant Documents

def _faiss_row_to_document_index():
    """Map every FAISS row to the position in `documents` of the chunk's source.

    The index holds one vector per *chunk*, added document by document, so a
    row number equals a document position only when every document produced
    exactly one chunk. Re-running the same splitter reproduces the chunk
    counts without any API call. The mapping is cached on the function and
    rebuilt when its length no longer matches the index.

    Raises:
        RuntimeError: if the number of chunks does not match the number of
            vectors in the index (index and `documents` are out of sync).
    """
    cached = getattr(_faiss_row_to_document_index, "_cache", None)
    if cached is None or len(cached) != index.ntotal:
        cached = [
            doc_position
            for doc_position, doc in enumerate(documents)
            for _ in text_splitter.split_text(doc.text)
        ]
        if len(cached) != index.ntotal:
            raise RuntimeError(
                f"FAISS index holds {index.ntotal} vectors but `documents` splits into "
                f"{len(cached)} chunks; rebuild the embeddings and the index."
            )
        _faiss_row_to_document_index._cache = cached
    return cached


def get_top_k_documents(query, top_k=5):
    """Retrieve the most relevant documents for `query` using FAISS.

    Args:
        query: question text to embed and search with.
        top_k: number of nearest chunk vectors to request from the index.

    Returns:
        List of document texts, best match first. It can be shorter than
        `top_k` when the index has fewer vectors than requested or when
        several matching chunks come from the same document.
    """
    query_embedding = embed_model.embed_query(query)  # Get query embedding
    query_vector = np.array(query_embedding, dtype=np.float32).reshape(1, -1)

    # Search the FAISS index
    distances, indices = index.search(query_vector, top_k)

    row_to_doc = _faiss_row_to_document_index()

    top_docs = []
    seen_documents = set()
    for row in indices[0]:
        row = int(row)
        # FAISS pads the result with -1 when fewer than top_k neighbours exist;
        # indexing with it would silently return the last document.
        if row < 0:
            continue
        doc_position = row_to_doc[row]
        if doc_position in seen_documents:
            continue  # another chunk of a document that is already included
        seen_documents.add(doc_position)
        top_docs.append(documents[doc_position].text)
    return top_docs

# Test retrieval
query = "When did he die?"
retrieved_docs = get_top_k_documents(query)
print(f"Top documents for query: {query}")
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument {i+1}:\n{doc}")


In [None]:
# Step: Generate Answer with OpenAI (Updated for latest API)

def generate_answer(query, retrieved_docs):
    """Ask gpt-4o-mini to answer `query` from the retrieved passages.

    Args:
        query: the user's question.
        retrieved_docs: list of passage strings used as context.

    Returns:
        The content of the first choice in the chat completion.
    """
    # Passages are separated by a blank line inside the prompt
    context = "\n\n".join(retrieved_docs)
    prompt = f"Directly and briefly answer the question based on the following context:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    # Single-turn chat request carrying the augmented prompt
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai.OpenAI(api_key=api_key).chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Example usage
answer = generate_answer(query, retrieved_docs)
print(f"\nGenerated Answer:\n{answer}")


In [None]:
# Step: Evaluate and Aggregate Results

# Metrics the judge model is asked to score; also the keys of every "scores" dict.
_EVAL_METRICS = (
    "answer_relevancy",
    "faithfulness",
    "contextual_precision",
    "contextual_recall",
)


def _parse_eval_scores(raw_reply):
    """Extract the metric scores from the judge's reply, or None if unparseable.

    The model frequently wraps its JSON in a Markdown code fence or adds a
    sentence around it, so only the outermost {...} span is parsed. Values
    are coerced to float; a missing or non-numeric metric counts as 0.0.
    """
    start = raw_reply.find("{")
    end = raw_reply.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        parsed = json.loads(raw_reply[start:end + 1])
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, dict):
        return None

    scores = {}
    for metric in _EVAL_METRICS:
        try:
            scores[metric] = float(parsed.get(metric, 0))
        except (TypeError, ValueError):
            scores[metric] = 0.0
    return scores


def evaluate_system(eval_dataset, top_k=5):
    """Score the RAG pipeline on every example of `eval_dataset` with an LLM judge.

    For each example the question is answered by the pipeline
    (`get_top_k_documents` + `generate_answer`) and gpt-4o-mini is then asked
    to grade the answer on four metrics.

    Args:
        eval_dataset: iterable of mappings with 'question' and 'answer' keys.
            Every example is evaluated; pass a subset to limit cost.
        top_k: number of passages retrieved per question.

    Returns:
        List of dicts with keys "question", "ground_truth_answer",
        "retriever_response" and "scores" (metric name -> float). Replies
        that cannot be parsed are reported on stdout and scored 0 on all metrics.
    """
    results = []

    client = openai.OpenAI(api_key=api_key)

    for example in tqdm(eval_dataset, desc="Evaluating system"):
        question = example['question']
        ground_truth_answer = example['answer']

        # Retrieve relevant documents and generate system answer
        retrieved_docs = get_top_k_documents(question, top_k=top_k)
        retriever_response = generate_answer(question, retrieved_docs)

        # The faithfulness metric is defined against the retrieved context,
        # so the judge has to be shown that context.
        retrieved_context = "\n\n".join(retrieved_docs)

        # Build the improved evaluation prompt
        deepval_prompt = f"""
            You are an expert evaluator assessing a Retrieval-Augmented Generation (RAG) system. Your task is to score the system's answer based on the given question and ground truth.

            Please assign scores for the following metrics on a scale of 0 to 1. A score of 1 indicates perfect performance, and 0 indicates complete failure. The score should be based on the given instructions.

            **Question:** {question}
            **Ground Truth Answer:** {ground_truth_answer}
            **Retrieved Context:**
            {retrieved_context}
            **System Answer:** {retriever_response}

            ### Instructions for Scoring:

            - **Answer Relevancy (0-1):** How well does the system answer the question? 
            - A score of 1 means the answer directly addresses the question. 
            - A score of 0 means the answer is irrelevant or does not address the question at all. 
            - Scores between 0 and 1 can be given for partial relevance.

            - **Faithfulness (0-1):** Is the system answer factually consistent with the information in the retrieved context?
            - A score of 1 means the answer is entirely factually correct and consistent with the retrieved content. 
            - A score of 0 means the answer contains false information.
            - Scores between 0 and 1 should reflect degrees of factual correctness.

            - **Contextual Precision (0-1):** How specific and concise is the system answer in relation to the ground truth?
            - A score of 1 means the system answer is precise and matches the specific details of the ground truth.
            - A score of 0 means the system answer is vague or includes incorrect or irrelevant details.

            - **Contextual Recall (0-1):** Does the system answer capture the key information from the ground truth?
            - A score of 1 means the answer captures all essential details.
            - A score of 0 means the answer misses critical information.
            - Scores between 0 and 1 reflect partial recall.

            Please respond with the scores in the following format:

            {{
                "answer_relevancy": <value>,
                "faithfulness": <value>,
                "contextual_precision": <value>,
                "contextual_recall": <value>
            }}
            If the system answer is perfectly correct, return a score of 1 for all metrics and if it was completely wrong return a score of 0.
            """

        # Get evaluation scores from GPT-4o-mini
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": deepval_prompt}]
        )

        # The content may be None (e.g. a refusal); treat that as an empty reply
        eval_scores_raw = (response.choices[0].message.content or "").strip()

        eval_scores = _parse_eval_scores(eval_scores_raw)
        if eval_scores is None:
            print(f"Failed to parse scores for question: {question}")
            eval_scores = dict.fromkeys(_EVAL_METRICS, 0.0)

        # Store the results
        results.append({
            "question": question,
            "ground_truth_answer": ground_truth_answer,
            "retriever_response": retriever_response,
            "scores": eval_scores
        })

    return results

# Calculate average scores
def calculate_average_scores(results):
    """Calculate the average score of each metric over all evaluated questions.

    Args:
        results: list of dicts, each with a "scores" mapping of metric name
            to a numeric score.

    Returns:
        Dict with the mean of "answer_relevancy", "faithfulness",
        "contextual_precision" and "contextual_recall". A missing,
        non-numeric or malformed score counts as 0 for that question.
        With no results every average is 0.0 (instead of dividing by zero).
    """
    metrics = (
        "answer_relevancy",
        "faithfulness",
        "contextual_precision",
        "contextual_recall",
    )
    total_scores = dict.fromkeys(metrics, 0.0)

    num_questions = len(results)
    if num_questions == 0:
        return total_scores

    for result in results:
        scores = result.get("scores")
        if not isinstance(scores, dict):
            # The judge returned something other than a JSON object
            continue
        for metric in metrics:
            try:
                total_scores[metric] += float(scores.get(metric, 0))
            except (TypeError, ValueError):
                # e.g. None or a non-numeric string: contributes nothing
                pass

    # Compute averages
    return {metric: total / num_questions for metric, total in total_scores.items()}

# Run the evaluation on the first 5 questions only: each question costs one
# embedding request plus two chat completions (answer + judge).
num_eval_questions = min(5, len(eval_dataset))
evaluation_results = evaluate_system(eval_dataset.select(range(num_eval_questions)))


# Print average scores, labelled with the number of questions actually evaluated
average_scores = calculate_average_scores(evaluation_results)
print(f"\nAverage Scores for {len(evaluation_results)} Questions:")
for metric, avg_score in average_scores.items():
    print(f"{metric}: {avg_score:.2f}")
