In [None]:
import os
import PyPDF2
from openai import OpenAI
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
import numpy as np
from openai import OpenAI
# Load variables from a local .env file into the process environment (python-dotenv);
# the API keys are read with os.getenv in a later cell.
load_dotenv()

In [None]:
# Install spaCy with the %pip magic so the package lands in the running kernel's environment.
# NOTE(review): no version is pinned here — consider pinning once the working version is known.
%pip install spacy

In [None]:
from transformers import pipeline
import spacy


In [None]:
!python -m spacy download en_core_web_sm


In [None]:

# Load models once (outside any function) so later cells can reuse them.
import torch  # imported here only to detect whether a CUDA device is present

# Pipelines accept a CUDA device index, or -1 for CPU. Falling back to CPU keeps the
# cell runnable on machines without a GPU instead of failing on a hard-coded device=0.
nli_device = 0 if torch.cuda.is_available() else -1
nli_model = pipeline("text-classification", model="roberta-large-mnli", device=nli_device)
nlp = spacy.load("en_core_web_sm")


In [None]:
# Credentials are read from the environment (populated by load_dotenv() in the first cell).
HF_TOKEN = os.getenv("HUGGING_FACE_API_KEY")
DEEPSEEK_TOKEN = os.getenv("DEEPSEEK_API_KEY")

# Login with Hugging Face token, but only when one is configured: calling login() with
# None falls back to an interactive prompt, which stalls an unattended "Run All".
if HF_TOKEN:
    login(HF_TOKEN)
else:
    print("HUGGING_FACE_API_KEY is not set; skipping Hugging Face login.")

# Load the model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [None]:
# Step 3: Set up the retrieval system using Sentence Transformers
class DocumentRetriever:
    """In-memory dense retriever over a fixed list of documents.

    Each document's text is embedded once at construction time. Queries are
    embedded on demand and ranked against the stored embeddings by cosine
    similarity (both sides are L2-normalized, so the score does not depend on
    whether the embedding model itself returns unit-length vectors).
    """

    def __init__(self, documents, embedding_model):
        """Store the document texts and pre-compute their embeddings.

        Args:
            documents: Iterable of objects exposing a ``text`` attribute.
            embedding_model: Object with an ``encode(list_of_str)`` method that
                returns one embedding row per input string.
        """
        self.documents = [doc.text for doc in documents]
        self.embedding_model = embedding_model
        self.document_embeddings = self.embed_documents()
        # Unit-length copy used for scoring; skipped for an empty corpus, where
        # there is nothing to normalize and retrieve() returns early.
        self._unit_document_embeddings = (
            self._unit_rows(self.document_embeddings) if self.documents else None
        )

    def embed_documents(self):
        """Encode all stored document texts and return the raw embedding matrix."""
        # Encode documents using the Sentence Transformers model
        return self.embedding_model.encode(self.documents)

    @staticmethod
    def _unit_rows(vectors):
        """Return ``vectors`` as a 2-D array with every row scaled to unit L2 norm."""
        matrix = np.atleast_2d(np.asarray(vectors))
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Leave all-zero rows at zero (similarity 0) instead of dividing by zero.
        norms[norms == 0] = 1.0
        return matrix / norms

    def retrieve(self, query, top_k=10, similarity_threshold=0.3):
        """Return the texts of the documents most similar to ``query``.

        Args:
            query: Query string to embed and compare against the corpus.
            top_k: Maximum number of documents to return. Values <= 0 yield an
                empty list.
            similarity_threshold: Minimum cosine similarity a document must
                reach to be returned.

        Returns:
            List of document texts ordered from most to least similar;
            empty when nothing passes the threshold or the corpus is empty.
        """
        # Guard: `[-0:]` would select the whole array, and an empty corpus has
        # no embedding matrix to score against.
        if top_k <= 0 or not self.documents:
            return []

        # Encode the query
        query_vector = self._unit_rows(self.embedding_model.encode([query]))[0]

        # Cosine similarity between the query and every document
        similarities = self._unit_document_embeddings @ query_vector
        top_indices = similarities.argsort()[-top_k:][::-1]

        # Filter documents based on similarity threshold
        return [
            self.documents[i]
            for i in top_indices
            if similarities[i] >= similarity_threshold
        ]


In [None]:
# Step 4: Integrate DeepSeek API for generation with strict document adherence
def generate_with_deepseek(query, context):
    """Answer ``query`` with the DeepSeek chat model, restricted to ``context``.

    Args:
        query: The question to answer.
        context: Retrieved text the model is instructed to rely on exclusively.

    Returns:
        The text of the first completion choice, or ``"Not available."`` when
        the API returns a message with no content.

    Raises:
        ValueError: If ``DEEPSEEK_TOKEN`` is unset or empty.
    """
    # Fail early with a clear message. Without an explicit key the OpenAI client
    # falls back to OPENAI_API_KEY, which would then be sent to the DeepSeek host.
    if not DEEPSEEK_TOKEN:
        raise ValueError("DEEPSEEK_API_KEY is not set; cannot call the DeepSeek API.")

    # Single source for the instruction text, which is sent both as the system
    # message and as the prefix of the user message.
    instructions = (
        "You are a helpful assistant. Answer the following question using only the provided context. "
        "Your answer must be concise and no longer than one sentence. "
        "If the context does not provide a clear answer, respond with 'Not available.'"
    )

    # Initialize the DeepSeek client (OpenAI-compatible endpoint)
    client = OpenAI(
        api_key=DEEPSEEK_TOKEN,
        base_url="https://api.deepseek.com/v1",
    )

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": instructions},
            {
                "role": "user",
                "content": f"{instructions} Context: {context}\n\nQuestion: {query}\nAnswer:",
            },
        ],
        temperature=0.3,
        max_tokens=256,
    )

    # The content field may be None; map that to the same fallback wording the
    # prompt asks the model to use, so callers always receive a string.
    answer = response.choices[0].message.content
    return answer if answer is not None else "Not available."


In [None]:
# Step 5: Combine retrieval and generation
class RAGPipeline:
    """Retrieve-then-generate pipeline that returns the answer as a plain string."""

    def __init__(self, retriever):
        # Any object offering retrieve(query) -> list of passage strings.
        self.retriever = retriever

    def run(self, query):
        """Answer ``query`` from the passages the retriever finds for it.

        Returns the text produced by ``generate_with_deepseek``, or a fixed
        fallback message when the retriever comes back empty (no API call is
        made in that case).
        """
        passages = self.retriever.retrieve(query)

        if passages:
            # Passages are joined one per line to form the generation context.
            return generate_with_deepseek(query, "\n".join(passages))

        return "Not available or not in context."

# Now we use the Wikipedia dataset
---------------------------------

In [None]:
from datasets import load_dataset

# Fetch the two configurations of the mini-Wikipedia RAG dataset:
# the passage corpus and the question/answer evaluation split.
corpus_dataset = load_dataset(
    "rag-datasets/rag-mini-wikipedia",
    "text-corpus",
    split="passages",
)
eval_dataset = load_dataset(
    "rag-datasets/rag-mini-wikipedia",
    "question-answer",
    split="test",
)

# Wrap each passage in a Document to form the knowledge base.
corpus_documents = []
for record in corpus_dataset:
    corpus_documents.append(Document(text=record["passage"]))


In [None]:
# Sanity check: print the first two Document objects built from the corpus.
print("Corpus Documents:", corpus_documents[:2])

In [None]:
# Split every corpus document into chunks (size 768, overlap 56) and wrap
# each resulting chunk in its own Document.
parser = SentenceSplitter(chunk_size=768, chunk_overlap=56)
document_chunks = [
    Document(text=piece)
    for source_doc in corpus_documents
    for piece in parser.split_text(source_doc.text)
]



In [None]:
# Step 7: Initialize components
# Reuse the `embedding_model` loaded in the earlier setup cell rather than
# constructing a second SentenceTransformer for the same MiniLM checkpoint,
# which only costs extra load time and memory.
retriever = DocumentRetriever(document_chunks, embedding_model)
rag_pipeline = RAGPipeline(retriever)


In [None]:
# Smoke-test the pipeline end to end with a single question.
# NOTE(review): the question names no subject ("he"), so retrieval has little to
# match on — presumably a placeholder; confirm or replace with a specific question.
query = "When did he die?"
response = rag_pipeline.run(query)
print("Generated Response:\n", response)

---------------------------------
The RAG pipeline is complete up to this point; the evaluation section follows.
---------------------------------
---------------------------------


In [None]:
# Simple Document class to hold text and metadata.
class Document:
    """Minimal container pairing a piece of text with optional metadata.

    Note: defining this class rebinds the name ``Document`` that the first cell
    imports from ``llama_index.core``.
    """

    def __init__(self, text, metadata=None):
        self.text = text
        # Any falsy metadata (None, empty dict, ...) is replaced by a fresh empty dict.
        self.metadata = metadata if metadata else {}

# --- Retriever Component ---
class DocumentRetriever:
    """In-memory dense retriever over a fixed list of documents.

    Each document's text is embedded once at construction time. Queries are
    embedded on demand and ranked against the stored embeddings by cosine
    similarity (both sides are L2-normalized, so the score does not depend on
    whether the embedding model itself returns unit-length vectors).

    Note: this cell redefines ``DocumentRetriever``, replacing the class of the
    same name defined earlier in the notebook.
    """

    def __init__(self, documents, embedding_model):
        """Store the document texts and pre-compute their embeddings.

        Args:
            documents: Iterable of objects exposing a ``text`` attribute.
            embedding_model: Object with an ``encode(list_of_str)`` method that
                returns one embedding row per input string.
        """
        # Store only the text for retrieval purposes.
        self.documents = [doc.text for doc in documents]
        self.embedding_model = embedding_model
        self.document_embeddings = self.embed_documents()
        # Unit-length copy used for scoring; skipped for an empty corpus, where
        # there is nothing to normalize and retrieve() returns early.
        self._unit_document_embeddings = (
            self._unit_rows(self.document_embeddings) if self.documents else None
        )

    def embed_documents(self):
        """Encode all stored document texts and return the raw embedding matrix."""
        # Encode documents once during initialization.
        return self.embedding_model.encode(self.documents)

    @staticmethod
    def _unit_rows(vectors):
        """Return ``vectors`` as a 2-D array with every row scaled to unit L2 norm."""
        matrix = np.atleast_2d(np.asarray(vectors))
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Leave all-zero rows at zero (similarity 0) instead of dividing by zero.
        norms[norms == 0] = 1.0
        return matrix / norms

    def retrieve(self, query, top_k=10, similarity_threshold=0.3):
        """Return the texts of the documents most similar to ``query``.

        Args:
            query: Query string to embed and compare against the corpus.
            top_k: Maximum number of documents to return. Values <= 0 yield an
                empty list.
            similarity_threshold: Minimum cosine similarity a document must
                reach to be returned.

        Returns:
            List of document texts ordered from most to least similar;
            empty when nothing passes the threshold or the corpus is empty.
        """
        # Guard: `[-0:]` would select the whole array, and an empty corpus has
        # no embedding matrix to score against.
        if top_k <= 0 or not self.documents:
            return []

        # Encode query and compute cosine similarities.
        query_vector = self._unit_rows(self.embedding_model.encode([query]))[0]
        similarities = self._unit_document_embeddings @ query_vector
        top_indices = similarities.argsort()[-top_k:][::-1]

        # Filter documents based on similarity threshold.
        return [
            self.documents[i]
            for i in top_indices
            if similarities[i] >= similarity_threshold
        ]

# --- RAG Pipeline ---
class RAGPipeline:
    """Retrieve-then-generate pipeline that also hands back the retrieved passages.

    Unlike the earlier definition of the same name in this notebook, ``run``
    returns an ``(answer, retrieved_docs)`` pair rather than a single string.
    """

    def __init__(self, retriever):
        # Any object offering retrieve(query) -> list of passage strings.
        self.retriever = retriever

    def run(self, query):
        """Answer ``query`` and return ``(answer, retrieved_docs)``.

        When the retriever finds nothing, the answer is a fixed fallback message
        and no generation call is made; the (empty) retrieval result is still
        returned as the second element.
        """
        passages = self.retriever.retrieve(query)

        if passages:
            # Passages are joined one per line to form the generation context.
            answer = generate_with_deepseek(query, "\n".join(passages))
        else:
            answer = "Not available or not in context."

        return answer, passages

In [None]:

import pandas as pd

# Convert the evaluation split loaded earlier into a DataFrame.
# NOTE(review): later cells read row["question"], so a "question" column is expected;
# the remaining columns are not visible from this notebook — confirm against the dataset.
eval_data_df = pd.DataFrame(eval_dataset)  # Convert eval_dataset to a DataFrame

# Save the extracted data to CSV
csv_save_path = "evaluation_data.csv"
eval_data_df.to_csv(csv_save_path, index=False)

print(f"Evaluation dataset saved to {csv_save_path}")


In [None]:
# Rebuild the retriever and pipeline from the classes defined in the evaluation
# cell above; that RAGPipeline.run returns an (answer, retrieved passages) pair.
retriever = DocumentRetriever(document_chunks, embedding_model)
rag_pipeline = RAGPipeline(retriever)

# Trial run on the first two evaluation questions before processing the full set.
eval_df_new = eval_data_df.head(2).copy()

# run() yields two values per row; result_type="expand" spreads them across the
# two new columns.
eval_df_new[["generated_response", "retrieved_context"]] = eval_df_new.apply(
    lambda row: rag_pipeline.run(row["question"]),
    axis=1,
    result_type="expand",
)



In [None]:
# Display the two-row trial frame, including the generated answers and retrieved contexts.
eval_df_new

In [None]:
# Run the RAG pipeline over the complete evaluation set, working on a copy so
# eval_data_df itself stays unchanged.
# NOTE(review): rag_pipeline.run is called once per row, and the CSV is only written
# after apply() finishes — an exception on any row leaves nothing saved.
eval_df_resp_cont = eval_data_df.copy()

# Apply the RAG pipeline to each question and create two new columns:
# "generated_response" and "retrieved_context"
eval_df_resp_cont[["generated_response", "retrieved_context"]] = eval_df_resp_cont.apply(
    lambda row: rag_pipeline.run(row["question"]), axis=1, result_type="expand"
)

# save the new dataframe to a CSV file
eval_df_resp_cont.to_csv("eval_data_with_responses.csv", index=False)


In [None]:
# Confirmation message for the CSV written by the previous cell.
print("Evaluation data with responses saved to eval_data_with_responses.csv")