In [85]:
# Load your doc
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, DirectoryLoader

loader = DirectoryLoader("data", glob="*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [2]:
# Instantaniate embedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [6]:
# Instantaniate LLM
from langchain_community.chat_models import ChatDatabricks

chat_llm = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens=200)

In [7]:
# Document Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1250,
    chunk_overlap = 100,
    length_function = len,
    is_separator_regex = False
)
#
split_docs = text_splitter.split_documents(documents)
print(len(split_docs))

254


In [8]:
# Instantaniate Vectorstore
from langchain_community.vectorstores import Chroma
vectorstore = Chroma(embedding_function=embeddings,
                     persist_directory="/content/drive/MyDrive/Vectorstore/chromadb",
                     collection_name="full_documents")
# Load and persist the split documents into the vectorstore
vectorstore.add_documents(split_docs)
vectorstore.persist()

  warn_deprecated(


In [9]:
# Instantiate the Keyword / Sparse embeddings model
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers import ContextualCompressionRetriever
#
bm25_retriever = BM25Retriever.from_documents(split_docs)
bm25_retriever.k=10

In [10]:
# Instantiate Reranker — Cross Encoders
from __future__ import annotations
from typing import Dict, Optional, Sequence
from langchain.schema import Document
from langchain.pydantic_v1 import Extra, root_validator

from langchain.callbacks.manager import Callbacks
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor

from sentence_transformers import CrossEncoder
# from config import bge_reranker_large

class BgeRerank(BaseDocumentCompressor):
    model_name:str = 'BAAI/bge-reranker-large'
    """Model name to use for reranking."""
    top_n: int = 3
    """Number of documents to return."""
    model:CrossEncoder = CrossEncoder(model_name)
    """CrossEncoder instance to use for reranking."""

    def bge_rerank(self,query,docs):
        model_inputs =  [[query, doc] for doc in docs]
        scores = self.model.predict(model_inputs)
        results = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
        return results[:self.top_n]


    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[Callbacks] = None,
    ) -> Sequence[Document]:
        """
        Compress documents using BAAI/bge-reranker models.

        Args:
            documents: A sequence of documents to compress.
            query: The query to use for compressing the documents.
            callbacks: Callbacks to run during the compression process.

        Returns:
            A sequence of compressed documents.
        """
        if len(documents) == 0:  # to avoid empty api call
            return []
        doc_list = list(documents)
        _docs = [d.page_content for d in doc_list]
        results = self.bge_rerank(query, _docs)
        final_results = []
        for r in results:
            doc = doc_list[r[0]]
            doc.metadata["relevance_score"] = r[1]
            final_results.append(doc)
        return final_results



In [11]:
# Instantiate a Contextual Compression Pipeline
from langchain_community.document_transformers.embeddings_redundant_filter import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_transformers.long_context_reorder import LongContextReorder
from langchain.retrievers.multi_query import MultiQueryRetriever
#
vs_retriever = vectorstore.as_retriever(search_kwargs={"k":10})
#

ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever,vs_retriever],
                                       weight=[0.5,0.5])
#

redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
#
reordering = LongContextReorder()
#
reranker = BgeRerank()
#
pipeline_compressor = DocumentCompressorPipeline(transformers=[redundant_filter,reordering,reranker])
#
compression_pipeline = ContextualCompressionRetriever(base_compressor=pipeline_compressor,
                                                      base_retriever=ensemble_retriever)

In [13]:
# Helper function to display retrieved documents
def pretty_print_docs(docs):
  print(
      f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n + {d.page_content}" for i,d in enumerate(docs)])
  )

docs = compression_pipeline.get_relevant_documents("What is hallucination keep your answer under 30 words?")
pretty_print_docs(docs)

Document 1:

 + 2023.
[544] N. Gillibrand and C. Draper, “Informational sovereignty: A
new framework for ai regulation,” Gillibrand, Nicky, and Chris
Draper.“Informational Sovereignty: A New Framework For AI Reg-
ulation”(July 17, 2023). , 2023.
[545] D. Oba, M. Kaneko, and D. Bollegala, “In-contextual bias suppression
for large language models,” arXiv preprint arXiv:2309.07251 , 2023.
[546] Z. Ji, N. Lee, R. Frieske, T. Yu, D. Su, Y . Xu, E. Ishii, Y . J. Bang,
A. Madotto, and P. Fung, “Survey of hallucination in natural language
generation,” ACM Computing Surveys , vol. 55, no. 12, pp. 1–38, 2023.
[547] J. Greene, “Will ChatGPT Make Lawyers Obsolete? (Hint: Be Afraid),”
Reuters , December 2022.
[548] T. McCoy, E. Pavlick, and T. Linzen, “Right for the wrong reasons:
Diagnosing syntactic heuristics in natural language inference,” in
Proceedings of the 57th Annual Meeting of the Association for Compu-
tational Linguistics , (Florence, Italy), pp. 3428–3448, Association for
Computationa

In [14]:
# Define an Advanced RAG
from langchain.chains import RetrievalQA
#
qa_advanced = RetrievalQA.from_chain_type(llm=chat_llm,
                                 chain_type="stuff",
                                 retriever=compression_pipeline,
                                 return_source_documents=True)
#
qa_adv_response = qa_advanced("What is Negative prompting?")  
qa_adv_response["result"]

  warn_deprecated(


"Negative prompting is a technique used to guide language models, such as the one you're interacting with, to avoid generating certain types of content. It provides specific directions to the model about aspects of the prompt that it should not include or generate during the generation process. This can help fine-tune the results generated by the model while keeping the prompt generic. Negative prompting can also be used to moderate the output content generated by the model, preventing harmful, offensive, or inappropriate content from being generated. It's a way to ensure that the model's responses are safe, accurate, and relevant to the prompt."

In [15]:
qa_adv_response

{'query': 'What is Negative prompting?',
 'result': "Negative prompting is a technique used to guide language models, such as the one you're interacting with, to avoid generating certain types of content. It provides specific directions to the model about aspects of the prompt that it should not include or generate during the generation process. This can help fine-tune the results generated by the model while keeping the prompt generic. Negative prompting can also be used to moderate the output content generated by the model, preventing harmful, offensive, or inappropriate content from being generated. It's a way to ensure that the model's responses are safe, accurate, and relevant to the prompt.",
 'source_documents': [_DocumentWithState(page_content='20\nTABLE VI: Image generation examples\nPrompt: Different famous personalities in roles other than their original ones\nNegative Prompt: blurry, photorealistic\nGenerated Images:\na b c d\nPrompt: Generate an image of Monalisa showing h

In [33]:
evaldf = pd.read_csv(r"C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\combined_testset.csv")
evaldf.head(10)

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What was the revenue decline in mainland China...,"['------- \n \n Thank you, Tim, and g...",The revenue decline in mainland China for the ...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
1,What was the value of Apple's channel inventor...,"[""market now reaching a pretty mature growth b...",The value of Apple's channel inventory reducti...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
2,What is the significance of Apple's 1 billion ...,"[""The services business is powered by our huge...",Apple's 1 billion active devices are a signifi...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
3,What is the significance of the 10 million con...,"[""The services business is powered by our huge...",The 10 million contactless ready locations in ...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
4,What was the date of Apple Inc's Q1 2018 earni...,['Thomson Reuters StreetEvents Event Brief \nE...,The date of Apple Inc's Q1 2018 earnings call ...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
5,What is the purpose of the Thomson Reuters Str...,['Thomson Reuters StreetEvents Event Brief \nE...,The Thomson Reuters StreetEvents Event Brief p...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
6,How has the upgrade rate for Apple devices cha...,"['you\'ve talked about 15% for Q2. <Sync id=""L...",The context does not provide specific informat...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
7,How has the upgrade rate for Apple devices cha...,"['you\'ve talked about 15% for Q2. <Sync id=""L...",The context does not provide specific informat...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
8,How has Apple's focus on creating a great cust...,['driving earnings. How do you think about it?...,Apple's focus on creating a great customer exp...,simple,[{'source': 'C:\\Users\\TanmayRaju\\Documents\...,True
9,1. How might the drive for improvement in larg...,['25\nFig. 10: Demonstration of code generatio...,,multi_context,"[{'source': 'data\\LLM_Final.pdf', 'page': 25}]",True


In [16]:
# Synthetic Test Set Generation
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
#
#load documents again to avoid any kind of bias
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200
)
documents = text_splitter.split_documents(documents)
#
generator_llm = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens = 200)
critic_llm = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens = 200)
embeddings = HuggingFaceEmbeddings()

#
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)
#
testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

Filename and doc_id are the same for all nodes.                   
Generating: 100%|██████████| 10/10 [00:46<00:00,  4.67s/it]


In [17]:
test_df = testset.to_pandas()
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()
test_df.head()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the difference between artificial gene...,"[30\n•Source of Inspiration, Suggestions: LLMs...",Artificial general intelligence (AGI) refers t...,simple,"[{'source': 'data\LLM_Final.pdf', 'page': 30}]",True
1,What is the first step in using large language...,"[human language.\nLastly, third-party plugins ...",The first step in using large language models ...,simple,"[{'source': 'data\LLM_Final.pdf', 'page': 23}]",True
2,How can a question answering system be general...,"[tasks via text generation,” in International ...",A question answering system can be generalized...,simple,"[{'source': 'data\LLM_Final.pdf', 'page': 36}]",True
3,What is the use case for the Language Translat...,[24\nTABLE IX: Some ChatGPT Plugins. This list...,The Language Translation plugin in ChatGPT can...,simple,"[{'source': 'data\LLM_Final.pdf', 'page': 24}]",True
4,How has ChatGPT been applied in the fields of ...,"[vol. 237, no. 8, pp. 1855–1876, 2023.\n[367] ...","According to the provided context, ChatGPT has...",simple,"[{'source': 'data\LLM_Final.pdf', 'page': 38}]",True


In [18]:
from datasets import Dataset

# Generate responses using our Advanced RAG pipeline using the questions we’ve generated.
adv_answers = []
adv_contexts = []

for question in test_questions:
  response = qa_advanced.invoke({"query" : question})
  adv_answers.append(response["result"])
  adv_contexts.append([context.page_content for context in response['source_documents']])

#wrap into huggingface dataset
response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : adv_answers,
    "contexts" : adv_contexts,
    "ground_truth" : test_groundtruths
})
response_dataset_advanced_retrieval[0]

{'question': 'What is the difference between artificial general intelligence (AGI) and the current capabilities of language models like me?',
 'answer': 'The current capabilities of language models like you are impressive, but they are still far from the hypothetical concept of Artificial General Intelligence (AGI). AGI refers to a type of artificial intelligence that has the ability to learn and perform any intellectual task, much like a human. This means that an AGI system would be able to understand, learn, and apply knowledge across a wide range of tasks and domains, and adapt to new situations and environments.\n\nWhile language models like you have made significant contributions to various domains and can perform a variety of tasks, such as solving math problems, writing creative content, and answering questions in an informative way, they still have significant limitations and challenges. For example, you may rely too heavily on surface-level patterns, have limited common sense 

In [39]:
phase2data =response_dataset_advanced_retrieval.to_pandas()

In [49]:
customphase2data =phase2data[["question", "answer"]]

In [68]:
customphase2data = Dataset.from_dict({
    "question" : test_questions,
    "ground_truth" : test_groundtruths
})

customphase2data.to_csv(r'C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\customphase2data.csv', index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 334.61ba/s]


5732

In [67]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness
)

metrics = [
    # faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

advanced_retrieval_results = evaluate(customphase2data, metrics, llm=chat_llm, embeddings=embeddings, raise_exceptions=False)
advanced_retrieval_results

ValueError: The metric [answer_relevancy] that that is used requires the following additional columns ['contexts', 'answer'] to be present in the dataset. 

In [20]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

advanced_retrieval_results = evaluate(response_dataset_advanced_retrieval, metrics, llm=chat_llm, embeddings=embeddings, raise_exceptions=False)
advanced_retrieval_results

Evaluating:  18%|█▊        | 9/50 [00:13<00:53,  1.29s/it]Failed to parse output. Returning None.
Evaluating:  22%|██▏       | 11/50 [00:15<00:39,  1.01s/it]Failed to parse output. Returning None.
Failed to parse output. Returning None.
Evaluating:  24%|██▍       | 12/50 [00:16<00:42,  1.11s/it]Failed to parse output. Returning None.
Evaluating:  26%|██▌       | 13/50 [00:16<00:30,  1.20it/s]Failed to parse output. Returning None.
Evaluating:  32%|███▏      | 16/50 [00:18<00:19,  1.72it/s]Failed to parse output. Returning None.
Evaluating:  34%|███▍      | 17/50 [00:18<00:19,  1.69it/s]Failed to parse output. Returning None.
Evaluating:  40%|████      | 20/50 [00:20<00:16,  1.77it/s]Failed to parse output. Returning None.
Evaluating:  50%|█████     | 25/50 [00:24<00:27,  1.09s/it]Failed to parse output. Returning None.
Evaluating:  62%|██████▏   | 31/50 [00:28<00:15,  1.26it/s]Failed to parse output. Returning None.
Evaluating:  64%|██████▍   | 32/50 [00:29<00:11,  1.59it/s]Failed to p

{'faithfulness': nan, 'answer_relevancy': 0.7251, 'context_recall': 0.9259, 'context_precision': 0.9833, 'answer_correctness': 0.7202}

In [34]:
# Testing our evalframework function

def eval_framework(filepath, customchain, myllm, testsize ):

    # Load your doc
    loader = PyPDFLoader(filepath)
    documents = loader.load()

    # Document Splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200
    )
    documents = text_splitter.split_documents(documents)
    #
    generator_llm = myllm
    critic_llm = myllm
    embeddings = HuggingFaceEmbeddings()

    #
    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )
    #
    testset = generator.generate_with_langchain_docs(documents, test_size=testsize, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

    test_df = testset.to_pandas()
    test_questions = test_df["question"].values.tolist()
    test_groundtruths = test_df["ground_truth"].values.tolist()

    # Generate responses using our Advanced RAG pipeline using the questions we’ve generated.
    adv_answers = []
    adv_contexts = []

    for question in test_questions:
        response = customchain.invoke({"query" : question})
        adv_answers.append(response["result"])
        adv_contexts.append([context.page_content for context in response['source_documents']])

    #wrap into huggingface dataset
    response_dataset_advanced_retrieval = Dataset.from_dict({
        "question" : test_questions,
        "answer" : adv_answers,
        "contexts" : adv_contexts,
        "ground_truth" : test_groundtruths
    })

    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        answer_correctness,
    ]

    advanced_retrieval_results = evaluate(response_dataset_advanced_retrieval, metrics, llm=myllm, embeddings=embeddings, raise_exceptions=False)
    
    return advanced_retrieval_results


In [36]:
# testing evalframework
scoreoutput = eval_framework(filepath="C:/Users/TanmayRaju/Documents/GitHub/AdvRAG/new/data/LLM_Final.pdf", customchain=qa_advanced, myllm=chat_llm, testsize=2)
print(scoreoutput)

Filename and doc_id are the same for all nodes.                   
Generating: 100%|██████████| 2/2 [00:06<00:00,  3.42s/it]
Evaluating:  60%|██████    | 6/10 [00:06<00:03,  1.07it/s]Failed to parse output. Returning None.
Failed to parse output. Returning None.
Evaluating:  70%|███████   | 7/10 [00:11<00:05,  1.97s/it]Failed to parse output. Returning None.
Evaluating:  90%|█████████ | 9/10 [00:14<00:01,  1.79s/it]Failed to parse output. Returning None.
Evaluating: 100%|██████████| 10/10 [00:15<00:00,  1.52s/it]
  value = np.nanmean(self.scores[cn])


{'faithfulness': nan, 'answer_relevancy': 0.9069, 'context_recall': 0.8333, 'context_precision': 0.9167, 'answer_correctness': 0.2283}


In [90]:
# Testing our V2 evalframework function

def eval_frameworkv2(filepath, groundtruthdataset, customchain, myllm):

    # Load your doc
    loader = PyPDFLoader(filepath)
    documents = loader.load()

    # Document Splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200
    )
    documents = text_splitter.split_documents(documents)
    #
    embeddings = HuggingFaceEmbeddings()
    #
    usertestdf = pd.read_csv(groundtruthdataset)
    test_questions = usertestdf["question"].values.tolist()
    test_groundtruths = usertestdf["ground_truth"].values.tolist()

    # Generate responses using our Advanced RAG pipeline using the questions we’ve generated.
    adv_answers = []
    adv_contexts = []

    for question in test_questions:
        response = customchain.invoke({"query" : question})
        adv_answers.append(response["result"])
        adv_contexts.append([context.page_content for context in response['source_documents']])

    #wrap into huggingface dataset
    response_dataset_advanced_retrieval = Dataset.from_dict({
        "question" : test_questions,
        "answer" : adv_answers,
        "contexts" : adv_contexts,
        "ground_truth" : test_groundtruths
    })

    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        answer_correctness,
    ]

    advanced_retrieval_results = evaluate(response_dataset_advanced_retrieval, metrics, llm=myllm, embeddings=embeddings, raise_exceptions=False)
    
    return advanced_retrieval_results


In [78]:
# testing evalframework V2
scoreoutputv2 = eval_frameworkv2(filepath="C:/Users/TanmayRaju/Documents/GitHub/AdvRAG/new/data/LLM_Final.pdf", groundtruthdataset="C:/Users/TanmayRaju/Documents/GitHub/AdvRAG/new/data/customphase2data.csv", customchain=qa_advanced, myllm=chat_llm)
print(scoreoutputv2)

Evaluating:  30%|███       | 15/50 [00:16<00:39,  1.14s/it]Failed to parse output. Returning None.
Evaluating:  32%|███▏      | 16/50 [00:18<00:49,  1.45s/it]Failed to parse output. Returning None.
Evaluating:  34%|███▍      | 17/50 [00:20<00:47,  1.43s/it]Failed to parse output. Returning None.
Evaluating:  36%|███▌      | 18/50 [00:21<00:41,  1.29s/it]Failed to parse output. Returning None.
Evaluating:  44%|████▍     | 22/50 [00:22<00:18,  1.55it/s]Failed to parse output. Returning None.
Evaluating:  46%|████▌     | 23/50 [00:23<00:18,  1.44it/s]Failed to parse output. Returning None.
Evaluating:  48%|████▊     | 24/50 [00:24<00:22,  1.14it/s]Failed to parse output. Returning None.
Evaluating:  58%|█████▊    | 29/50 [00:28<00:16,  1.27it/s]Failed to parse output. Returning None.
Evaluating:  64%|██████▍   | 32/50 [00:30<00:15,  1.15it/s]Failed to parse output. Returning None.
Evaluating:  68%|██████▊   | 34/50 [00:31<00:11,  1.36it/s]Failed to parse output. Returning None.
Evaluating

{'faithfulness': nan, 'answer_relevancy': 0.7483, 'context_recall': 1.0000, 'context_precision': 1.0000, 'answer_correctness': 0.3779}


In [None]:
# code for generating question for evaluation df

import pandas as pd 

def testdf_generator(myllm, testsize ):

    # Load your doc
    loader1 = PyPDFLoader(r"C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\2016-Apr-26-AAPL.pdf")
    loader2 = PyPDFLoader(r"C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\2017-Aug-01-AAPL.pdf")
    loader3 = PyPDFLoader(r"C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\2018-Feb-01-AAPL.pdf")
    loader4 = PyPDFLoader(r"C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\2019-Apr-30-AAPL.pdf")
    loader5 = PyPDFLoader(r"C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\2020-Apr-30-AAPL.pdf")

    document1 = loader1.load()
    document2 = loader2.load()
    document3 = loader3.load()
    document4 = loader4.load()
    document5 = loader5.load()

    # Document Splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200
    )
    documents1 = text_splitter.split_documents(document1)
    documents2 = text_splitter.split_documents(document2)
    documents3 = text_splitter.split_documents(document3)
    documents4 = text_splitter.split_documents(document4)
    documents5 = text_splitter.split_documents(document5)
    #
    generator_llm = myllm
    critic_llm = myllm
    embeddings = HuggingFaceEmbeddings()

    #
    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )
    #
    testset1 = generator.generate_with_langchain_docs(documents1, test_size=testsize, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
    testset2 = generator.generate_with_langchain_docs(documents2, test_size=testsize, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
    testset3 = generator.generate_with_langchain_docs(documents3, test_size=testsize, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
    testset4 = generator.generate_with_langchain_docs(documents4, test_size=testsize, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
    testset5 = generator.generate_with_langchain_docs(documents5, test_size=testsize, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

    testset1 = testset1.to_pandas()
    testset2 = testset2.to_pandas()
    testset3 = testset3.to_pandas()
    testset4 = testset4.to_pandas()
    testset5 = testset5.to_pandas()

     # Concatenate the DataFrames
    evaldf = pd.concat([testset1, testset2, testset3, testset4, testset5], ignore_index=True)

    # Save the combined dataset to a new CSV file
    evaldf.to_csv(r'C:\Users\TanmayRaju\Documents\GitHub\AdvRAG\new\data\combined_testset.csv', index=False)

    print("Saved final evaldf")


eval = testdf_generator(myllm=chat_llm, testsize=2)

In [None]:
# Testing our evalframework function
def allfile(filepath, customchain, myllm, testsize ):

    # Load your doc
    loader = DirectoryLoader(filepath, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()

    # Document Splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200
    )
    documents = text_splitter.split_documents(documents)
    #
    generator_llm = myllm
    critic_llm = myllm
    embeddings = HuggingFaceEmbeddings()

    #
    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )
    #
    testset = generator.generate_with_langchain_docs(documents, test_size=testsize, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

    test_df = testset.to_pandas()
    test_questions = test_df["question"].values.tolist()
    test_groundtruths = test_df["ground_truth"].values.tolist()

    # Generate responses using our Advanced RAG pipeline using the questions we’ve generated.
    adv_answers = []
    adv_contexts = []

    for question in test_questions:
        response = customchain.invoke({"query" : question})
        adv_answers.append(response["result"])
        adv_contexts.append([context.page_content for context in response['source_documents']])

    #wrap into huggingface dataset
    response_dataset_advanced_retrieval = Dataset.from_dict({
        "question" : test_questions,
        "answer" : adv_answers,
        "contexts" : adv_contexts,
        "ground_truth" : test_groundtruths
    })

    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        answer_correctness,
    ]

    advanced_retrieval_results = evaluate(response_dataset_advanced_retrieval, metrics, llm=myllm, embeddings=embeddings, raise_exceptions=False)
    
    return advanced_retrieval_results

scoreoutput = allfile(filepath="data", customchain=qa_advanced, myllm=chat_llm, testsize=1)
print(scoreoutput)

In [None]:
loader = DirectoryLoader("data", glob="*.docx", loader_cls=PyPDFLoader)
documents = loader.load()