In [7]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
import glob
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter,  RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
pd.set_option("display.max_colwidth", None)

In [3]:
from langchain_community.llms.ollama import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel

#### Load gemma using Ollama

In [4]:
llm = Ollama(model="gemma:2b")

#### Read PDF and create chunks

In [9]:
def get_chunks():
    all_files = glob.glob(r"C:\Users\tyagi\Desktop\FinanceFusionRAG\data\test/*.pdf")
    all_chunk = []
    for file_path in all_files:
        loader = PyMuPDFLoader(file_path)
        doc = loader.load()
        text_splitter =  RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=75,separators=["\n\n", "\n", ".", " ", ""])
        chunks = text_splitter.split_documents(doc)
        all_chunk+=chunks
    return all_chunk

In [10]:
docs_processed=get_chunks()

#### Create a synthetic dataset for evaluation

In [17]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

def generate_qna(context):
    prompt = PromptTemplate.from_template(QA_generation_prompt)
    chain = prompt | llm | StrOutputParser() 
    return chain.invoke({"context":context})

In [18]:
import random

N_GENERATIONS = 10  # We intentionally generate only 10 QA couples here for cost and time considerations

print(f"Generating {N_GENERATIONS} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, N_GENERATIONS)):
    # Generate QA couple
    output_QA_couple = generate_qna(context=sampled_context.page_content)
    try:
        question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
        answer = output_QA_couple.split("Answer: ")[-1]
        assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "context": sampled_context.page_content,
                "question": question,
                "answer": answer,
                "source_doc": sampled_context.metadata["source"],
            }
        )
    except:
        continue


Generating 10 QA couples...


  0%|          | 0/10 [00:00<?, ?it/s]

In [20]:
outputs

[{'context': 'lower the cost to borrow. AAA/Aaa ratings are the highest ratings issued by the credit-rating agencies\nand likely result in the lowest borrowing costs or yields. Investors seeking a better return should look\ndown the credit-ratings scale for bond issuers with lower ratings and higher yields.',
  'question': 'Which credit-rating agency issues AAA/Aaa ratings?\n\n',
  'answer': 'The context does not specify which credit-rating agency issues AAA/Aaa ratings, so I cannot answer this question from the context.',
  'source_doc': 'C:\\Users\\tyagi\\Desktop\\FinanceFusionRAG\\data\\test\\Investopedia_A_What_Is_Aaa.pdf'},
 {'context': "emerging market to an advanced economy , there is substantial demand for Chinese equity . Stock\nexchange regulators continue efforts to make A-shares more broadly available to foreign investors and\nhave them recognized by the global investing community. In June 2017, the MSCI Emerging Markets\nIndex announced a two-phase plan in which it would g

#### Setup critique agents to check the quality of generated guestions

In [19]:
#Groundedness

In [None]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

In [None]:
#relevance

In [None]:
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [None]:
#standalone

In [None]:
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """