In [1]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
import glob
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter,  RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
pd.set_option("display.max_colwidth", None)

In [2]:
from langchain_community.llms.ollama import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel

In [3]:
import os

In [26]:
os.environ["OPENAI_API_KEY"]="type ur open ai api"

In [5]:
from langchain_openai import OpenAI

In [6]:
llm=OpenAI()

In [18]:
def call_llm(prompt):
    return llm.invoke(prompt)

#### Load gemma using Ollama

In [8]:
llm = Ollama(model="gemma:2b")

#### Read PDF and create chunks

In [20]:
from cleantext import clean

In [35]:
def clean_text(text):
    return clean(text,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    lang="en"                       
)

In [36]:
def get_chunks():
    all_files = glob.glob(r"D:\projects\FinanceFusionRAG\data\test/*.pdf")
    all_chunk = []
    for file_path in all_files:
        loader = PyMuPDFLoader(file_path)
        doc = loader.load()
        text_splitter =  RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100,separators=["\n\n", "\n", ". ", " ", ""])
        chunks = text_splitter.split_documents(doc)
        all_chunk+=chunks
    return all_chunk

In [37]:
docs_processed=get_chunks()

In [38]:
for doc in docs_processed:
    doc.page_content = clean_text(doc.page_content)

#### Generate a synthetic dataset for evaluation

In [39]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

def generate_qna(context):
    prompt = PromptTemplate.from_template(QA_generation_prompt)
    chain = prompt | llm | StrOutputParser() 
    return chain.invoke({"context":context})

In [42]:
import random

N_GENERATIONS = 30 

print(f"Generating {N_GENERATIONS} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, N_GENERATIONS)):
    # Generate QA couple
    output_QA_couple = generate_qna(context=sampled_context.page_content)
    try:
        question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
        answer = output_QA_couple.split("Answer: ")[-1]
        assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "context": sampled_context.page_content,
                "question": question,
                "answer": answer,
                "source_doc": sampled_context.metadata["source"],
            }
        )
    except:
        continue


Generating 30 QA couples...


  0%|          | 0/30 [00:00<?, ?it/s]

In [43]:
outputs

[{'context': "debt with a cash-out refinance can lower the back-end ratio. however, because lenders incur greater risk on a cash-out refinance, the interest rate is often slightly higher versus a standard rate-term refinance to compensate for the higher risk. in addition, many lenders require a borrower paying off the revolving debt in a cash-out refinance to close the debt accounts being paid off, lest they run the balance back up. what are the back-end ratio requirements? typically, lenders want to see a back-end ratio of at least 36%, although some lenders may allow for a higher back-end ratio. for example, some lenders may allow for a maximum back-end ratio of 43%. what is a front-end ratio? a front-end ratio is the percentage of your housing expenses in proportion to your total income. to calculate a front-end ratio, divide your total housing expenses, including your mortgage payment, property taxes, mortgage insurance, and homeowner's association fees by your total income. what i

In [12]:
#with open("dataset.json", "w") as f:
#    f.write(json.dumps(outputs))
#   f.close()
with open("dataset.json", "r") as f:
    outputs = json.load(f)

#### Setup critique agents to check the quality of generated guestions

In [16]:
# Groundedness Prompt
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

# Relevance Prompt 
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """


# Standalone Prompt

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [19]:
print("Generating critique for each QA couple...")
for output in tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(
            question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
        ),
        "relevance": call_llm(
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_llm(
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }
    try:
        for criterion, evaluation in evaluations.items():
            score, eval = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
            output.update(
                {
                    f"{criterion}_score": score,
                    f"{criterion}_eval": eval,
                }
            )
    except Exception as e:
        continue

Generating critique for each QA couple...


  0%|          | 0/26 [00:00<?, ?it/s]

In [20]:
generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)
generated_questions = generated_questions.loc[
    (generated_questions["groundedness_score"] >= 3)
    & (generated_questions["relevance_score"] >= 3)
    & (generated_questions["standalone_score"] >= 3)
]
print("============================================")
print("Final evaluation dataset:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)


Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,What is the minimum back-end ratio required by lenders for a cash-out refinance?\n\n,36%,3,1.0,2.0
1,"According to the context, what is the minimum front-end ratio required by lenders for mortgage approval?\n\n",28%,5,3.0,1.0
2,"**Factoid question:** What is the way that backdoor roth iras work?\n\n**Answer:** Backdoor roth iras work by allowing higher-income taxpayers to contribute to a Roth IRA using after-tax dollars, and then recharacterize those contributions back to a traditional IRA contribution.","**Factoid question:** What is the way that backdoor roth iras work?\n\n**Answer:** Backdoor roth iras work by allowing higher-income taxpayers to contribute to a Roth IRA using after-tax dollars, and then recharacterize those contributions back to a traditional IRA contribution.",4,3.0,5.0
3,**Factoid question:** What is a backdoor Roth IRA?\n\n**Answer:** A backdoor Roth IRA is a strategy rather than an official type of individual retirement account that allows high-income earners to convert their traditional IRA to a Roth IRA without paying taxes on any contributions.,**Factoid question:** What is a backdoor Roth IRA?\n\n**Answer:** A backdoor Roth IRA is a strategy rather than an official type of individual retirement account that allows high-income earners to convert their traditional IRA to a Roth IRA without paying taxes on any contributions.,5,3.0,5.0
4,"According to the context, what was the number of babies born in the united states during the baby boom?\n\n",76 million,4,1.0,1.0
5,"According to the context, what is the advantage of the backdoor roth IRA strategy for high earners?\n\n","The backdoor roth IRA strategy allows high earners to save significant tax amounts by holding their IRAs forever and passing them on to their heirs, while avoiding the required minimum distribution age.",5,1.0,1.0
6,What is the taxable amount of money in a traditional IRA that has not been taxed?\n\n\n,The taxable amount of money in a traditional IRA that has not been taxed is the amount contributed to the IRA.,3,3.0,5.0
7,How long did parents have to make monthly contributions for their child to receive the guaranteed minimum amount tax-free upon turning 18?\n\n,10 years,5,1.0,1.0
8,What is the main advantage of using backflush costing over traditional costing systems?\n\n,"Backflush costing allows companies to work backward, calculating the costs of products after they're sold, finished, or shipped.",4,2.0,5.0
9,Which type of letter of credit is more secure for an exporter and less secure for an importer?\n\n,A transferable letter of credit is more secure for an exporter and less secure for an importer.,2,4.0,3.0


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
2,"**Factoid question:** What is the way that backdoor roth iras work?\n\n**Answer:** Backdoor roth iras work by allowing higher-income taxpayers to contribute to a Roth IRA using after-tax dollars, and then recharacterize those contributions back to a traditional IRA contribution.","**Factoid question:** What is the way that backdoor roth iras work?\n\n**Answer:** Backdoor roth iras work by allowing higher-income taxpayers to contribute to a Roth IRA using after-tax dollars, and then recharacterize those contributions back to a traditional IRA contribution.",4,3.0,5.0
3,**Factoid question:** What is a backdoor Roth IRA?\n\n**Answer:** A backdoor Roth IRA is a strategy rather than an official type of individual retirement account that allows high-income earners to convert their traditional IRA to a Roth IRA without paying taxes on any contributions.,**Factoid question:** What is a backdoor Roth IRA?\n\n**Answer:** A backdoor Roth IRA is a strategy rather than an official type of individual retirement account that allows high-income earners to convert their traditional IRA to a Roth IRA without paying taxes on any contributions.,5,3.0,5.0
6,What is the taxable amount of money in a traditional IRA that has not been taxed?\n\n\n,The taxable amount of money in a traditional IRA that has not been taxed is the amount contributed to the IRA.,3,3.0,5.0
10,"**Factoid question:**\n\nWhat is the difference between issuing bonds for a $4 million value and issuing bonds for a $400 face value?\n\n\n**Answer:**\n\nThe issuer can sell 4,000 bond certificates for a $4 million value, but they can sell 10,000 bonds for a $400 face value on the markets.","**Factoid question:**\n\nWhat is the difference between issuing bonds for a $4 million value and issuing bonds for a $400 face value?\n\n\n**Answer:**\n\nThe issuer can sell 4,000 bond certificates for a $4 million value, but they can sell 10,000 bonds for a $400 face value on the markets.",5,4.0,5.0
25,**Factoid question:**\n\nWhat is the advantage of backflush costing over traditional costing methods?\n\n**Answer:** Backflush costing allows companies to save time and reduce their expenses by eliminating work-in-process (wip) accounts and simplifying the accounting process.,**Factoid question:**\n\nWhat is the advantage of backflush costing over traditional costing methods?\n\n**Answer:** Backflush costing allows companies to save time and reduce their expenses by eliminating work-in-process (wip) accounts and simplifying the accounting process.,3,3.0,5.0


In [24]:
generated_questions.to_csv("final_dataset.csv", index=False)