In [None]:
!pip install langchain-google-genai

In [1]:
import os
from getpass import getpass

# Do not hardcode the Gemini key in the notebook (it would end up in version
# control) and do not overwrite a key that is already exported in the shell.
# Prompt for it interactively only when it is missing.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Gemini API key: ")

In [None]:
import pandas as pd
# Source articles for QA generation. Downstream cells read the "text" and
# "labels" columns of this frame.
sampled_data=pd.read_excel("datasets/source_data/sampled_data.xlsx")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangchainDocument


def getLangchainDocs(dfvals):
    """Turn every row of `dfvals` into a LangchainDocument.

    Each row's "text" value becomes the document's page_content and its
    "labels" value is stored in the metadata under the "category" key.
    Returns the documents as a list, in row order.
    """
    return [
        LangchainDocument(
            page_content=row["text"],
            metadata={'category': row["labels"]},
        )
        for _, row in dfvals.iterrows()
    ]

# Splitter configuration: chunk_size 2000 with an overlap of 200, trying the
# separators in the listed order and recording each chunk's start index.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=2000,
    chunk_overlap=200,
    add_start_index=True,
)

langchain_docs = getLangchainDocs(sampled_data)

# Split the documents one at a time and flatten the resulting chunks into a
# single list, keeping the original document order.
docs_processed = [
    chunk
    for source_doc in langchain_docs
    for chunk in text_splitter.split_documents([source_doc])
]

In [None]:
# Prompt template for generating one factoid question/answer pair from a chunk.
# It is filled with str.format(context=...); the doubled braces {{ }} are
# format escapes that render as literal JSON braces in the final prompt.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Please provide a response in a structured JSON format that matches the following format:
{{
  "question": <your factoid question>,
  "answer": <your answer to the factoid question>
}}

Now here is the context.

Context: {context}\n
Output:::"""

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel

# Model used to generate the QA pairs.
# NOTE(review): "gemini-2.5-flash-preview-05-20" is a dated preview model id —
# confirm it is still available before re-running.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20")

# Schema of one generated QA pair. Documented with comments rather than a class
# docstring so that the schema handed to the model stays exactly as it was.
class FinalResponse(BaseModel):
    question: str  # the generated factoid question
    answer: str  # the answer to that question

# Bind the schema to the model; the generation loop reads .question and .answer
# from whatever this wrapper's invoke() returns.
structured_llm = llm.with_structured_output(FinalResponse)
def call_llm(prompt):
    """Send `prompt` to the structured QA-generation model.

    Returns whatever `structured_llm.invoke` produces, unchanged.
    """
    return structured_llm.invoke(prompt)

# **Question Generation Agent**

In [None]:
import random,tqdm

N_GENERATIONS = 300
RANDOM_SEED = 42  # fixed seed so a re-run draws the same chunks

# random.sample raises ValueError when asked for more items than exist, so
# cap the request at the number of available chunks.
n_to_generate = min(N_GENERATIONS, len(docs_processed))

print(f"Generating {n_to_generate} QA couples...")

outputs = []
# A dedicated Random instance keeps the draw reproducible without touching
# the global random state.
sampled_docs = random.Random(RANDOM_SEED).sample(docs_processed, n_to_generate)
for sampled_context in tqdm.tqdm(sampled_docs):
    try:
        # The LLM call lives inside the try block: a single API error or
        # unparsable response must skip this chunk, not abort the whole run
        # and discard every pair generated so far.
        output_QA_couple = call_llm(QA_generation_prompt.format(context=sampled_context.page_content))
        question = output_QA_couple.question
        answer = output_QA_couple.answer
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if len(answer) >= 300:
            print("Answer is too long")
            continue
        outputs.append(
            {
                "context": sampled_context.page_content,
                "question": question,
                "answer": answer,
                "source_category": sampled_context.metadata["category"],
            }
        )
    except Exception as e:
        # Best-effort generation: report the failure and move on to the next chunk.
        print(e)
        continue

Generating 300 QA couples...


In [None]:
# Persist the generated QA pairs so later cells can reload them without
# calling the LLM again.
synthetic_qna_df = pd.DataFrame(outputs)
synthetic_qna_df.to_parquet("datasets/output_files/synthetic_qna_pairs.parquet")

## **Critiquing Synthetically Generated QA Pairs**

In [2]:
import pandas as pd
# Reload the saved QA pairs. From here on `outputs` is a DataFrame, not the
# list of dicts built by the generation loop.
outputs=pd.read_parquet("datasets/output_files/synthetic_qna_pairs.parquet")

In [3]:
# Prompt template asking for a 1-5 rating of how well the question can be
# answered from the context. Filled with str.format(question=..., context=...);
# the doubled braces {{ }} render as literal JSON braces.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Please provide a response in a structured JSON format that matches the following format:
{{
  "evaluation": <your rationale for the rating, as a text>,
  "rating": <your rating, as a number between 1 and 5>
}}

here are the question and context.
Question: {question}\n
Context: {context}\n
"""

# Prompt template asking for a 1-5 rating of how understandable the question is
# without any surrounding context. Filled with str.format(question=...); the
# doubled braces {{ }} render as literal JSON braces.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.
For instance, "When does the tournament start?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Please provide a response in a structured JSON format that matches the following format:
{{
  "evaluation": <your rationale for the rating, as a text>,
  "rating": <your rating, as a number between 1 and 5>
}}

Now here is the question.
Question: {question}\n
"""

In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel

# Model used to critique the generated QA pairs (a different model from the
# one that generated them).
critique_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
# Schema of one critique. Documented with comments rather than a class
# docstring so that the schema handed to the model stays exactly as it was.
class CritiqueResponse(BaseModel):
    evaluation: str  # the rationale for the rating
    rating: str  # typed as a string here; a later cell casts the scores to int

# Bind the schema to the critique model; callers read .rating and .evaluation
# from whatever this wrapper's invoke() returns.
structured_llm_critique = critique_llm.with_structured_output(CritiqueResponse)

def call_critique_llm(prompt):
    """Send `prompt` to the structured critique model.

    Returns whatever `structured_llm_critique.invoke` produces, unchanged.
    """
    return structured_llm_critique.invoke(prompt)

In [5]:
# Preview the first rows of the reloaded QA pairs.
outputs.head()

Unnamed: 0,context,question,answer,source_category
0,UK youth 'interested' in politics\n\nThe major...,What percentage of 16 to 20-year-olds in the U...,81%,politics
1,Portishead back after eight years\n\nCult Brit...,When did Portishead win the Mercury Music Prize?,1995,entertainment
2,". ""Animals hit but not killed would without do...",What animals are covered by Texas hunting laws...,"State laws on hunting only covered ""regulated ...",tech
3,Howard rejects BNP's claim\n\nTory leader Mich...,What is the slogan for the Conservative Party'...,"""It's not racist to impose limits on immigration""",politics
4,Wada will appeal against ruling\n\nThe World A...,Who is the chairman of Wada?,Dick Pound,sport


In [6]:
# Spot-check the groundedness critique on a single QA pair. The context and the
# question must come from the SAME row; pairing the context of one row with the
# question of another rates an unrelated pair and says nothing about the data.
spot_check_row = outputs.iloc[18]
call_critique_llm(
    question_groundedness_critique_prompt.format(
        context=spot_check_row["context"],
        question=spot_check_row["question"],
    )
)

CritiqueResponse(evaluation='The context explicitly states that the G8 countries will meet at Gleneagles in Scotland. Therefore, the question is clearly and unambiguously answerable.', rating='5')

In [8]:
print("Generating critique for each QA couple...")
# Work on a copy so the critique columns are not added to `outputs` itself.
critiqued_outputs=outputs.copy()

def get_Critique_Score(critiqued_outputs):
    """Critique one QA pair for groundedness and for standalone clarity.

    `critiqued_outputs` is a single record exposing "context" and "question"
    by key. The groundedness critique is requested first, then the standalone
    critique.

    Returns a 4-tuple:
    (groundedness rating, groundedness evaluation,
     standalone rating, standalone evaluation).
    """
    question_text = critiqued_outputs["question"]

    groundedness_prompt = question_groundedness_critique_prompt.format(
        context=critiqued_outputs["context"],
        question=question_text,
    )
    grounded = call_critique_llm(groundedness_prompt)

    standalone_prompt = question_standalone_critique_prompt.format(question=question_text)
    standalone = call_critique_llm(standalone_prompt)

    return (
        grounded.rating,
        grounded.evaluation,
        standalone.rating,
        standalone.evaluation,
    )


# Critique every row (two LLM calls per row), then transpose the per-row
# 4-tuples into four column-wise sequences.
critique_rows = critiqued_outputs.apply(get_Critique_Score, axis=1)
grounded_scores, grounded_evals, standalone_scores, standalone_evals = zip(*critique_rows)

critiqued_outputs['groundedness_score'] = grounded_scores
critiqued_outputs['groundedness_eval'] = grounded_evals
critiqued_outputs['standalone_score'] = standalone_scores
critiqued_outputs['standalone_eval'] = standalone_evals

Generating critique for each QA couple...


In [9]:
# Display the QA pairs together with the four critique columns added above.
critiqued_outputs

Unnamed: 0,context,question,answer,source_category,groundedness_score,groundedness_eval,standalone_score,standalone_eval
0,UK youth 'interested' in politics\n\nThe major...,What percentage of 16 to 20-year-olds in the U...,81%,politics,5,The context directly answers the question. It ...,5,The question is self-contained and does not de...
1,Portishead back after eight years\n\nCult Brit...,When did Portishead win the Mercury Music Prize?,1995,entertainment,5,The context states that Portishead won a Mercu...,5,The question is self-contained and does not de...
2,". ""Animals hit but not killed would without do...",What animals are covered by Texas hunting laws...,"State laws on hunting only covered ""regulated ...",tech,5,The context explicitly mentions that Texas hun...,5,The question is perfectly self-contained. It e...
3,Howard rejects BNP's claim\n\nTory leader Mich...,What is the slogan for the Conservative Party'...,"""It's not racist to impose limits on immigration""",politics,5,The question asks for the slogan of the Conser...,5,The question is self-contained and does not de...
4,Wada will appeal against ruling\n\nThe World A...,Who is the chairman of Wada?,Dick Pound,sport,5,The context clearly states that Dick Pound is ...,5,The question is self-contained and does not de...
...,...,...,...,...,...,...,...,...
295,The Liberal Democrats say in the northern citi...,When is the upcoming general election widely t...,May 5,politics,5,The context explicitly states that the upcomin...,2,"The question requires some external context, s..."
296,House prices drop as sales slow\n\nHouse price...,Which regions in the UK experienced the larges...,The Midlands and South.,business,5,"The context mentions that ""Around the UK, the ...",5,The question is self-contained and doesn't rel...
297,Dozens held over ID fraud site\n\nTwenty-eight...,What was the name of the website involved in i...,Shadowcrew.com,tech,4,The question asks for the name of the website ...,5,The question is about the name of a website in...
298,Monitoring firm Netcraft analysed response tim...,What is Graham Cluley's job title?,Graham Cluley is a senior technology consultan...,tech,5,The context mentions Graham Cluley and his aff...,5,The question is self-contained and does not re...


In [10]:
# Re-wrap the critiqued frame under the name the remaining cells use, then
# preview its first rows.
critiqued_outputs_df=pd.DataFrame(critiqued_outputs)
critiqued_outputs_df.head()

Unnamed: 0,context,question,answer,source_category,groundedness_score,groundedness_eval,standalone_score,standalone_eval
0,UK youth 'interested' in politics\n\nThe major...,What percentage of 16 to 20-year-olds in the U...,81%,politics,5,The context directly answers the question. It ...,5,The question is self-contained and does not de...
1,Portishead back after eight years\n\nCult Brit...,When did Portishead win the Mercury Music Prize?,1995,entertainment,5,The context states that Portishead won a Mercu...,5,The question is self-contained and does not de...
2,". ""Animals hit but not killed would without do...",What animals are covered by Texas hunting laws...,"State laws on hunting only covered ""regulated ...",tech,5,The context explicitly mentions that Texas hun...,5,The question is perfectly self-contained. It e...
3,Howard rejects BNP's claim\n\nTory leader Mich...,What is the slogan for the Conservative Party'...,"""It's not racist to impose limits on immigration""",politics,5,The question asks for the slogan of the Conser...,5,The question is self-contained and does not de...
4,Wada will appeal against ruling\n\nThe World A...,Who is the chairman of Wada?,Dick Pound,sport,5,The context clearly states that Dick Pound is ...,5,The question is self-contained and does not de...


In [11]:
# Persist the critiqued pairs so the filtering cells can run without repeating
# the critique LLM calls.
critiqued_outputs_df.to_parquet("datasets/output_files/critiqued_qna_pairs.parquet")

In [12]:
import pandas as pd
# Reload the critiqued pairs from disk.
critiqued_outputs_df=pd.read_parquet("datasets/output_files/critiqued_qna_pairs.parquet")

In [13]:
# Cast both critique scores to integers so they can be compared numerically
# when filtering.
for score_column in ("groundedness_score", "standalone_score"):
    critiqued_outputs_df[score_column] = critiqued_outputs_df[score_column].astype(int)

## **Filtering based on critique evaluations**

In [4]:
# Example inspected below: the question stored at position 15.
critiqued_outputs_df.iloc[15]["question"]

'What is the name of the steel mill targeted in the privatization review?'

In [14]:
# Full record at position 15, including both critique scores and evaluations.
critiqued_outputs_df.iloc[15]

Unnamed: 0,15
context,Mr Yushchenko became president after two elect...
question,What is the name of the steel mill targeted in...
answer,Krivorizhstal
source_category,business
groundedness_score,5
groundedness_eval,The question is clearly and unambiguously answ...
standalone_score,1
standalone_eval,The question refers to a 'privatization review...


In [5]:
# Source context the question at position 15 was generated from.
critiqued_outputs_df.iloc[15]["context"]

'Mr Yushchenko became president after two elections in December, the first of which was annulled amid allegations of voting irregularities and massive street protests.\n\nHis opponent, Viktor Yanukovich, still has huge support in the country\'s eastern industrial heartland. Mr Yushchenko\'s administration has accused its predecessor, led by ex-President Leonid Kuchma, of corruption. The privatisation review\'s number one target is a steel mill sold to a consortium which included Viktor Pinchuk, Mr Kuchma\'s son-in-law, for $800m (£424m) despite higher bids from several foreign groups. The mill, Krivorizhstal, is one of the world\'s most profitable. "We say Krivorizhstal was stolen, and at any cost we will return it to the state," Mr Yushchenko told an investors\' conference in Kiev.\n\nOne of the jilted bidders, Netherlands-based group LNM, said it welcomed the possibility that the mill might be back on the market.\n\n"If the original privatisation is annulled and a new tender issued, 

In [15]:
# Keep only the pairs that received the top rating (5) on both critiques.
is_fully_grounded = critiqued_outputs_df["groundedness_score"] == 5
is_fully_standalone = critiqued_outputs_df["standalone_score"] == 5
critiqued_outputs_df_filtered = critiqued_outputs_df[is_fully_grounded & is_fully_standalone]

In [16]:
# Persist the filtered QA pairs.
critiqued_outputs_df_filtered.to_parquet("datasets/output_files/critiqued_qna_pairs_filtered.parquet")

In [17]:
# Display the filtered pairs; the index keeps the original row labels, so it
# has gaps where rows were dropped.
critiqued_outputs_df_filtered

Unnamed: 0,context,question,answer,source_category,groundedness_score,groundedness_eval,standalone_score,standalone_eval
0,UK youth 'interested' in politics\n\nThe major...,What percentage of 16 to 20-year-olds in the U...,81%,politics,5,The context directly answers the question. It ...,5,The question is self-contained and does not de...
1,Portishead back after eight years\n\nCult Brit...,When did Portishead win the Mercury Music Prize?,1995,entertainment,5,The context states that Portishead won a Mercu...,5,The question is self-contained and does not de...
2,". ""Animals hit but not killed would without do...",What animals are covered by Texas hunting laws...,"State laws on hunting only covered ""regulated ...",tech,5,The context explicitly mentions that Texas hun...,5,The question is perfectly self-contained. It e...
3,Howard rejects BNP's claim\n\nTory leader Mich...,What is the slogan for the Conservative Party'...,"""It's not racist to impose limits on immigration""",politics,5,The question asks for the slogan of the Conser...,5,The question is self-contained and does not de...
4,Wada will appeal against ruling\n\nThe World A...,Who is the chairman of Wada?,Dick Pound,sport,5,The context clearly states that Dick Pound is ...,5,The question is self-contained and does not de...
...,...,...,...,...,...,...,...,...
292,Alicia Keys to open US Super Bowl\n\nR&B star ...,Who will sing America the Beautiful at the Sup...,Alicia Keys,entertainment,5,The question is clearly and unambiguously answ...,5,The question is perfectly self-contained and r...
294,How true. There are extremely worrying paralle...,What quote is attributed to Thomas Jefferson r...,A nation that limits freedom in the name of se...,politics,5,The context directly provides the quote attrib...,5,The question is self-contained and does not de...
296,House prices drop as sales slow\n\nHouse price...,Which regions in the UK experienced the larges...,The Midlands and South.,business,5,"The context mentions that ""Around the UK, the ...",5,The question is self-contained and doesn't rel...
298,Monitoring firm Netcraft analysed response tim...,What is Graham Cluley's job title?,Graham Cluley is a senior technology consultan...,tech,5,The context mentions Graham Cluley and his aff...,5,The question is self-contained and does not re...


In [50]:
# Look the row up by its index LABEL, not by position. Filtering kept the
# original labels but removed rows, so the filtered frame has fewer than 299
# rows and the positional .iloc[298] raises IndexError.
critiqued_outputs_df_filtered.loc[298, "context"]

'Monitoring firm Netcraft analysed response times for some of the sites targeted by the screensaver and found that a number were completely knocked offline.\n\nThe downing of the sites could dent Lycos claims that what it is doing does not amount to a distributed denial of service attack. In such attacks thousands of computers bombard sites with data in an attempt to overwhelm them. Laws in many countries do not explicitly outlaw such attacks but many nations are re-drafting computer use laws to make them specific offences. Lycos Europe now appears to have put the plan on hold. The site hosting the screensaver currently shows a holding page, with the words, "Stay tuned". The numerical internet address of the site has also changed. This is likely to be in response to spammers who have reportedly redirected traffic from their sites back to the Lycos screensaver site. The campaign has come under fire from some corners of the web. Many discussion groups have said that it set a dangerous pr