In [None]:
from datasets import load_dataset,Dataset,DatasetDict
import pandas as pd
import random
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from huggingface_hub import login
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment;
# GOOGLE_API_KEY is read from os.environ when the chat model is created.
load_dotenv()

In [None]:
# Authenticate this session with the Hugging Face Hub; the final step of the
# notebook pushes the dataset back to the Hub.
login()

In [None]:
# Load the RBI circular question/answer dataset; its 'train' split is the
# source for everything that follows.
dataset = load_dataset("Vishva007/RBI-Circular-QA-Dataset")

In [None]:
# Convert the 'train' split to a pandas DataFrame so rows can be sampled and
# new columns added.
df = dataset['train'].to_pandas()

In [None]:
# Restrict the sampling pool to the first 5000 rows of the train split.
initial_eval_df = df.iloc[:5000]

In [None]:
# Draw a reproducible 100-row sample (fixed seed) and renumber the rows from 0.
eval_split_df = (
    initial_eval_df
    .sample(n=100, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# Sanity check of the sampled frame: dimensions first, then the top rows.
eval_split_shape = eval_split_df.shape
print(f"Shape of the initial evaluation split: {eval_split_shape}")
print(eval_split_df.head(n=5))

In [None]:
class RephrasedContent(BaseModel):
    # Structured-output schema for one rephrased question/answer pair.
    # Documented with comments rather than a docstring so that the schema
    # generated from this model stays exactly as it was.

    rephrased_question: str = Field(
        description="The rephrased version of the original financial question.",
    )
    rephrased_answer: str = Field(
        description="The rephrased version of the original financial answer, maintaining factual accuracy.",
    )


In [None]:
# Gemini chat model used for the rephrasing calls. The API key is taken from
# the environment; None is passed through when GOOGLE_API_KEY is not set.
model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
)

In [None]:
# Chat prompt for the rephrasing task: a system message with the instructions
# and a user message that carries the original pair through the {question}
# and {answer} placeholders.
# NOTE: the system text is a multi-line literal, so its line breaks and
# indentation are part of the prompt that is sent.
combined_rephrase_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a helpful assistant specialized in rephrasing RBI financial questions and answers. 
                Rephrase the provided question to make it slightly different but retain its original meaning. 
                Also, rephrase the provided answer to make it slightly different while retaining its original meaning and factual accuracy."""),
    ("user", "Question: {question}\nAnswer: {answer}")
])

In [None]:
# Bind the model to the RephrasedContent schema, then feed it from the prompt.
# The resulting chain is invoked with a {"question", "answer"} mapping.
structured_rephrase_model = model.with_structured_output(RephrasedContent)
combined_rephrase_chain = combined_rephrase_prompt | structured_rephrase_model

In [None]:
# Create both output columns up front, filled with empty strings, so they can
# be populated one row at a time.
for new_column in ('rephrased_question', 'rephrased_answer'):
    eval_split_df[new_column] = ""

In [None]:
# Rephrase every sampled question/answer pair with the LLM chain.
#
# A row whose call fails keeps empty strings in both output columns (the same
# placeholder the columns are initialised with) instead of the exception text:
# these columns are later published as the 'eval' split, so error messages must
# not end up stored as if they were rephrased questions or answers.
failed_indices = []
total_samples = len(eval_split_df)

for position, (index, row) in enumerate(eval_split_df.iterrows(), start=1):
    try:
        # Invoke the combined chain on the original pair.
        rephrased_output = combined_rephrase_chain.invoke({
            "question": row['question'],
            "answer": row['answer'],
        })
        # Read both fields before writing anything so that a malformed
        # response cannot leave the row half-filled.
        new_question = rephrased_output.rephrased_question
        new_answer = rephrased_output.rephrased_answer
    except Exception as e:
        # Best effort: report the failure, remember the row, keep going.
        print(f"Error processing sample {position}/{total_samples} (index {index}): {e}")
        failed_indices.append(index)
        new_question = ""
        new_answer = ""
    else:
        print(f"Processed sample {position}/{total_samples}")

    eval_split_df.loc[index, 'rephrased_question'] = new_question
    eval_split_df.loc[index, 'rephrased_answer'] = new_answer

# Make failures impossible to miss before the data is used any further.
if failed_indices:
    print(
        f"{len(failed_indices)} of {total_samples} samples could not be rephrased "
        f"and were left empty (row indices: {failed_indices})."
    )

In [None]:
# Announce completion and preview the result. Only the top rows are printed
# (DataFrame.head), not the whole frame.
print("\nRephrasing complete. Displaying the first few rows of the rephrased evaluation split with all columns:")
rephrased_preview = eval_split_df.head()
print(rephrased_preview)

In [None]:
# --- Step 4: Prepare the new 'eval' split and update the DatasetDict ---
print("\nStep 4: Preparing new 'eval' split and updating DatasetDict...")

# Convert the rephrased DataFrame to a Hugging Face Dataset.
# The frame's index was reset when the sample was drawn, so it carries a plain
# 0..n-1 index at this point.
new_eval_hf_dataset = Dataset.from_pandas(eval_split_df)


In [None]:
# Handle on the original 'train' split; the name is re-bound later if
# placeholder columns have to be added to it.
original_train_hf_dataset = dataset['train']

In [None]:
# Work out which of the two rephrase columns the train split does not have yet.
new_columns_to_add = ['rephrased_question', 'rephrased_answer']
train_features = original_train_hf_dataset.features.keys()
columns_to_actually_add = list(
    filter(lambda name: name not in train_features, new_columns_to_add)
)


In [None]:
# Give the train split the same columns as the eval split by adding any
# missing rephrase column, filled with empty strings.
if not columns_to_actually_add:
    print("Train split already has 'rephrased_question' and 'rephrased_answer' columns.")
else:
    print(f"Adding missing columns {columns_to_actually_add} to the 'train' split for feature alignment...")

    def _blank_columns(examples):
        """Return one list of empty strings per missing column, sized to the batch."""
        batch_size = len(examples["question"])
        return {col: [""] * batch_size for col in columns_to_actually_add}

    # batched=True hands the function whole batches instead of single rows.
    original_train_hf_dataset = original_train_hf_dataset.map(
        _blank_columns,
        batched=True,
    )


In [None]:
# Print the schema of each split, heading first and features on the next line,
# so the two can be compared by eye.
print("Features of 'train' split after alignment:", original_train_hf_dataset.features, sep="\n")
print("\nFeatures of 'eval' split:", new_eval_hf_dataset.features, sep="\n")

In [None]:
# Assemble the two-split DatasetDict: the aligned train split plus the new
# rephrased eval split.
updated_dataset_dict = DatasetDict()
updated_dataset_dict['train'] = original_train_hf_dataset
updated_dataset_dict['eval'] = new_eval_hf_dataset

In [None]:
# Summarise the assembled DatasetDict: split names, then the row count of each.
print(f"Updated DatasetDict splits: {updated_dataset_dict.keys()}")
for split_name in ('train', 'eval'):
    print(f"Number of samples in '{split_name}' split: {len(updated_dataset_dict[split_name])}")


In [None]:
# Display the first record of the 'eval' split as a spot check; a sample of
# this cell's output is kept in the comment block that follows.
updated_dataset_dict['eval'][0]

# {'document': 'RBI_2023-2024_65CO.DGBA.GBD.No.S646_42-01-029_2023-2024_2023-10-03',
#  'filename': 'RBI_2023-2024_65CO.DGBA.GBD.No.S646_42-01-029_2023-2024_2023-10-03_text_part1.txt',
#  'model_name': 'models/gemini-2.0-flash',
#  'regulation_area': 'Government Transactions',
#  'applicable_to': 'All Agency Banks',
#  'issued_on': '2023-10-03',
#  'key_topics': ['Government transactions through e-Kuber',
#   'Processing of transactions on March 31, 2024',
#   'Accounting of transactions for FY 2023-24'],
#  'chunks_text': "![](_page_0_Picture_0.jpeg)\n\n![](_page_0_Picture_1.jpeg)\n\n![](_page_0_Picture_3.jpeg)\n\nRBI/2023-24/65 CO.DGBA.GBD.No.S646/42-01-029/2023-2024 October 03, 2023\n\nAll Agency Banks\n\nMadam/Dear Sir,\n\n## **Status of March 31, 2024 for Government transactions through integration with e-Kuber**\n\nThe 'e-Kuber' which is the Core Banking Solution platform of RBI for Government and other payments does not process any Government transactions on Global holidays (which are 26th January, 15th August, 2nd October, all 2nd and 4th Saturdays of a month and on all Sundays). It is observed that March 31, 2024 falls on a Sunday. The office of Controller General of Accounts, Government of India has advised that in order to account for all the Government transactions relating to receipts and payments in the financial year 2023-24 itself, it has been decided that March 31, 2024 (Sunday) be marked as a working day for the Government transactions so that all the Government transactions through integration with e-Kuber are processed on March 31, 2024 and accounted for in the financial year 2023-24 itself for arriving the cash balance of Government of India as on March 31, 2024.\n\n2. Also, the luggage files from banks for transferring the data related to Government transactions to RBI would also be accepted by e-Kuber system on March 31, 2024 for accounting of the same in the account for the financial year 2023-24.\n\nYours faithfully\n\n(Indranil Chakraborty) Chief General Manager\n\n![](_page_0_Picture_15.jpeg)",
#  'is_table': False,
#  'question': 'Why was March 31, 2024 designated as a working day for government transactions in India?',
#  'answer': "The Controller General of Accounts, Government of India, advised that March 31, 2024, be marked as a working day to ensure all government receipts and payments are accounted for in the financial year 2023-24. This allows for the accurate calculation of the Government of India's cash balance as of March 31, 2024.",
#  'evaluation_criteria': 'Answer should accurately state the reason for designating March 31, 2024, as a working day, including the purpose of accounting for government transactions and calculating the cash balance.',
#  'category': 'fact-based',
#  'estimated_difficulty': 4,
#  'rephrased_question': 'What prompted the designation of March 31, 2024, as a working day for government financial activities in India?',
#  'rephrased_answer': "To guarantee that all governmental income and outgoings were incorporated into the 2023-24 fiscal year, the Controller General of Accounts of the Indian government suggested designating March 31, 2024, as a working day. This ensures an exact computation of the Indian government's cash reserves as they stood on March 31, 2024."}

In [None]:
# --- Step 5: Push the updated DatasetDict to Hugging Face Hub ---
print("\nStep 5: Pushing the updated dataset to Hugging Face Hub...")
# NOTE(review): this is the same repo ID the dataset was loaded from, so the
# push targets the source dataset repository itself.
repo_id = "Vishva007/RBI-Circular-QA-Dataset" # Your dataset repo ID on Hugging Face Hub


In [None]:
try:
    # Upload every split of the DatasetDict ('train' and 'eval') to the Hub repo.
    # private=False asks for a public repository; pass private=True to keep it private.
    updated_dataset_dict.push_to_hub(repo_id, private=False)

    print(f"\nDataset '{repo_id}' successfully updated on Hugging Face Hub with 'eval' split!")
    print(f"You can view it here: https://huggingface.co/datasets/{repo_id}")

except Exception as push_error:
    # Best effort: report the failure with troubleshooting hints rather than
    # raising, so the notebook run itself does not abort here.
    print(f"\nError pushing to Hugging Face Hub: {push_error}")
    print("Please ensure you are logged in to Hugging Face Hub (`huggingface-cli login` or `notebook_login()`)")
    print(f"Also, verify you have write permissions for '{repo_id}' or choose a new `repo_id` under your namespace.")
