# Synthetic Groundedness Dataset Preparation - Challenger Set of Bank Retail Customer Queries

In [58]:
# Install required packages (if needed)
#!pip install openai requests pandas

# Import required libraries
import os
from openai import OpenAI, OpenAIError, RateLimitError
import requests
import pandas as pd
import random
import time
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from .env file
# override=True: per python-dotenv, values found in .env take precedence over
# variables that are already set in the process environment.
load_dotenv(dotenv_path=".env", override=True)

# Configure API client
# NOTE(review): the key is stored as an attribute on the OpenAI *class* and then
# passed straight back to the constructor. A local variable would presumably be
# enough - confirm nothing else reads OpenAI.api_key before simplifying.
OpenAI.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OpenAI.api_key)

# Define which models to use for context generation, grounded responses, and hallucinated responses.
MODEL_GENERATE_CONTEXT = "gpt-4o-mini"  # writes the synthetic help-page excerpt for each query
MODEL_GROUNDED = "gpt-4o"  # answers each query using only its generated context
MODEL_HALLUCINATED = "gpt-3.5-turbo-0125"  # produces the deliberately ungrounded answer

# File paths for data flow
# Each stage reads the previous stage's CSV and writes its own.
SAMPLED_QUERIES_PATH = "../data/sampled_banking_77_queries.csv"  # output of the sampling step
CONTEXT_OUTPUT_PATH = "../data/sampled_banking_77_queries_with_context.csv"  # sampled queries + generated context
FINAL_OUTPUT_PATH = "../data/synthetic_groundedness_challenger_set.csv"  # queries + context + both responses

### Sampling Customer Queries

In [None]:
# Load initial customer query dataset
df = pd.read_csv("../data/banking_77_train_set.csv")
df = df.dropna(subset=["text"])    # Ensure no missing queries
df = df.rename(columns={"text": "query"})  # Rename 'text' to 'query' for consistency

# Optional smoke test: a tiny random sample for checking the endpoint and the prompts
#sampled_random = df.sample(n=5, random_state=42) # small sample for initial testing of endpoint and prompting
#sampled_random.to_csv("../data/test_customer_queries.csv", index=False)

# Sample evenly across categories
n_per_category = 5
# Sample each category explicitly instead of groupby(...).apply(...): passing the
# grouping column into apply() is deprecated in recent pandas releases, and once
# it is excluded the "category" column would be missing from the result, which
# would break both the saved CSV and the summary print below. Sampling each
# group with the same seed selects the same rows as the apply-based version.
per_category_samples = [
    group.sample(min(len(group), n_per_category), random_state=42)
    for _, group in df.groupby("category")
]
# pd.concat raises on an empty list, so fall back to an empty frame that keeps
# the column layout when the input contains no categories at all.
df_sampled = pd.concat(per_category_samples) if per_category_samples else df.iloc[0:0]

# Save sampled queries
df_sampled.to_csv(SAMPLED_QUERIES_PATH, index=False)
print(f"✅ Saved {len(df_sampled)} sampled rows across {df_sampled['category'].nunique()} categories to {SAMPLED_QUERIES_PATH}")

### Generate Synthetic FAQ Context for Each Query

In [None]:
# This section generates realistic help content using GPT for each sampled query.
SAVE_INTERVAL = 40  # write a checkpoint CSV whenever the row index is a multiple of this
MAX_RETRIES = 6  # upper bound on rate-limit retries per row (wait doubles each time: 1s, 2s, 4s, ...)
SLEEP_BETWEEN_REQUESTS = 1  # seconds

# Reload the sample from disk so this cell can be run without re-running the sampling cell.
df_sampled = pd.read_csv(SAMPLED_QUERIES_PATH)
# Prompt template for generating help content
def generate_context_prompt(query, category):
    """Build the prompt that asks the model for a synthetic bank help-page excerpt.

    Args:
        query: The customer query the excerpt should help answer.
        category: The query's category label, included to steer the topic.

    Returns:
        The full prompt text, ending with a newline.
    """
    # One entry per prompt line; the trailing spaces on the two long lines are
    # part of the prompt text and are kept deliberately.
    prompt_lines = [
        "You are writing content for the Help or FAQ section of a retail bank's website. ",
        "",
        "Write a synthetic help page excerpt (150–250 words) that would support answering the following customer query, using appropriate terminology and informative tone typical of a real bank website. Can you use similar information as what you would find on the big four banks in Australia's websites and make sure they are banking products you would find in Australia. ",
        "",
        f"Category: {category}",
        f'Query: "{query}"',
        "",  # yields the final newline
    ]
    return "\n".join(prompt_lines)

# Load existing file or initialize fresh run
df_sampled["context"] = None
if Path(CONTEXT_OUTPUT_PATH).exists():
    df_existing = pd.read_csv(CONTEXT_OUTPUT_PATH)
    # Restore only rows present in both frames, so a checkpoint written from a
    # larger sample cannot raise a KeyError on row labels that no longer exist.
    shared_rows = df_sampled.index.intersection(df_existing.index)
    df_sampled.loc[shared_rows, "context"] = df_existing.loc[shared_rows, "context"]
    # Rows that failed in an earlier run were saved with the "ERROR" placeholder;
    # clear them so they are retried instead of being treated as finished.
    df_sampled.loc[df_sampled["context"] == "ERROR", "context"] = None
    # last_valid_index() is None when the checkpoint holds no context yet.
    last_done = df_existing["context"].last_valid_index()
    print(f"Resuming from row {0 if last_done is None else last_done + 1}")

# Generate missing contexts using GPT
unsaved_changes = False  # True while there are rows newer than the last checkpoint
for i, row in df_sampled.iterrows():
    if pd.notnull(row["context"]):
        continue

    prompt = generate_context_prompt(row["query"], row["category"])
    retries = 0
    context = None

    # Only rate-limit errors are retried (with exponential backoff); any other
    # failure gives up on this row immediately.
    while retries < MAX_RETRIES:
        try:
            response = client.chat.completions.create(
                model=MODEL_GENERATE_CONTEXT,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7
            )
            context = response.choices[0].message.content.strip()
            break
        except RateLimitError:
            wait_time = 2 ** retries
            print(f"[Rate Limit] Row {i}: waiting {wait_time}s (retry {retries + 1})")
            time.sleep(wait_time)
            retries += 1
        except OpenAIError as e:
            print(f"[OpenAIError] Row {i}: {e}")
            break
        except Exception as e:
            print(f"[Unexpected Error] Row {i}: {e}")
            break

    # An empty or missing completion is recorded as "ERROR" so the failure is
    # visible in the CSV; it is retried on the next run (see the resume step).
    df_sampled.at[i, "context"] = context if context else "ERROR"
    unsaved_changes = True
    time.sleep(SLEEP_BETWEEN_REQUESTS)

    if i % SAVE_INTERVAL == 0 or i == len(df_sampled) - 1:
        df_sampled.to_csv(CONTEXT_OUTPUT_PATH, index=False)
        unsaved_changes = False
        print(f"Progress saved up to row {i}")

# The in-loop checkpoint is skipped when the last row already had a context,
# so flush anything generated since the previous save.
if unsaved_changes:
    df_sampled.to_csv(CONTEXT_OUTPUT_PATH, index=False)
    print("Progress saved after final row")

print("✅ Context generation complete.")


### Generate Grounded and Hallucinated Responses and Save the Final Challenger Set

In [59]:
# Response generation starts from the context-enriched queries written by the
# previous step. Both response columns are created empty (None) up front so the
# generation loop can fill them in row by row.
df = pd.read_csv(CONTEXT_OUTPUT_PATH).assign(
    response_grounded=None,
    response_ungrounded=None,
)

# Define prompt templates
# These functions generate the instructions for GPT to create grounded or hallucinated answers.
def prompt_grounded(query, context):
    """Build the prompt for an answer that must rely solely on the given context.

    Args:
        query: The customer query to answer.
        context: The help-page text the answer has to be based on.

    Returns:
        The full prompt text, ending with a newline.
    """
    role_instruction = "You are a customer support assistant. Using ONLY the information provided below, write a helpful and accurate answer to the customer query."
    grounding_rule = "DO NOT include any facts, assumptions, or language that are not directly supported by the context."
    # Sections are separated by one blank line; the last one carries the
    # prompt's trailing newline.
    sections = (
        role_instruction,
        grounding_rule,
        f"Context:\n{context}",
        f"Query:\n{query}\n",
    )
    return "\n\n".join(sections)

def prompt_ungrounded(query, context):
    """Build the prompt for a plausible answer that contains subtle hallucinations.

    Args:
        query: The customer query to respond to.
        context: The help-page text, supplied as material the model is told
            not to copy from.

    Returns:
        The full prompt text, ending with a newline.
    """
    # The ways the model is asked to deviate from the context, one bullet each.
    hallucination_tactics = (
        "Change numbers (e.g., fees, timeframes)",
        "Make up a service/product",
        "Assume a specific type of card or account",
        "Repeat the first part of the query in the response before shifting to unrelated details",
    )
    bullet_list = "\n".join(f"- {tactic}" for tactic in hallucination_tactics)
    return (
        "You are a customer support assistant. Write a plausible-sounding response to the customer's query, but include subtle hallucinations.\n"
        "\n"
        f"{bullet_list}\n"
        "\n"
        "Do NOT reuse the content directly from context. Make the hallucination subtle but detectable.\n"
        "\n"
        "Query:\n"
        f"{query}\n"
        "\n"
        "Context (do NOT copy from this):\n"
        f"{context}\n"
    )

In [60]:
# API Wrapper for Completion Calls
# Handles GPT API calls with retry logic in case of errors or rate limits.
def generate_response(prompt, model, temperature=0.0, retries=3):
    """Request a single chat completion, retrying on failure.

    Args:
        prompt: Text sent as the only user message.
        model: Name of the chat model to call.
        temperature: Sampling temperature passed to the API.
        retries: Maximum number of attempts; 0 means no call is made.

    Returns:
        The stripped completion text, or None if every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Deliberately broad: this is a best-effort wrapper, so any failure
            # is reported and turned into a retry rather than aborting the run.
            print(f"Retry {attempt + 1} failed: {e}")
            # Back off exponentially (2s, 4s, ...) between attempts, but do not
            # wait after the last one - there is nothing left to retry.
            if attempt + 1 < retries:
                time.sleep(2 ** (attempt + 1))
    return None

# Generate and Save Responses - generates grounded and hallucinated answers for each query and saves output incrementally.
for i, row in df.iterrows():
    # Skip rows that already have both answers.
    if pd.notnull(row["response_grounded"]) and pd.notnull(row["response_ungrounded"]):
        continue

    query = row["query"]
    context = row["context"]

    # The context-generation step stores the literal "ERROR" when it could not
    # produce a context. Answering against that placeholder (or a missing value)
    # would yield a "grounded" response with nothing to be grounded in, so leave
    # both response columns empty for such rows.
    if pd.isnull(context) or context == "ERROR":
        print(f"[Skipped] Row {i}: no usable context")
        continue

    # temperature=0.0 for the grounded answer, 0.8 for the hallucinated one;
    # the one-second pauses space out consecutive API calls.
    grounded = generate_response(prompt_grounded(query, context), MODEL_GROUNDED, temperature=0.0)
    time.sleep(1)
    ungrounded = generate_response(prompt_ungrounded(query, context), MODEL_HALLUCINATED, temperature=0.8)
    time.sleep(1)

    # Either value may be None when all attempts for that call failed.
    df.at[i, "response_grounded"] = grounded
    df.at[i, "response_ungrounded"] = ungrounded

    # Checkpoint every 20th row index so an interrupted run keeps most of its work.
    if i % 20 == 0:
        df.to_csv(FINAL_OUTPUT_PATH, index=False)
        print(f"✅ Progress saved at index {i}")

# Final save
df.to_csv(FINAL_OUTPUT_PATH, index=False)
print(f"✅ Final data saved to {FINAL_OUTPUT_PATH}")

✅ Progress saved at index 0
✅ Progress saved at index 20
✅ Progress saved at index 40
✅ Progress saved at index 60
✅ Progress saved at index 80
✅ Progress saved at index 100
✅ Progress saved at index 120
✅ Progress saved at index 140
✅ Progress saved at index 160
✅ Progress saved at index 180
✅ Progress saved at index 200
✅ Progress saved at index 220
✅ Progress saved at index 240
✅ Progress saved at index 260
✅ Progress saved at index 280
✅ Progress saved at index 300
✅ Progress saved at index 320
✅ Progress saved at index 340
✅ Progress saved at index 360
✅ Progress saved at index 380
✅ Final data saved to ../data/synthetic_groundedness_challenger_set.csv
