# Groundedness Dataset Preparation Using the HaluEval Benchmark Dataset

This notebook generates a groundedness dataset from QA-style JSONL input files obtained from:
 
- https://github.com/RUCAIBox/HaluEval/blob/main/README.md
- https://github.com/RUCAIBox/HaluEval/blob/main/data/qa_data.json

In [3]:
# Import Required Libraries
#!pip install openai requests pandas
import pandas as pd
import json
import argparse
import os
from pathlib import Path

In [10]:
# Configuration: file locations and sampling parameters used by this notebook.

# Input QA data file (path relative to the notebook's working directory).
JSONL_PATH = "../data/qa_data.json"
# Destination CSV for the generated groundedness dataset.
OUTPUT_PATH = "../data/halueval_groundedness.csv"

# Number of queries to sample.
SAMPLE_SIZE = 500
# Random seed for reproducibility.
SEED = 42


### Sampling QA Examples and Building the Groundedness Dataset

In [11]:
# Main Function Definition for Data Processing
def generate_groundedness_dataset(jsonl_path, output_path, sample_size, seed=42):
    """
    Build a groundedness dataset from a QA-style JSONL file and save it as CSV.

    The input is read one JSON object per line (blank lines are skipped),
    ``sample_size`` records are sampled without replacement, and each sampled
    record is expanded into two consecutive rows:
        - Grounded answer (from "right_answer"): label = "PASS"
        - Hallucinated answer (from "hallucinated_answer"): label = "FAIL"

    Args:
        jsonl_path: Path to the input file. Each non-blank line must be a JSON
            object providing the keys "question", "knowledge", "right_answer"
            and "hallucinated_answer".
        output_path: Path of the CSV file to write. Its parent directory is
            created if it does not exist; a bare filename is written to the
            current working directory.
        sample_size: Number of input records to sample. The output contains
            ``2 * sample_size`` rows.
        seed: Random state passed to ``DataFrame.sample`` so that the same
            records are drawn on every run.

    Returns:
        pandas.DataFrame: The generated dataset with the columns "query",
        "context", "response" and "label" (the same content that is written
        to ``output_path``).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an expected column is missing when rows are expanded.
        ValueError: Raised by ``DataFrame.sample`` if ``sample_size`` exceeds
            the number of loaded records.
    """
    # Load JSONL data. Blank lines (e.g. a trailing empty line) are skipped
    # because json.loads("") would raise.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Convert to DataFrame and sample
    df = pd.DataFrame(data)
    sampled = df.sample(n=sample_size, random_state=seed).reset_index(drop=True)

    # Each sampled record yields a PASS row followed by a FAIL row.
    answer_variants = (("right_answer", "PASS"), ("hallucinated_answer", "FAIL"))
    rows = [
        {
            "query": record["question"],
            "context": record["knowledge"],
            "response": record[answer_column],
            "label": label,
        }
        for record in sampled.to_dict("records")
        for answer_column, label in answer_variants
    ]
    # Pin the column order so the CSV header is stable even when rows is empty.
    result = pd.DataFrame(rows, columns=["query", "context", "response", "label"])

    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create the directory when the path actually names one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    result.to_csv(output_path, index=False)
    print(f"✅ Saved {len(rows)} rows to '{output_path}'")

    # Return the dataset so callers can inspect it without re-reading the CSV.
    return result

In [12]:
# Run the dataset generation with the configured paths, sample size, and seed.
# Whatever the function returns is kept in result_df.
result_df = generate_groundedness_dataset(
    JSONL_PATH,
    OUTPUT_PATH,
    SAMPLE_SIZE,
    seed=SEED,
)


✅ Saved 1000 rows to '../data/halueval_groundedness.csv'


In [None]:
#