In [None]:
import json
from tqdm import tqdm
import os
import pandas as pd
from openai import OpenAI

In [None]:
# =========================
# CONFIG
# =========================
# LLM settings: the model name can be overridden through the OPENAI_MODEL
# environment variable and falls back to "gpt-4o-mini" when it is unset.
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
LLM_TEMPERATURE = 0

# No arguments are passed, so the client relies on its own defaults
# (presumably credentials come from the environment -- TODO confirm in SDK docs).
client = OpenAI()

In [None]:
# ---------- LOAD PROCESSED DATA ----------
# Read the processed CSV and convert every row into a plain dict
# (column name -> value); later cells read the id/question/answer keys.
df = pd.read_csv("../data/data.csv")
documents = df.to_dict("records")

In [None]:
# Prompt sent to the model for each record. The {question} and {answer}
# placeholders are filled with str.format in generate_questions; .strip()
# removes the leading/trailing newlines added by the triple-quoted literal.
prompt_template = """
You act as customer who want to ask our support assistant application.
Formulate 5 questions this customer might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
def generate_questions(doc):
    """Ask the chat model for customer-style questions about one record.

    Args:
        doc: Mapping describing a single record. It must provide the
            ``question`` and ``answer`` keys that ``prompt_template``
            interpolates; any additional keys are ignored by ``str.format``.

    Returns:
        The text content of the first choice in the model response. The prompt
        requests a JSON array, but the text is returned unparsed.
    """
    prompt = prompt_template.format(**doc)

    # Use the notebook-level configuration rather than a hard-coded model name,
    # so the OPENAI_MODEL override and LLM_TEMPERATURE actually take effect.
    response = client.chat.completions.create(
        model=OPENAI_MODEL,
        temperature=LLM_TEMPERATURE,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [None]:
# Cache of raw model responses keyed by document id. The generation loop skips
# ids already present here; re-running this cell empties the cache.
results = {}

In [None]:
# Request questions only for documents that have no stored response yet, so
# the cell can be re-run after an interruption without repeating finished calls.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

In [None]:
# Decode each raw model response (a JSON string) into Python objects,
# keeping the same document-id keys as `results`.
parsed_resulst = {}

for doc_id, json_questions in results.items():
    try:
        parsed_resulst[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as exc:
        # Name the offending document so a single malformed response can be
        # inspected or regenerated instead of failing anonymously.
        raise ValueError(
            f"Response for document {doc_id!r} is not valid JSON: {json_questions!r}"
        ) from exc

In [None]:
# Unwrap responses shaped like {"questions": [...]} so that the stored value
# is the inner list. Only existing keys are reassigned, so the dict size does
# not change while it is being iterated.
for doc_id, questions in parsed_resulst.items():
    # Only a dict can carry a 'questions' key. Without the type check, a list
    # that happens to contain the string 'questions' would pass the membership
    # test and then fail on the string index below.
    if isinstance(questions, dict) and 'questions' in questions:
        parsed_resulst[doc_id] = questions['questions']

In [None]:
# Lookup table from document id to its full record dict.
doc_index = {record['id']: record for record in documents}

In [None]:
# Flatten to one (id, question, expected_answer) tuple per generated question;
# the expected answer is looked up once per document from doc_index.
final_results = []
for doc_id, questions in parsed_resulst.items():
    expected_answer = doc_index[doc_id]['answer']
    final_results.extend([(doc_id, q, expected_answer) for q in questions])



In [None]:
# Tabulate the flattened tuples; note that this rebinds `df`, replacing the
# frame loaded from the processed data earlier in the notebook.
df = pd.DataFrame(
    data=final_results,
    columns=['id', 'question', 'expected_answer'],
)

In [None]:
# Write the ground-truth table to disk without the DataFrame index column.
df.to_csv(
    path_or_buf='../data/ground-truth-data.csv',
    index=False,
)