In [1]:
import pandas as pd

In [2]:
from openai import OpenAI

# Shared API client used by llm() and generate_questions() further down.
# NOTE(review): no credentials are passed here, so the client presumably picks
# them up from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [5]:
# Location of the source CSV, relative to the notebook's working directory.
DATA_PATH = '../data/data.csv'

# Read the complete dataset; the sampling step works from this frame.
df = pd.read_csv(DATA_PATH)

In [6]:
# Size of the working subset and the seed that makes the draw repeatable.
SAMPLE_SIZE = 1000
RANDOM_SEED = 42

sampled_df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

In [7]:
# Drop the row labels left over from sampling in favour of a fresh 0..n-1
# index, then copy that index into an `id` column.
sampled_df = (
    sampled_df
    .reset_index(drop=True)
    .assign(id=lambda frame: frame.index)
)

In [8]:
# Move `id` to the front; every other column keeps its relative order.
other_columns = [name for name in sampled_df.columns if name != 'id']
sampled_df = sampled_df[['id', *other_columns]]

In [9]:
# Make the ids 1-based: overwrite `id` with the positional index plus one.
sampled_df = sampled_df.assign(id=sampled_df.index + 1)

In [10]:
# Preview the first rows to check the `id` column and the column order.
sampled_df.head()

Unnamed: 0,id,Description,Patient,Doctor
0,1,Noticed maroon dot underneath skin which oozed...,first I saw a maroonish dot underneath my skin...,"Hi, Thanks for using HCM.Maroonish dot then la..."
1,2,How can anxiety and depression be treated?,i feel lack of emotion and im always anxious i...,Hello and Welcome to ‘Ask A Doctor’ service. I...
2,3,Suggest treatment for upper back pain,"had a pain in upper back since last monday, fe...","Hello, I have studied your case. Due to compre..."
3,4,What could cause sore & enlarged cubital nodes?,I went to the dr. last night for paranychia. T...,"Hello, Your symptoms seem to be related to swo..."
4,5,Q. I am experiencing erectile dysfunction. Wha...,"Hi doctor, I am a 47 year old male. Recently, ...",Hello. Thanks. Take care. For more information...


In [11]:
# One plain dict per row (column name -> value); these dicts are what gets
# unpacked into the prompt template.
documents = sampled_df.to_dict('records')

In [28]:
# Prompt sent to the model for each record.
#   * {Description}, {Patient} and {Doctor} are filled in with str.format from
#     a record dict.
#   * The doubled braces around the JSON example are str.format escapes and
#     render as single literal braces.
#   * .strip() removes the newlines that open and close the triple-quoted text.
# NOTE(review): "Formulate 5 short conversation" and "Use as fewer words as
# possible" read as ungrammatical; rewording them would change what the model
# receives, so regenerate the questions if the text is edited.
prompt_template = """
You emulate a user of our health care assistant application.
Formulate 5 short conversation that the patient might ask based on a provided description.
Make the patient's words specific to the description.
The record should contain the reply from doctor to the patient's questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

Description: {Description}
Patient: {Patient}
Doctor: {Doctor}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [24]:
# Render the template for a single record (position 2) as a smoke test.
prompt = prompt_template.format_map(documents[2])

In [25]:
# Show the rendered prompt exactly as it will be sent to the model.
print(prompt)

You emulate a user of our health care assistant application.
Formulate 5 short conversation that the patient might ask based on a provided description.
Make the patient's words specific to the description.
The record should contain the reply from doctor to the patient's questions, and the questions should
be complete and not too short. 

The record:

Description: Suggest treatment for upper back pain
Patient: had a pain in upper back since last monday, feeling like trapped nerve, eases off for while, putting hot water bottle on when it more painfull, then last night app, 3am, the pain was in my chest, had to get up take paracetamol and re heat water bottle, propped up in bed, then eased off, wasn`t to bad when got up, as day gone on pain increased again in chest, was going to contact my gp, but it was too late, no appoitments. just to put my mind at rest,, as last night was scary, it felt like a heart attack, but i dont know what heart attack feels like, i will contact my gp in morning

In [15]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single user message to the chat completions API.

    Args:
        prompt: Text sent as the only message, with role ``"user"``.
        model: Name of the chat model to query. Defaults to ``'gpt-4o-mini'``,
            the model this function always used before the parameter existed.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [26]:
# Trial run on the single rendered prompt; llm() returns the reply's message
# content, which the prompt asks the model to format as JSON.
questions = llm(prompt)

In [27]:
# Inspect the raw reply before relying on it being parsable.
print(questions)

{"questions": ["I have been experiencing upper back pain for a week now, which sometimes feels like a trapped nerve. What can I do to alleviate this issue?", "Last night at 3 AM, I felt chest pain that scared me. Is this something related to my upper back pain or should I be more concerned?", "What type of tests do you recommend for my upper back and chest pain, and how soon should I get them done?", "You mentioned medication for pain relief; could you please specify which analgesic and neurotropic medications would be suitable for my symptoms?", "I would like to know more about the physiotherapy options you mentioned. What types of treatments should I look into to help with my condition?"]}


In [29]:
def generate_questions(doc):
    """Ask the model for patient-style questions about one record.

    Args:
        doc: Mapping providing the ``Description``, ``Patient`` and ``Doctor``
            keys that ``prompt_template`` interpolates. Extra keys (such as
            ``id``) are accepted and ignored by ``str.format``.

    Returns:
        The message content of the first choice in the response. JSON mode is
        requested, so the reply is expected to be JSON text; the caller still
        has to parse it and check for the ``questions`` key.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the caller feeds the reply straight into json.loads, so
        # ask the API for a JSON object rather than relying on the prompt
        # wording alone. The prompt mentions "JSON", which this mode requires.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [30]:
from tqdm.auto import tqdm
import json

In [31]:
# Record id -> list of generated questions. Lives in its own cell, so
# re-running the generation loop does not wipe what was already collected.
results = dict()

In [32]:
# Generate questions for every sampled record.
# Ids already present in `results` are skipped, so re-running this cell resumes
# an interrupted run and retries only the records that are still missing.
failed_ids = []

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        # A reply that is not JSON, or lacks the "questions" key, should not
        # abort the remaining records; note the id and move on.
        failed_ids.append(doc_id)
        print(f'Skipping doc {doc_id}: unusable model reply ({err!r})')

if failed_ids:
    print(f'{len(failed_ids)} record(s) produced no questions; '
          f're-run this cell to retry them: {failed_ids}')

  0%|          | 0/1000 [00:00<?, ?it/s]

In [34]:
# Flatten {id: [question, ...]} into one (id, question) pair per question,
# keeping the order in which ids and questions were stored.
final_results = [
    (record_id, question)
    for record_id, record_questions in results.items()
    for question in record_questions
]

In [36]:
# Peek at the first few (id, question) pairs.
final_results[:3]

[(1,
  'What causes a maroon dot like the one I have under my skin, and could it be something serious?'),
 (1,
  'Is there a way to treat the maroon dot at home, or should I wait for my dermatologist appointment?'),
 (1,
  'Could any medications I am taking be affecting the bleeding from the maroon dot?')]

In [37]:
# Tabulate the (id, question) pairs, one row per generated question.
result_columns = ['id', 'question']
df_results = pd.DataFrame(data=final_results, columns=result_columns)

In [38]:
# Write the ground-truth table to disk without the DataFrame index column.
GROUND_TRUTH_PATH = '../data/ground-truth-retrieval.csv'
df_results.to_csv(GROUND_TRUTH_PATH, index=False)