In [1]:
import pandas as pd

In [2]:
from openai import OpenAI
import os

In [3]:
# Shared API client used by every request in this notebook. It is built with
# no explicit arguments, so credentials/config presumably come from the
# environment (e.g. OPENAI_API_KEY) — TODO confirm for your setup.
client = OpenAI()

In [4]:
# df = pd.read_csv('../data/medquad_original.csv')
# df2 = pd.DataFrame({'id':df.index,
#                   'question':df['question'],
#                   'answer':df['answer'],
#                   'source':df['source'],
#                   'focus_area':df['focus_area'],
#                  })
# df2.to_csv('../data/medquad.csv',index=False)
# pd.read_csv('../data/medquad.csv')

In [5]:
# Read the prepared MedQuAD CSV and keep only the first 1028 rows for this run.
df = pd.read_csv('../data/medquad.csv').iloc[:1028]
df

Unnamed: 0,id,question,answer,source,focus_area
0,0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
...,...,...,...,...,...
1023,1023,What are the stages of Childhood Brain Stem Gl...,Key Points\n - The plan for...,CancerGov,Childhood Brain Stem Glioma
1024,1024,what research (or clinical trials) is being do...,New types of treatment are being tested in cli...,CancerGov,Childhood Brain Stem Glioma
1025,1025,What are the treatments for Childhood Brain St...,Key Points\n - There are di...,CancerGov,Childhood Brain Stem Glioma
1026,1026,What is (are) Colorectal Cancer ?,Key Points\n - Colorectal c...,CancerGov,Colorectal Cancer


In [6]:
# One plain dict per row (column name -> value), ready for prompt formatting.
documents = df.to_dict('records')

In [7]:
# Prompt sent to the model for each record. The {question}, {answer}, {source}
# and {focus_area} placeholders are filled with str.format from a record dict;
# the doubled braces around the JSON example come out as single literal braces
# after formatting. .strip() drops the leading/trailing newline that the
# triple-quoted layout would otherwise add.
prompt_template = """
You emulate a user of our medical question answering application.
Formulate 5 questions this user might ask based on a provided disease.
Make the questions specific to this disease.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record. 

The record:

question: {question}
answer: {answer}
source: {source}
focus_area: {focus_area}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [8]:
# Render the template for the first record only, as a trial before the batch.
prompt = prompt_template.format_map(documents[0])

In [9]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        The `content` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [10]:
# Try the prompt once, on the single formatted record, before the batch run.
questions = llm(prompt)

In [11]:
import json

In [12]:
json.loads(questions)

{'questions': ['What are the main types of glaucoma and how do they differ?',
  'What causes the fluid buildup in the eye associated with glaucoma?',
  'Why is early diagnosis of glaucoma crucial for preserving vision?',
  'What treatments are available for managing glaucoma and preventing vision loss?',
  'Who is at a higher risk for developing glaucoma and why?']}

In [13]:
def generate_questions(doc):
    """Ask the model for questions about one record and return its raw reply.

    Args:
        doc: Mapping that supplies every placeholder used by
            `prompt_template` (question, answer, source, focus_area).
            Extra keys are ignored by the formatting step.

    Returns:
        The model's reply as an unparsed string; decoding it as JSON is left
        to the caller.
    """
    prompt = prompt_template.format(**doc)
    # Delegate to the shared helper instead of repeating the same request
    # code, so the model and message shape are defined in one place.
    return llm(prompt)

In [14]:
from tqdm.auto import tqdm

In [17]:
# Maps a record's id to the parsed "questions" value returned for it.
# Re-running this cell discards everything collected so far.
results = {}

In [18]:
# Generate questions for every record that is not yet in `results`, so this
# cell can simply be re-run to resume after an interruption.
failed_ids = []  # ids whose reply could not be turned into a question list

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        results[doc_id] = json.loads(questions_raw)['questions']
    except (json.JSONDecodeError, KeyError, TypeError):
        # Model output is not guaranteed to be valid JSON of the requested
        # shape. One bad reply should not abort the whole batch: the id stays
        # out of `results`, so re-running this cell retries just that record.
        failed_ids.append(doc_id)

if failed_ids:
    print(f'{len(failed_ids)} record(s) returned unparsable output: {failed_ids}')

  0%|          | 0/1028 [00:00<?, ?it/s]

In [19]:
# Flatten {id: [question, ...]} into one (id, question) pair per question.
final_results = [
    (record_id, question)
    for record_id, record_questions in results.items()
    for question in record_questions
]

In [20]:
final_results[0]

(0, 'What are the main causes of glaucoma and how does it develop over time?')

In [21]:
# Two-column frame: the source record id and one generated question per row.
df_results = pd.DataFrame(data=final_results, columns=['id', 'question'])

In [22]:
# Persist the (id, question) pairs; index=False keeps the DataFrame's row
# index out of the file so it contains only the two data columns.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [23]:
!head ../data/ground-truth-retrieval.csv

id,question
0,What are the main causes of glaucoma and how does it develop over time?
0,Can you explain the differences between open-angle glaucoma and other types of glaucoma?
0,What are the symptoms that might indicate someone has glaucoma?
0,How does increased pressure in the eye lead to damage of the optic nerve in glaucoma?
0,What treatment options are available to help manage glaucoma and preserve vision?
1,What demographic groups are most at risk for developing glaucoma?
1,How does eye pressure influence the likelihood of developing glaucoma?
1,What specific age groups should be concerned about glaucoma?
1,How can a dilated eye exam help in determining glaucoma risk?
