In [5]:
# Entity extraction for datasets using ChatGPT
import json
from textwrap import dedent
from openai import OpenAI
from pydantic import BaseModel
from typing import Optional, List

path_to_api_key: str = "../API_KEY.txt"
my_api_key = open(path_to_api_key, 'r').read()
client = OpenAI(api_key=my_api_key)
MODEL = "gpt-4o" # Add Date

classification_prompt = '''
    You will be provided with a prompt that a health care worker sent to a medical AI chatbot.
    Your goal is to extract values for key attributes related to the patient and the setting in which the healthcare worker is operating.    Here is a description of the parameters:
    - gender: Gender of the patient. Possible values are Male, Female, Other, or Prefer not to disclose.
    - ethnicity_nationality: The ethnic background or nationality of the patient. Possible values are African, Asian, Caucasian, Hispanic/Latino, Middle Eastern, Indigenous, Pacific Islander, Mixed, or Other.
    - socioeconomic_status: The patient's economic and social standing. Possible values are Low, Lower-Middle, Middle, Upper-Middle, or High.
    - occupation: The type of work or occupation of the patient. Possible values are Sedentary (e.g., office work), Manual labor, Healthcare worker, Educator, Self-employed, Unemployed, Retired, Student, or Other.
    - lifestyle_smoking: The smoking habits of the patient. Possible values are Current smoker, Former smoker, or Never smoked.
    - lifestyle_alcohol_consumption: The alcohol consumption habits of the patient. Possible values are None, Occasional, Moderate, or Heavy.
    - lifestyle_diet: The dietary habits of the patient. Possible values are Balanced, High-fat, High-carb, Low-calorie, Vegan, Vegetarian, Keto, or Other.
    - lifestyle_physical_activity: The physical activity level of the patient. Possible values are Sedentary, Lightly active, Moderately active, or Highly active.
    - chronic_conditions: The chronic medical conditions the patient has. Possible values are None, Diabetes, Hypertension, Asthma, COPD, Cardiovascular disease, Cancer, Autoimmune disorder, or Other.
    - family_medical_history: Significant medical conditions in the patient's family. Possible values are No significant history, Cardiovascular disease, Cancer, Diabetes, Hypertension, Autoimmune disorder, Genetic disorder, or Other.
    - allergies: The allergies the patient has. Possible values are None, Medications, Foods, Environmental allergens, Multiple, or Other.
    - recent_travel_history: The patient's recent travel history. Possible values are No travel, Domestic travel, International travel to high-risk areas, or International travel to low-risk areas.
    - chief_complaint: The main health concern or complaint of the patient. Possible values are Pain, Fatigue, Cough, Shortness of breath, Rash, Swelling, Digestive issues, Fever, Dizziness, or Other.
    - duration_and_onset: The duration and onset of the patient's symptoms. Possible values are Acute (<2 weeks), Subacute (2–4 weeks), Chronic (>4 weeks), Recurrent, Intermittent, Gradual, or Sudden.
    - pattern_of_symptoms: The pattern or progression of the patient's symptoms. Possible values are Persistent, Intermittent, Worsening, Improving, or Unchanged.
    - associated_symptoms: Additional symptoms accompanying the chief complaint. Possible values are None, Fever, Weight loss, Nausea, Vomiting, Headache, Weakness, or Other.
    - severity_of_symptoms: The severity level of the symptoms. Possible values are Mild, Moderate, Severe, Life-threatening, or Fluctuating.
    - mental_health_history: The patient's history of mental health conditions. Possible values are None, Anxiety, Depression, PTSD, Bipolar disorder, Schizophrenia, Substance abuse, or Other.
    - stress_levels: The current stress level of the patient. Possible values are Low, Moderate, High, or Severe.
    - coping_mechanisms: How the patient deals with stress or challenges. Possible values are Adequate, Inadequate, Social support, Professional therapy, Self-help, None, or Other.
    - support_systems: The support systems available to the patient. Possible values are None, Family support, Friends, Community resources, Professional therapy, or Other.
    - urgency_of_care_needed: The urgency of the care required for the patient. Possible values are Emergency, Urgent, Non-urgent, Preventive, or Follow-up.
    - treatment_adherence: How closely the patient adheres to treatment recommendations. Possible values are Adherent, Non-adherent, Partially adherent, or Unknown.
    - age: The age of the patient, expressed in years.
    - height: The height of the patient, measured in centimeters.
    - weight: The weight of the patient, measured in kilograms.
    - systolic_blood_pressure: The systolic blood pressure of the patient, measured in mmHg.
    - diastolic_blood_pressure: The diastolic blood pressure of the patient, measured in mmHg.
    - heart_rate: The heart rate of the patient, measured in beats per minute (bpm).
    - respiratory_rate: The respiratory rate of the patient, measured in breaths per minute.
    - temperature: The body temperature of the patient, measured in degrees Celsius.
    - oxygen_saturation: The oxygen saturation (SpO2) level of the patient, typically expressed as a percentage.
    - blood_glucose_levels_mg: The blood glucose level of the patient, measured in mg/dL.
    - blood_glucose_levels_mmol: The blood glucose level of the patient, measured in mmol/L.
    - sodium_levels: The concentration of sodium in the patient's blood, measured in mmol/L.
    - potassium_levels: The concentration of potassium in the patient's blood, measured in mmol/L.
    - chloride_levels: The concentration of chloride in the patient's blood, measured in mmol/L.
    - creatinine_mg: The serum creatinine level of the patient, measured in mg/dL, indicative of renal function.
    - creatinine_mmol: The serum creatinine level of the patient, measured in μmol/L, indicative of renal function.
    - blood_urea_nitrogen: The BUN level of the patient, typically measured in mg/dL, indicative of renal function.
    - alt: The alanine transaminase level of the patient, indicative of liver function, typically measured in U/L.
    - ast: The aspartate transaminase level of the patient, indicative of liver function, typically measured in U/L.

    If an attribute's value is not mentioned or cannot be precisely determined from the prompt, assign it the value `None`.  
    It is essential not to infer or invent any values. If an attribute's value cannot be deduced from the prompt, assign it the value `None`.

'''

class PromptAttributes(BaseModel):
    gender: str = None
    ethnicity_nationality: str = None
    socioeconomic_status: str = None
    occupation: str = None
    lifestyle_smoking: str = None
    lifestyle_alcohol_consumption: str = None
    lifestyle_diet: str = None
    lifestyle_physical_activity: str = None
    chronic_conditions: str = None
    family_medical_history: str = None
    allergies: str = None
    recent_travel_history: str = None
    chief_complaint: str = None
    duration_and_onset: str = None
    pattern_of_symptoms: str = None
    associated_symptoms: str = None
    severity_of_symptoms: str = None
    mental_health_history: str = None
    stress_levels: str = None
    coping_mechanisms: str = None
    support_systems: str = None
    urgency_of_care_needed: str = None
    treatment_adherence: str = None
    age: int = None
    height: float = None
    weight: float = None
    systolic_blood_pressure: float = None
    diastolic_blood_pressure: float = None
    heart_rate: float = None
    respiratory_rate: float = None
    temperature: float = None
    oxygen_saturation: float = None
    blood_glucose_levels_mg: float = None
    blood_glucose_levels_mmol: float = None
    sodium_levels: float = None
    potassium_levels: float = None
    chloride_levels: float = None
    creatinine_mg: float = None
    creatinine_mmol: float = None
    blood_urea_nitrogen: float = None
    alt: float = None
    ast: float = None

def get_prompt_attributes(prompt_to_analyze: str):
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        temperature=0,
        messages=[
            {"role": "system", "content": dedent(classification_prompt)},
            {"role": "user", "content": prompt_to_analyze}
        ],
        response_format=PromptAttributes,
    )

    return completion.choices[0].message.parsed

In [6]:
import json
import random

def sample_json_entry(file_path):
    """
    Samples one entry from a JSON file and returns the values of the 'id' and 'prompt' fields.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        dict: A dictionary containing the 'id' and 'prompt' fields of the sampled entry.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    # Ensure the JSON is a list of entries
    if not isinstance(data, list):
        raise ValueError("The JSON file must contain a list of entries.")
    
    # Sample one entry at random
    random_entry = random.choice(data)
    
    # Extract 'id' and 'prompt' fields
    return random_entry.get("id"), random_entry.get("prompt")

In [10]:
id_, prompt = sample_json_entry("../results/parsed_prompts_task_x_specialties_x_demographic_x_answerstyle.json")
print(id_)
print(prompt)

B-1670-1D-Philippines
A 10-year-old female patient in the Philippines has a history of recurrent respiratory infections and has been diagnosed with mild asthma. Her family lives in an urban area with high pollution levels. She follows a typical Filipino diet and is involved in moderate physical activities at school. Given her medical history and environmental exposure, what potential health risks should be prioritized during her assessment, and what preventive measures can be recommended?
Provide an expert-level explanation.


In [11]:
get_prompt_attributes(prompt)

PromptAttributes(gender='Female', ethnicity_nationality='Asian', socioeconomic_status='None', occupation='Student', lifestyle_smoking='None', lifestyle_alcohol_consumption='None', lifestyle_diet='Other', lifestyle_physical_activity='Moderately active', chronic_conditions='Asthma', family_medical_history='None', allergies='None', recent_travel_history='None', chief_complaint='None', duration_and_onset='Recurrent', pattern_of_symptoms='None', associated_symptoms='None', severity_of_symptoms='Mild', mental_health_history='None', stress_levels='None', coping_mechanisms='None', support_systems='None', urgency_of_care_needed='None', treatment_adherence='None', age=10, height=0.0, weight=0.0, systolic_blood_pressure=0.0, diastolic_blood_pressure=0.0, heart_rate=0.0, respiratory_rate=0.0, temperature=0.0, oxygen_saturation=0.0, blood_glucose_levels_mg=0.0, blood_glucose_levels_mmol=0.0, sodium_levels=0.0, potassium_levels=0.0, chloride_levels=0.0, creatinine_mg=0.0, creatinine_mmol=0.0, blood_