In [None]:
import os
import sys

# Make the directory two levels above the working directory importable,
# so that the `src` package used by later cells can be resolved.
src_path = os.path.abspath('../..')
print(src_path)
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from src.utils import create_directory, raw_data_path, processed_data_path, set_seed

In [None]:
set_seed(seed=42)

In [None]:
import pandas as pd

In [None]:
# Locations of the MIMIC-IV and MIMIC-IV-Note (both 2.2) directories under
# raw_data_path. Neither is read in the visible cells of this notebook.
mimic_iv_path = os.path.join(raw_data_path, "physionet.org/files/mimiciv/2.2")
mimic_iv_note_path = os.path.join(raw_data_path, "physionet.org/files/mimic-iv-note/2.2")
# Directory holding cohort.csv and the per-admission event_<type>/ files read
# below; the generated QA file is written here as well.
output_path = os.path.join(processed_data_path, "mimic4")

In [None]:
# Load the cohort table from the processed-data directory, then show its
# (rows, columns) shape and the first few rows as a sanity check.
cohort = pd.read_csv(os.path.join(output_path, "cohort.csv"))
print(cohort.shape)
cohort.head()

In [None]:
# Parse the admission/stay boundary columns into pandas datetimes, in place.
for time_col in ("hadm_intime", "hadm_outtime", "stay_intime", "stay_outtime"):
    cohort[time_col] = pd.to_datetime(cohort[time_col])

In [None]:
# Distinct admission ids of the cohort (as a Python set); the QA generation
# loop further down iterates over this set.
hadm_ids = set(cohort.hadm_id.unique().tolist())
len(hadm_ids)

## event

In [None]:
# Lookup table: admission id -> value of the cohort's "hadm_los" column.
# sample_time_period uses it as the length of the stay in hours.
hadm_id_to_max_hours = cohort.set_index("hadm_id")["hadm_los"].to_dict()

In [None]:
import random

In [None]:
def read_event_df(hadm_id, event_type):
    """Load the event table of one type for one admission.

    Args:
        hadm_id: Admission id; selects the file ``event_<hadm_id>.csv``.
        event_type: Event type; selects the ``event_<event_type>`` sub-directory
            of ``output_path``.

    Returns:
        The CSV contents as a ``pandas.DataFrame``.
    """
    csv_path = os.path.join(output_path, f"event_{event_type}/event_{hadm_id}.csv")
    return pd.read_csv(csv_path)

In [None]:
hadm_id = 28141610

In [None]:
hadm_id_to_max_hours[hadm_id]

In [None]:
def generate_qa_patient_demographics(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's demographics.

    Args:
        hadm_id: Admission id whose ``patient_demographics`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.

    Raises:
        AssertionError: If the event file does not hold exactly one row.
    """
    demographics = read_event_df(hadm_id, "patient_demographics")
    assert len(demographics) == 1
    record = demographics.iloc[0]

    qa_pairs = [
        ("What was the gender of the patient?", record.meta_gender),
        ("What was the age of the patient?", str(record.meta_age)),
        ("What was the race of the patient?", record.meta_race),
        ("What was the insurance of the patient?", record.meta_insurance),
    ]
    # Marital status may be missing, so only ask about it when it is recorded.
    if pd.notna(record.meta_marital_status):
        qa_pairs.append(("What was the marital status of the patient?", record.meta_marital_status))

    return random.choice(qa_pairs) if return_one else qa_pairs

In [None]:
generate_qa_patient_demographics(hadm_id, return_one=False)

In [None]:
def generate_qa_admission_info(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's admission info.

    Args:
        hadm_id: Admission id whose ``admission_info`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.

    Raises:
        AssertionError: If the event file does not hold exactly one row.
    """
    df = read_event_df(hadm_id, "admission_info")
    assert len(df) == 1
    row = df.iloc[0]

    available_qa = [
        ("What was the admission type of the patient?", row.meta_admission_type),
        ("What was the admission location of the patient?", row.meta_admission_location),
    ]
    # The chief complaint may be missing; only ask about it when it is recorded.
    if not pd.isna(row.meta_chief_complaint):
        # The question previously read "What aws ..."; the typo ended up
        # verbatim in the generated dataset.
        available_qa.append(("What was the chief complaint of the patient?", row.meta_chief_complaint))

    if return_one:
        return random.choice(available_qa)
    return available_qa

In [None]:
generate_qa_admission_info(hadm_id, return_one=False)

In [None]:
def generate_qa_diagnoses_icd(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's billed diagnoses.

    The answers join the ``meta_long_title`` values with "; " in the order the
    rows appear in the event file.
    NOTE(review): "first" / "top three" / "top five" therefore assume the file
    is already ordered by diagnosis priority - confirm against the code that
    writes the event files.

    Args:
        hadm_id: Admission id whose ``diagnoses_icd`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "diagnoses_icd")
    # Materialise the titles once instead of rebuilding the list per question.
    titles = df.meta_long_title.tolist()

    # Question wording fixed: "billled" -> "billed", "diagnose" -> "diagnosis".
    available_qa = [
        ("What were the billed diagnoses of the patient?", "; ".join(titles)),
        ("What was the first billed diagnosis of the patient?", "; ".join(titles[:1])),
        ("What were the top three billed diagnoses of the patient?", "; ".join(titles[:3])),
        ("What were the top five billed diagnoses of the patient?", "; ".join(titles[:5])),
    ]

    if return_one:
        return random.choice(available_qa)
    return available_qa

In [None]:
generate_qa_diagnoses_icd(hadm_id, return_one=False)

In [None]:
import warnings


def sample_time_period(df, enforce_non_empty=True):
    """Pick a random time window of the stay and return the events inside it.

    Candidate windows are the first/last 12, 24 and 48 hours of the stay plus
    up to three randomly drawn calendar-style days ("during day d").
    "first N hours" keeps timestamps ``< N`` and "last N hours" keeps
    timestamps ``>= max_hours - N``, where ``max_hours`` is looked up in
    ``hadm_id_to_max_hours`` for the admission of the first row of ``df``.

    Args:
        df: Event rows of a single admission; must be non-empty and have
            ``hadm_id`` and ``timestamp`` (hours) columns.
        enforce_non_empty: When True, only return a window that contains at
            least one row of ``df``; when False, return the first sampled
            window even if it is empty.

    Returns:
        ``(description, rows)`` where ``description`` is the window phrase used
        in the question text and ``rows`` is the matching subset of ``df``.
        If no candidate window contains any row, a warning is issued and
        ``("during the entire stay", df)`` is returned.
    """
    # The stay length comes from the cohort table rather than
    # df.timestamp.max(), because df is usually only a sub-sequence of events.
    max_hours = hadm_id_to_max_hours[df.hadm_id.iloc[0]]
    max_days = int(max_hours // 24 + 1)

    available = {
        "during the first 12 hours": lambda x: x < 12,
        "during the first 24 hours": lambda x: x < 24,
        "during the first 48 hours": lambda x: x < 48,
        "during the last 12 hours": lambda x: x >= max_hours - 12,
        "during the last 24 hours": lambda x: x >= max_hours - 24,
        "during the last 48 hours": lambda x: x >= max_hours - 48,
    }
    for _ in range(3):
        d = random.choice(range(1, max_days + 1))
        # d=d binds the current day; a plain closure would see the last value.
        available[f"during day {d}"] = lambda x, d=d: ((d - 1) * 24) <= x < (d * 24)

    # Sample without replacement: a window that turned out empty is never
    # drawn again, so the search is exhaustive and ends after at most
    # len(available) draws (previously: up to 101 draws with repeats).
    candidates = list(available.keys())
    while candidates:
        s = random.choice(candidates)
        tmp = df[df.timestamp.apply(available[s])]
        if not enforce_non_empty or len(tmp) > 0:
            return s, tmp
        candidates.remove(s)

    warnings.warn(f"Could not enforce a non-empty time period: len={len(df)}")
    return "during the entire stay", df

In [None]:
import numpy as np


def generate_qa_labevents(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's lab events.

    One lab measurement row is sampled at random and questions are generated
    about it: its value and normality at the exact time, the other
    measurements on the same specimen at that time, and first/last/count/
    aggregate questions over a randomly sampled time period.

    NOTE(review): the "first"/"last" answers take the first/last row by
    position, so they assume the event file is sorted by timestamp - confirm.

    Args:
        hadm_id: Admission id whose ``labevents`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "labevents")
    available_qa = []

    # "event" names a measurement as "<fluid> <label> <category>";
    # "value" is the measured value, followed by its unit when one is recorded.
    df["event"] = df.apply(lambda x: f"{x.meta_fluid} {x.meta_label} {x.meta_category}", axis=1)
    df["value"] = df.apply(lambda x: f"{x.meta_value}" if pd.isna(x.meta_valueuom) else f"{x.meta_value} {x.meta_valueuom}", axis=1)

    # Pick an event type uniformly first, then one of its rows, so frequently
    # measured events are not over-represented.
    x = df[df.event == random.choice(df.event.unique())].sample()
    x = x.iloc[0]

    # what was the {value} of the {event} at the {time_exact} hour?
    q = f"What was the {x.event} measurement at the {x.timestamp:.2f} hour?"
    a = x.value
    available_qa.append((q, a))

    # was the {event} at the {time_exact} hour normal?
    # A missing meta_flag is treated as "normal".
    q = f"Was the {x.event} measurement at the {x.timestamp:.2f} hour normal?"
    if pd.isna(x.meta_flag):
        a = "Yes"
    else:
        a = "No"
    available_qa.append((q, a))

    # what {measurements} were performed on the {specimen} at the {time_exact} hour?
    df_tmp = df[(df.timestamp == x.timestamp) & (df.meta_fluid == x.meta_fluid) & (df.meta_category == x.meta_category)]
    q = f"What {x.meta_category} measurements were performed on the {x.meta_fluid} specimen at the {x.timestamp:.2f} hour?"
    a = ", ".join(df_tmp.meta_label.tolist())
    available_qa.append((q, a))

    # what {measurements} performed on the {specimen} were abnormal at the {time_exact} hour?
    # Narrows the same-time/same-specimen rows to those that carry a flag.
    df_tmp = df_tmp[~pd.isna(df_tmp.meta_flag)]
    q = f"What {x.meta_category} measurements on the {x.meta_fluid} specimen were abnormal at the {x.timestamp:.2f} hour?"
    a = ", ".join(df_tmp.meta_label.tolist())
    if len(a) == 0:
        a = "All lab tests were normal"
    available_qa.append((q, a))

    # what was the {value} of the {time_select} {event} during {time_period}?
    # From here on df_tmp holds the sampled event's rows inside time period s.
    df_tmp = df[df.event == x.event]
    s, df_tmp = sample_time_period(df_tmp)
    q = f"What was the first {x.event} measurement {s}?"
    a = df_tmp.iloc[0].value
    available_qa.append((q, a))
    q = f"What was the last {x.event} measurement {s}?"
    a = df_tmp.iloc[-1].value
    available_qa.append((q, a))

    # when was the {time_select} {event} during {time_period}?
    q = f"When was the first {x.event} measurement {s}?"
    a = f"At the {df_tmp.iloc[0].timestamp:.2f} hour"
    available_qa.append((q, a))
    q = f"When was the last {x.event} measurement {s}?"
    a = f"At the {df_tmp.iloc[-1].timestamp:.2f} hour"
    available_qa.append((q, a))

    # count the number of times the patient had {event} during {time_period}?
    q = f"How many times did the patient have the {x.event} measurement {s}?"
    a = f"{len(df_tmp)} times" if len(df_tmp) > 1 else "1 time"
    available_qa.append((q, a))

    # what was the {agg_function} {value} of the {event} during {time_period}?
    # Aggregates are only asked when all rows share one known unit and every
    # value parses as a float; otherwise the three questions are skipped.
    unique_units = df_tmp.meta_valueuom.unique()
    if len(unique_units) == 1 and not pd.isna(unique_units[0]):
        try:
            unique_values = [float(v) for v in df_tmp.meta_value]
            q = f"What was the maximum {x.event} measurement {s}?"
            a = f"{np.max(unique_values)} {unique_units[0]}"
            available_qa.append((q, a))
            q = f"What was the minimum {x.event} measurement {s}?"
            a = f"{np.min(unique_values)} {unique_units[0]}"
            available_qa.append((q, a))
            q = f"What was the average {x.event} measurement {s}?"
            a = f"{np.mean(unique_values):.2f} {unique_units[0]}"
            available_qa.append((q, a))
        except ValueError:
            # Non-numeric value (float() failed): no aggregate questions.
            pass

    # was the patient having {event} during {time_period}?
    # A fresh period is sampled over ALL lab rows and may be empty, so the
    # answer can legitimately be "No".
    s, df_tmp = sample_time_period(df, enforce_non_empty=False)
    q = f"Did the patient have any {x.event} measurement {s}?"
    if x.event in df_tmp.event.unique():
        a = "Yes"
    else:
        a = "No"
    available_qa.append((q, a))

    if return_one:
        return random.choice(available_qa)
    else:
        return available_qa

In [None]:
generate_qa_labevents(hadm_id, return_one=False)

In [None]:
def generate_qa_microbiologyevents(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's microbiology events.

    One microbiology row is sampled at random; questions cover the tests run
    on its specimen at that time, the organisms found, the antibiotic results
    for one found organism, and whether the specimen type was tested during a
    randomly sampled time period.

    Args:
        hadm_id: Admission id whose ``microbiologyevents`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "microbiologyevents")
    available_qa = []

    x = df.sample()
    x = x.iloc[0]

    # what {tests} were performed on the {specimen} at the {time_exact} hour?
    df_tmp = df[(df.timestamp == x.timestamp) & (df.meta_spec_type_desc == x.meta_spec_type_desc)]
    q = f"What microbiology tests were performed on the {x.meta_spec_type_desc} specimen at the {x.timestamp:.2f} hour?"
    a = ", ".join(df_tmp.meta_test_name.unique())
    available_qa.append((q, a))

    # what organisms were found on the {specimen} at the {time_exact} hour?
    # Rows without an organism name are dropped; none left means no growth.
    df_tmp = df_tmp[~pd.isna(df_tmp.meta_org_name)]
    q = f"What organisms were found on the {x.meta_spec_type_desc} specimen at the {x.timestamp:.2f} hour?"
    a = ", ".join(df_tmp.meta_org_name.unique())
    if len(a) == 0:
        a = "No growth was found"
    available_qa.append((q, a))

    # what were the antibiotic results against one found {organism} at the {time_exact} hour?
    if len(df_tmp) > 0:
        # x is re-sampled among the organism rows; these rows share the
        # original x's timestamp and specimen type (df_tmp was filtered on both).
        x = df_tmp.sample().iloc[0]
        df_tmp = df_tmp[df_tmp.meta_org_name == x.meta_org_name]
        q = f"What were the antibiotics test results against the {x.meta_org_name} on the {x.meta_spec_type_desc} specimen at the {x.timestamp:.2f} hour?"
        df_tmp = df_tmp[~pd.isna(df_tmp.meta_ab_name)]
        a = ", ".join([f"{ab}: {res}" for ab, res in zip(df_tmp.meta_ab_name.tolist(), df_tmp.meta_interpretation.tolist())])
        if len(a) == 0:
            a = "No antibiotics was tested"
        available_qa.append((q, a))

    # was the patient having {event} during {time_period}?
    # The period is sampled over ALL microbiology rows and may be empty, so
    # the answer can legitimately be "No".
    s, df_tmp = sample_time_period(df, enforce_non_empty=False)
    q = f"Did the patient have any microbiology test on the {x.meta_spec_type_desc} specimen {s}?"
    if x.meta_spec_type_desc in df_tmp.meta_spec_type_desc.unique():
        a = "Yes"
    else:
        a = "No"
    available_qa.append((q, a))

    if return_one:
        return random.choice(available_qa)
    else:
        return available_qa

In [None]:
hadm_id = 24248394

In [None]:
generate_qa_microbiologyevents(hadm_id, return_one=False)

In [None]:
def generate_qa_prescriptions(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's prescriptions.

    One prescription row ``x`` is sampled at random. Questions cover that row
    at its exact time (composition, dose, route, duration, co-prescribed
    drugs) and the first/last prescription of the same drug inside a randomly
    sampled time period (composition, dose, route, duration, time, count),
    plus an existence question over a second sampled period.

    The first/last composition and dose questions are guarded on the row that
    actually provides the answer. Previously they were guarded on the sampled
    row ``x``, so a first/last row with a missing value could yield a NaN or
    "nan" answer, and a valid one could be skipped.

    NOTE(review): ``meta_route`` and ``meta_duration`` are not checked for
    missing values (unchanged) - confirm they are always populated.
    NOTE(review): "first"/"last" are positional, so they assume the event file
    is sorted by timestamp - confirm.

    Args:
        hadm_id: Admission id whose ``prescriptions`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "prescriptions")
    available_qa = []

    def format_dose(row):
        """Render a row's dose value, followed by its unit when one is recorded."""
        dose = f"{row.meta_dose_val_rx}"
        if not pd.isna(row.meta_dose_unit_rx):
            dose += f" {row.meta_dose_unit_rx}"
        return dose

    x = df.sample().iloc[0]

    # what was the {value} of the {event} at the {time_exact} hour?
    if not pd.isna(x.meta_prod_strength):
        q = f"What was the composition of the prescribed {x.meta_drug} at the {x.timestamp:.2f} hour?"
        available_qa.append((q, x.meta_prod_strength))

    if not pd.isna(x.meta_dose_val_rx):
        q = f"What was the dose of the prescribed {x.meta_drug} at the {x.timestamp:.2f} hour?"
        available_qa.append((q, format_dose(x)))

    q = f"What was the administration route of the prescribed {x.meta_drug} at the {x.timestamp:.2f} hour?"
    available_qa.append((q, x.meta_route))

    q = f"What was the administration duration of the prescribed {x.meta_drug} at the {x.timestamp:.2f} hour?"
    available_qa.append((q, f"{x.meta_duration:.2f} hours"))

    # what drugs were prescribed at the {time_exact} hour?
    df_tmp = df[(df.timestamp == x.timestamp)]
    q = f"What drugs were prescribed at the {x.timestamp:.2f} hour?"
    available_qa.append((q, ", ".join(df_tmp.meta_drug.unique())))

    # what was the {value} of the {time_select} {event} during {time_period}?
    # From here on df_tmp holds the sampled drug's rows inside time period s.
    df_tmp = df[df.meta_drug == x.meta_drug]
    s, df_tmp = sample_time_period(df_tmp)
    selected = (("first", df_tmp.iloc[0]), ("last", df_tmp.iloc[-1]))

    for which, row in selected:
        # Guard on the answering row itself, not on x.
        if not pd.isna(row.meta_prod_strength):
            q = f"What was the composition of the {which} prescribed {x.meta_drug} {s}?"
            available_qa.append((q, row.meta_prod_strength))

    for which, row in selected:
        # Guard on the answering row itself, not on x.
        if not pd.isna(row.meta_dose_val_rx):
            q = f"What was the dose of the {which} prescribed {x.meta_drug} {s}?"
            available_qa.append((q, format_dose(row)))

    for which, row in selected:
        q = f"What was the administration route of the {which} prescribed {x.meta_drug} {s}?"
        available_qa.append((q, row.meta_route))

    for which, row in selected:
        q = f"What was the administration duration of the {which} prescribed {x.meta_drug} {s}?"
        available_qa.append((q, f"{row.meta_duration:.2f} hours"))

    # when was the {time_select} {event} during {time_period}?
    for which, row in selected:
        q = f"When was the {which} {x.meta_drug} prescription {s}?"
        available_qa.append((q, f"At the {row.timestamp:.2f} hour"))

    # count the number of times the patient had {event} during {time_period}?
    q = f"How many times did the patient have the {x.meta_drug} prescription {s}?"
    a = f"{len(df_tmp)} times" if len(df_tmp) > 1 else "1 time"
    available_qa.append((q, a))

    # was the patient having {event} during {time_period}?
    # A fresh period is sampled over ALL prescriptions and may be empty, so
    # the answer can legitimately be "No".
    s, df_tmp = sample_time_period(df, enforce_non_empty=False)
    q = f"Was the patient prescribed with any {x.meta_drug} {s}?"
    a = "Yes" if x.meta_drug in df_tmp.meta_drug.unique() else "No"
    available_qa.append((q, a))

    if return_one:
        return random.choice(available_qa)
    return available_qa

In [None]:
hadm_id = 27262979

In [None]:
generate_qa_prescriptions(hadm_id, return_one=False)

In [None]:
def generate_qa_transfers(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's transfers.

    Asks which care unit the patient was moved to at the time of one randomly
    sampled transfer row (only when that row records a care unit), when the
    patient was discharged, and how long the stay was (in hours and in days).

    Args:
        hadm_id: Admission id whose ``transfers`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.

    Raises:
        AssertionError: If the file does not contain exactly one row whose
            ``event_value`` is "discharge".
    """
    transfers = read_event_df(hadm_id, "transfers")
    qa_pairs = []

    # Exact-time question about one randomly sampled transfer row.
    picked = transfers.sample().iloc[0]
    unit_question = f"Which unit was the patient transferred to at the {picked.timestamp:.2f} hour?"
    if pd.notna(picked.meta_careunit):
        qa_pairs.append((unit_question, picked.meta_careunit))

    # The discharge row's timestamp is both the discharge time and the
    # length of stay, reported in hours and in days.
    discharge_rows = transfers[transfers.event_value == "discharge"]
    assert len(discharge_rows) == 1
    discharge_hour = discharge_rows.iloc[0].timestamp
    qa_pairs.extend([
        ("When was the patient discharged from the hospital?", f"At the {discharge_hour:.2f} hour"),
        ("How long was the length of hospital stay of the patient?", f"{discharge_hour:.2f} hours"),
        ("How long was the length of hospital stay of the patient?", f"{discharge_hour / 24:.2f} days"),
    ])

    return random.choice(qa_pairs) if return_one else qa_pairs

In [None]:
hadm_id = 29622279

In [None]:
generate_qa_transfers(hadm_id, return_one=False)

In [None]:
def generate_qa_chartevents(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's chart events.

    One chart row is sampled at random and questions are generated about its
    value at the exact time, and about first/last/count/aggregate values of
    the same event over a randomly sampled time period.

    NOTE(review): the "first"/"last" answers take the first/last row by
    position, so they assume the event file is sorted by timestamp - confirm.

    Args:
        hadm_id: Admission id whose ``chartevents`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "chartevents")
    available_qa = []

    # "event" is the chart label; "value" is the charted value, followed by
    # its unit when one is recorded.
    df["event"] = df.meta_label
    df["value"] = df.apply(lambda x: f"{x.meta_value}" if pd.isna(x.meta_valueuom) else f"{x.meta_value} {x.meta_valueuom}", axis=1)

    # Pick an event type uniformly first, then one of its rows, so frequently
    # charted events are not over-represented.
    x = df[df.event == random.choice(df.event.unique())].sample()
    x = x.iloc[0]

    # what was the {value} of the {event} at the {time_exact} hour?
    q = f"What was the {x.event} value at the {x.timestamp:.2f} hour?"
    a = x.value
    available_qa.append((q, a))

    # what was the {value} of the {time_select} {event} during {time_period}?
    # From here on df_tmp holds the sampled event's rows inside time period s.
    df_tmp = df[df.event == x.event]
    s, df_tmp = sample_time_period(df_tmp)
    q = f"What was the first {x.event} value {s}?"
    a = df_tmp.iloc[0].value
    available_qa.append((q, a))
    q = f"What was the last {x.event} value {s}?"
    a = df_tmp.iloc[-1].value
    available_qa.append((q, a))

    # when was the {time_select} {event} during {time_period}?
    q = f"When was the first {x.event} value {s}?"
    a = f"At the {df_tmp.iloc[0].timestamp:.2f} hour"
    available_qa.append((q, a))
    q = f"When was the last {x.event} value {s}?"
    a = f"At the {df_tmp.iloc[-1].timestamp:.2f} hour"
    available_qa.append((q, a))

    # count the number of times the patient had {event} during {time_period}?
    q = f"How many times did the patient have the {x.event} value {s}?"
    a = f"{len(df_tmp)} times" if len(df_tmp) > 1 else "1 time"
    available_qa.append((q, a))

    # what was the {agg_function} {value} of the {event} during {time_period}?
    # Aggregates are only asked when all rows share one known unit and every
    # value parses as a float; otherwise the three questions are skipped.
    unique_units = df_tmp.meta_valueuom.unique()
    if len(unique_units) == 1 and not pd.isna(unique_units[0]):
        try:
            unique_values = [float(v) for v in df_tmp.meta_value]
            q = f"What was the maximum {x.event} value {s}?"
            a = f"{np.max(unique_values)} {unique_units[0]}"
            available_qa.append((q, a))
            q = f"What was the minimum {x.event} value {s}?"
            a = f"{np.min(unique_values)} {unique_units[0]}"
            available_qa.append((q, a))
            q = f"What was the average {x.event} value {s}?"
            a = f"{np.mean(unique_values):.2f} {unique_units[0]}"
            available_qa.append((q, a))
        except ValueError:
            # Non-numeric value (float() failed): no aggregate questions.
            pass

    # was the patient having {event} during {time_period}?
    # A fresh period is sampled over ALL chart rows and may be empty, so the
    # answer can legitimately be "No".
    s, df_tmp = sample_time_period(df, enforce_non_empty=False)
    q = f"Did the patient have any {x.event} value {s}?"
    if x.event in df_tmp.event.unique():
        a = "Yes"
    else:
        a = "No"
    available_qa.append((q, a))

    if return_one:
        return random.choice(available_qa)
    else:
        return available_qa

In [None]:
generate_qa_chartevents(hadm_id, return_one=False)

In [None]:
def generate_qa_inputevents(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's input events.

    One input (IV administration) row is sampled at random. Questions cover
    its amount and duration at the exact time, everything administered at
    that time, and first/last/count questions for the same item over a
    randomly sampled time period, plus an existence question over a second
    sampled period.

    NOTE(review): the "first"/"last" answers take the first/last row by
    position, so they assume the event file is sorted by timestamp - confirm.

    Args:
        hadm_id: Admission id whose ``inputevents`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "inputevents")
    available_qa = []

    x = df.sample()
    x = x.iloc[0]

    # what was the {value} of the {event} at the {time_exact} hour?
    q = f"What was the amount of the IV administration {x.meta_label} at the {x.timestamp:.2f} hour?"
    a = f"{x.meta_amount:.2f} {x.meta_amountuom}"
    available_qa.append((q, a))

    q = f"What was the duration of IV administration {x.meta_label} at the {x.timestamp:.2f} hour?"
    a = f"{x.meta_duration:.2f} hours"
    available_qa.append((q, a))

    # what drugs were administered through IV at the {time_exact} hour?
    df_tmp = df[(df.timestamp == x.timestamp)]
    q = f"What drugs were administered through IV at the {x.timestamp:.2f} hour?"
    a = ", ".join(df_tmp.meta_label.unique())
    available_qa.append((q, a))

    # what was the {value} of the {time_select} {event} during {time_period}?
    # From here on df_tmp holds the sampled item's rows inside time period s.
    df_tmp = df[df.meta_label == x.meta_label]
    s, df_tmp = sample_time_period(df_tmp)

    q = f"What was the amount of the first IV administration {x.meta_label} {s}?"
    a = f"{df_tmp.iloc[0].meta_amount:.2f} {df_tmp.iloc[0].meta_amountuom}"
    available_qa.append((q, a))

    q = f"What was the amount of the last IV administration {x.meta_label} {s}?"
    a = f"{df_tmp.iloc[-1].meta_amount:.2f} {df_tmp.iloc[-1].meta_amountuom}"
    available_qa.append((q, a))

    q = f"What was the duration of the first IV administration {x.meta_label} {s}?"
    a = f"{df_tmp.iloc[0].meta_duration:.2f} hours"
    available_qa.append((q, a))

    q = f"What was the duration of the last IV administration {x.meta_label} {s}?"
    a = f"{df_tmp.iloc[-1].meta_duration:.2f} hours"
    available_qa.append((q, a))

    # when was the {time_select} {event} during {time_period}?
    q = f"When was the first {x.meta_label} IV administration {s}?"
    a = f"At the {df_tmp.iloc[0].timestamp:.2f} hour"
    available_qa.append((q, a))
    q = f"When was the last {x.meta_label} IV administration {s}?"
    a = f"At the {df_tmp.iloc[-1].timestamp:.2f} hour"
    available_qa.append((q, a))

    # count the number of times the patient had {event} during {time_period}?
    q = f"How many times did the patient have the {x.meta_label} IV administration {s}?"
    a = f"{len(df_tmp)} times" if len(df_tmp) > 1 else "1 time"
    available_qa.append((q, a))

    # was the patient having {event} during {time_period}?
    # A fresh period is sampled over ALL input rows and may be empty, so the
    # answer can legitimately be "No".
    s, df_tmp = sample_time_period(df, enforce_non_empty=False)
    q = f"Was the patient administered with any {x.meta_label} through IV {s}?"
    if x.meta_label in df_tmp.meta_label.unique():
        a = "Yes"
    else:
        a = "No"
    available_qa.append((q, a))

    if return_one:
        return random.choice(available_qa)
    else:
        return available_qa

In [None]:
hadm_id = 24248394

In [None]:
generate_qa_inputevents(hadm_id, return_one=False)

In [None]:
def generate_qa_outputevents(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's output events.

    One output row is sampled at random. Questions cover its amount at the
    exact time, and first/last/total/max/min/average/count questions for the
    same output over a randomly sampled time period, plus an existence
    question over a second sampled period.

    NOTE(review): the "first"/"last" answers take the first/last row by
    position, so they assume the event file is sorted by timestamp - confirm.

    Args:
        hadm_id: Admission id whose ``outputevents`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "outputevents")
    available_qa = []

    x = df.sample()
    x = x.iloc[0]

    # what was the {value} of the {event} at the {time_exact} hour?
    q = f"What was the amount of the output {x.meta_label} at the {x.timestamp:.2f} hour?"
    a = f"{x.meta_value:.2f} {x.meta_valueuom}"
    available_qa.append((q, a))

    # what was the {value} of the {time_select} {event} during {time_period}?
    # From here on df_tmp holds the sampled output's rows inside time period s.
    df_tmp = df[df.meta_label == x.meta_label]
    s, df_tmp = sample_time_period(df_tmp)

    q = f"What was the amount of the first output {x.meta_label} {s}?"
    a = f"{df_tmp.iloc[0].meta_value:.2f} {df_tmp.iloc[0].meta_valueuom}"
    available_qa.append((q, a))

    q = f"What was the amount of the last output {x.meta_label} {s}?"
    a = f"{df_tmp.iloc[-1].meta_value:.2f} {df_tmp.iloc[-1].meta_valueuom}"
    available_qa.append((q, a))

    # Aggregates are only asked when every row in the period uses the same unit.
    # NOTE(review): unlike the lab/chart generators, a missing (NaN) unit is
    # not excluded here - confirm units are always present for output events.
    unique_units = df_tmp.meta_valueuom.unique()
    if len(unique_units) == 1:
        q = f"What was the total amount of the output {x.meta_label} {s}?"
        a = f"{df_tmp.meta_value.sum():.2f} {unique_units[0]}"
        available_qa.append((q, a))

        q = f"What was the maximum amount of the output {x.meta_label} {s}?"
        a = f"{df_tmp.meta_value.max():.2f} {unique_units[0]}"
        available_qa.append((q, a))

        q = f"What was the minimum amount of the output {x.meta_label} {s}?"
        a = f"{df_tmp.meta_value.min():.2f} {unique_units[0]}"
        available_qa.append((q, a))

        q = f"What was the average amount of the output {x.meta_label} {s}?"
        a = f"{df_tmp.meta_value.mean():.2f} {unique_units[0]}"
        available_qa.append((q, a))

    # when was the {time_select} {event} during {time_period}?
    q = f"When was the first output {x.meta_label} {s}?"
    a = f"At the {df_tmp.iloc[0].timestamp:.2f} hour"
    available_qa.append((q, a))
    q = f"When was the last output {x.meta_label} {s}?"
    a = f"At the {df_tmp.iloc[-1].timestamp:.2f} hour"
    available_qa.append((q, a))

    # count the number of times the patient had {event} during {time_period}?
    q = f"How many times did the patient have the {x.meta_label} output {s}?"
    a = f"{len(df_tmp)} times" if len(df_tmp) > 1 else "1 time"
    available_qa.append((q, a))

    # was the patient having {event} during {time_period}?
    # A fresh period is sampled over ALL output rows and may be empty, so the
    # answer can legitimately be "No".
    s, df_tmp = sample_time_period(df, enforce_non_empty=False)
    q = f"Did the patient have any {x.meta_label} output {s}?"
    if x.meta_label in df_tmp.meta_label.unique():
        a = "Yes"
    else:
        a = "No"
    available_qa.append((q, a))

    if return_one:
        return random.choice(available_qa)
    else:
        return available_qa

In [None]:
generate_qa_outputevents(hadm_id, return_one=False)

In [None]:
def generate_qa_procedureevents(hadm_id, return_one=True):
    """Build template question/answer pairs from an admission's procedure events.

    One procedure row is sampled at random. Questions cover the procedures
    performed at its exact time, the procedures performed in a randomly
    sampled time period, and first/last/count questions for the same
    procedure over another sampled period, plus an existence question over a
    further sampled period.

    NOTE(review): the "first"/"last" answers take the first/last row by
    position, so they assume the event file is sorted by timestamp - confirm.

    Args:
        hadm_id: Admission id whose ``procedureevents`` event file is read.
        return_one: When True, return one randomly chosen pair; otherwise
            return every generated pair.

    Returns:
        A ``(question, answer)`` tuple, or a list of such tuples.
    """
    df = read_event_df(hadm_id, "procedureevents")
    available_qa = []

    x = df.sample()
    x = x.iloc[0]

    # what procedures were performed at the {time_exact} hour?
    df_tmp = df[(df.timestamp == x.timestamp)]
    q = f"What procedures were performed at the {x.timestamp:.2f} hour?"
    a = ", ".join(df_tmp.meta_label.unique())
    available_qa.append((q, a))

    # what procedures were performed during {time_period}?
    # The period is sampled over all procedures and required to be non-empty.
    s, df_tmp = sample_time_period(df)
    q = f"What procedures were performed {s}?"
    a = ", ".join(df_tmp.meta_label.unique())
    available_qa.append((q, a))

    # what was the {value} of the {time_select} {event} during {time_period}?
    # A new period is sampled; from here on df_tmp holds the sampled
    # procedure's rows inside that period s.
    df_tmp = df[df.meta_label == x.meta_label]
    s, df_tmp = sample_time_period(df_tmp)

    q = f"What was the duration of the first {x.meta_label} procedure {s}?"
    a = f"{df_tmp.iloc[0].meta_duration:.2f} hours"
    available_qa.append((q, a))

    q = f"What was the duration of the last {x.meta_label} procedure {s}?"
    a = f"{df_tmp.iloc[-1].meta_duration:.2f} hours"
    available_qa.append((q, a))

    # when was the {time_select} {event} during {time_period}?
    q = f"When was the first {x.meta_label} procedure {s}?"
    a = f"At the {df_tmp.iloc[0].timestamp:.2f} hour"
    available_qa.append((q, a))
    q = f"When was the last {x.meta_label} procedure {s}?"
    a = f"At the {df_tmp.iloc[-1].timestamp:.2f} hour"
    available_qa.append((q, a))

    # count the number of times the patient had {event} during {time_period}?
    q = f"How many times did the patient undergo the {x.meta_label} procedure {s}?"
    a = f"{len(df_tmp)} times" if len(df_tmp) > 1 else "1 time"
    available_qa.append((q, a))

    # was the patient having {event} during {time_period}?
    # A fresh period is sampled over ALL procedures and may be empty, so the
    # answer can legitimately be "No".
    s, df_tmp = sample_time_period(df, enforce_non_empty=False)
    q = f"Did the patient undergo any {x.meta_label} procedure {s}?"
    if x.meta_label in df_tmp.meta_label.unique():
        a = "Yes"
    else:
        a = "No"
    available_qa.append((q, a))

    if return_one:
        return random.choice(available_qa)
    else:
        return available_qa

In [None]:
generate_qa_procedureevents(hadm_id, return_one=False)

In [None]:
# Registry of the generators the sampling loop below draws from, keyed by
# event type (the key is also written out as "event_type").
# Note: generate_qa_chartevents, generate_qa_inputevents and
# generate_qa_outputevents are defined above but are not registered here.
events_selected = {
    "patient_demographics": generate_qa_patient_demographics,
    "admission_info": generate_qa_admission_info,
    "diagnoses_icd": generate_qa_diagnoses_icd,
    "labevents": generate_qa_labevents,
    "procedureevents": generate_qa_procedureevents,
    "microbiologyevents": generate_qa_microbiologyevents,
    "prescriptions": generate_qa_prescriptions,
    "transfers": generate_qa_transfers,
}

In [None]:
len(hadm_ids)

In [None]:
350000 // 59513

In [None]:
from tqdm import tqdm


# Generate 6 template QA pairs per admission. Each pair comes from a randomly
# chosen event type; an event type with no file for the admission is dropped
# from that draw's candidates and another one is tried.
qa = []
event_types = list(events_selected.keys())

# Silence warnings (e.g. the sample_time_period fallback warning) only while
# the loop runs; catch_warnings restores the previous filter state on exit
# instead of overwriting it with 'default'.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for hadm_id in tqdm(hadm_ids):
        for _ in range(6):
            # Sampling without replacement guarantees termination even when
            # an admission has no file for any of the registered event types
            # (the previous `while True` would then spin forever).
            candidates = list(event_types)
            while candidates:
                event_type = random.choice(candidates)
                event_f = events_selected[event_type]
                try:
                    qa.append((hadm_id, *event_f(hadm_id), event_type))
                    break
                except FileNotFoundError:
                    # No event file of this type for this admission.
                    candidates.remove(event_type)

print(f"Processed {len(qa)} responses")

In [None]:
import json


# Persist the generated QA pairs as JSON Lines: one object per line with the
# keys "hadm_id", "q", "a" and "event_type". The file is overwritten on each run.
with open(os.path.join(output_path, "qa_event_template.jsonl"), "w") as file:
    for hadm_id, q, a, e in qa:
        # Convert the dictionary to a JSON string and write it to the file
        json_string = json.dumps({"hadm_id": hadm_id, "q": q, "a": a, "event_type": e})
        file.write(json_string + '\n')