In [1]:
from langsmith import Client
import os
from langsmith_utils import extract_and_save_all_child_runs_by_project
# LangSmith client handle for this notebook.
# NOTE(review): this is created before the LANGCHAIN_API_KEY prompt further
# down in this cell — confirm Client() does not need the key at construction.
client = Client()

def _set_if_undefined(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Please provide your {var}")
# Ask for each API key that is not already present in the environment,
# then set the LANGCHAIN_TRACING_V2 flag.
for required_key in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY"):
    _set_if_undefined(required_key)
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [2]:
from langsmith_utils import extract_plan_and_details
from pathlib import Path
import json
from tqdm import tqdm
def save_reduced_details(project_name, data_path="experiments/xmode/en"):
    """Reduce every numbered run file of a project and save the results.

    Reads ``<data_path>/<project_name>-details/<n>.json``, passes each loaded
    document through ``extract_plan_and_details`` and writes the result to
    ``<data_path>/<project_name>-details-reduced/<n>.json``. All results are
    also collected, in ascending ``n`` order, into
    ``<data_path>/<project_name>-details-reduced.json``.

    Files whose stem is not an integer are reported and skipped, as is any
    file whose processing raises ``ValueError`` (malformed JSON included,
    since ``json.JSONDecodeError`` is a ``ValueError``).

    Args:
        project_name: Prefix of the ``-details`` folder to read.
        data_path: Directory that contains the project folders.

    Raises:
        AssertionError: If the ``-details`` folder does not exist.
    """
    data_path = Path(data_path)
    test_run_folder = data_path / f"{project_name}-details"
    assert test_run_folder.exists(), f"{test_run_folder} does not exist"

    reduced_run_folder = data_path / f"{project_name}-details-reduced"
    reduced_run_folder.mkdir(parents=True, exist_ok=True)

    # glob() yields entries in arbitrary, filesystem-dependent order. Sort by
    # the numeric stem so the aggregated file is reproducible across runs.
    numbered_files = []
    for test_run_file in test_run_folder.glob("*.json"):
        try:
            numbered_files.append((int(test_run_file.stem), test_run_file))
        except ValueError:
            # `.name` already includes the suffix; joining stem + "." + suffix
            # printed a doubled dot ("foo..json").
            print(f"Skipping {test_run_file.name}")
    numbered_files.sort(key=lambda item: (item[0], item[1].name))

    all_plans = []
    for i, test_run_file in tqdm(numbered_files):
        try:
            with open(test_run_file, "r") as f:
                data = json.load(f)
            _result = extract_plan_and_details(data)
            with open(reduced_run_folder / f"{i}.json", "w") as f:
                json.dump(_result, f, indent=2)
        except ValueError:
            print(f"Skipping {test_run_file.name}")
            continue
        all_plans.append(_result)

    # The aggregate sits next to the per-file folder, not inside it.
    output_path = data_path / f"{project_name}-details-reduced.json"
    with open(output_path, "w") as f:
        json.dump(all_plans, f, indent=2)
    print(f"Saved to {output_path}")


In [3]:
# Select the 13-sample project and expose it through LANGCHAIN_PROJECT.
project_name = "xmode-vqa-gpt_4o-english-13"
os.environ["LANGCHAIN_PROJECT"] = project_name

In [4]:
# The raw child-run export is left commented out; only the reduction of the
# already-saved detail files runs here.
# extract_and_save_all_child_runs_by_project(project_name)
save_reduced_details(project_name)

  0%|          | 0/13 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:00<00:00, 40.22it/s]

Saved to experiments/xmode/en/xmode-vqa-gpt_4o-english-13-details-reduced.json





In [5]:
# Same steps for the 52-sample project: select it, then reduce its saved
# detail files (the raw export call stays commented out).
project_name = "xmode-vqa-gpt_4o-english-52"
os.environ["LANGCHAIN_PROJECT"] = project_name
# extract_and_save_all_child_runs_by_project(project_name)
save_reduced_details(project_name)

100%|██████████| 52/52 [00:01<00:00, 42.37it/s]

Saved to experiments/xmode/en/xmode-vqa-gpt_4o-english-52-details-reduced.json





In [6]:
# Merge the newly reduced results into the existing evaluation data.
import pandas as pd
import json
from pathlib import Path

data_path = Path("experiments/xmode/en")
json_file_1 = data_path / "xmode-vqa-gpt_4o-english-13-details-reduced.json"
json_file_2 = data_path / "xmode-vqa-gpt_4o-english-52-details-reduced.json"

# Load both aggregated result files; the 13-sample records come first.
data_1 = json.loads(json_file_1.read_text())
data_2 = json.loads(json_file_2.read_text())
data = data_1 + data_2

# Spreadsheet that the later cells update with these records.
source_xlxs = "eval_ehr_100_samples.xlsx"
df = pd.read_excel(source_xlxs)

In [17]:
# Spot check: question stored at positional row 60 of `df`
# (compare with data[0]["question"] in the next cell).
df.iloc[60]["question"]

'was patient 12724975 diagnosed with hypoxemia until 1 year ago and did a chest x-ray reveal any tubes/lines in the abdomen during the same period?'

In [18]:
# Spot check: question of the first merged record.
data[0]["question"]

'was patient 12724975 diagnosed with hypoxemia until 1 year ago and did a chest x-ray reveal any tubes/lines in the abdomen during the same period?'

In [19]:
# Spot check: prediction currently stored in `df` at positional row 60.
df.iloc[60]["prediction"]

"[{'Summary': 'Patient 12724975 was diagnosed with hypoxemia until about two years ago and no tubes/lines were detected in the abdomen during chest x-rays in the last year.', 'details': 'The patient had a diagnosis of hypoxemia recorded on 2103-12-27. Chest x-ray analysis from the last year (2104-12-31 to 2105-12-31) revealed no tubes or lines in the abdomen.', 'source': 'Diagnosis records and chest x-ray image analysis.', 'inference': 'no', 'extra explanation': 'The diagnosis occurred two years ago, not within the last year. No tubes or lines were observed in the x-rays.'}]"

In [7]:
# Turn the merged records into a frame and rename its columns to the names
# used when `df` is updated in the next cell.
json_df = pd.DataFrame(data).rename(
    columns={"plans": "Plans (updated with SQL)", "predictions": "prediction"}
)
json_df.columns


Index(['Plans (updated with SQL)', 'prediction', 'question'], dtype='object')

In [8]:
# Copy the stringified plan and prediction of every record into the rows of
# `df` that carry the same question text, then save the updated sheet.
for _, result_row in json_df.iterrows():
    same_question = df["question"] == result_row["question"]
    for column in ("Plans (updated with SQL)", "prediction"):
        df.loc[same_question, column] = str(result_row[column])

df.to_excel("eval_ehr_100_samples_updated.xlsx", index=False)


In [9]:
# Tag every merged record with the id of its question in `df`, then write one
# JSON file per record.

# Build the question -> id lookup once instead of re-scanning `df` per row.
# drop_duplicates keeps the first occurrence, matching the former `.values[0]`.
question_to_id = df.drop_duplicates(subset="question").set_index("question")["id"]

# Fail with the offending questions instead of a bare IndexError.
unmatched_questions = [
    q for q in json_df["question"] if q not in question_to_id.index
]
if unmatched_questions:
    raise ValueError(
        f"No id found in df for {len(unmatched_questions)} question(s): "
        f"{unmatched_questions}"
    )

json_df["id"] = json_df["question"].map(question_to_id).astype(int)

# convert the json_df to a list of dictionaries
json_data = json_df.to_dict(orient="records")

# save the json_data to json files under the data_path
# NOTE(review): the folder name mentions only the 13-sample project although
# `data` was built from both the 13- and 52-sample files — confirm intended.
data_path = Path("experiments/xmode/en")
json_folder = data_path / "xmode-vqa-gpt_4o-english-13-details-reduced-tagged"
json_folder.mkdir(parents=True, exist_ok=True)
for d in json_data:
    # Store a plain Python int so json.dump never receives a NumPy integer.
    i = int(d["id"])
    d["id"] = i
    with open(json_folder / f"{i}.json", "w") as f:
        json.dump(d, f, indent=2)

In [55]:
# Spot check the tagged id of the first record. The records are built with an
# "id" column; there is no "question_id" key, so the old lookup raised KeyError.
json_data[0]["id"]

0

In [7]:
from pathlib import Path
import json
from tqdm import tqdm
import pandas as pd

# Tag every reduced evaluation file with the id of its question in the source
# dataset and write the result to a sibling "-tagged" folder.
# Note: this cell rebinds `data_path`, `data` and `df`, which earlier cells
# use for the experiment folder, the merged records and the Excel sheet.
data_path = Path("dataset/mimic_iv_cxr/sampled_test_with_scope_preprocessed_balenced_answer_100.json")
eval_folder = Path("experiments/xmode/en") / "xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details-reduced"

# Collect the evaluation files before anything is written.
assert eval_folder.exists(), f"{eval_folder} does not exist"
eval_files = list(eval_folder.glob("*.json"))
output_folder = eval_folder.with_name(eval_folder.name + "-tagged")
output_folder.mkdir(parents=True, exist_ok=True)

# Source dataset, used to look up ids by question text.
data = json.loads(data_path.read_text())
df = pd.DataFrame(data)

for eval_file in tqdm(eval_files):
    eval_data = json.loads(eval_file.read_text())
    # The first dataset row with the same question text provides the id.
    question = eval_data["question"]
    matching_ids = df.loc[df["question"] == question, "id"]
    question_id = matching_ids.values[0]
    # Store a plain int so the value is JSON serialisable.
    eval_data["id"] = int(question_id)
    output_file = output_folder / f"{question_id}.json"
    with output_file.open("w") as f:
        json.dump(eval_data, f, indent=2)

    

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:00<00:00, 269.33it/s]
