In [1]:
import json
import re
from pathlib import Path




In [2]:
def extract_child_run(data, name):
    """Return the first dict nested in ``data`` whose ``"name"`` equals ``name``.

    The search is depth-first and pre-order: a dict is tested before its
    values, and values / list items are visited in iteration order.

    Args:
        data: Arbitrarily nested structure of dicts and lists. Any other
            type is treated as a leaf and never matches.
        name: Value to compare against each dict's ``"name"`` entry.

    Returns:
        The matching dict itself (not a copy), or ``None`` when nothing matches.
    """
    if isinstance(data, dict):
        if "name" in data and data["name"] == name:
            return data
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Leaves (strings, numbers, None, ...) cannot contain a run.
        return None

    for child in children:
        found = extract_child_run(child, name)
        if found:
            return found
    return None

def extract_child_run_by_id(data, name):
    """Return the first dict nested in ``data`` whose ``"id"`` equals ``name``.

    Despite its name, the ``name`` parameter holds the id being searched for.
    Traversal is depth-first and pre-order over dict values and list items.

    Args:
        data: Arbitrarily nested structure of dicts and lists.
        name: Id value to compare against each dict's ``"id"`` entry.

    Returns:
        The matching dict itself (not a copy), or ``None`` when nothing matches.
    """
    if isinstance(data, dict):
        if "id" in data and data["id"] == name:
            return data
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    # Lazily search each child in order and stop at the first truthy hit.
    hits = (extract_child_run_by_id(child, name) for child in children)
    return next((hit for hit in hits if hit), None)

def extract_child_runs(data, name):
    """Collect every dict nested in ``data`` whose ``"name"`` equals ``name``.

    Unlike ``extract_child_run`` the search does not stop at the first hit:
    a matching dict is recorded and its values are still searched, so nested
    matches are returned as well. Results are in depth-first, pre-order.

    Args:
        data: Arbitrarily nested structure of dicts and lists.
        name: Value to compare against each dict's ``"name"`` entry.

    Returns:
        A list of the matching dict objects (not copies); empty if none match.
    """
    def _walk(node):
        if isinstance(node, dict):
            if "name" in node and node["name"] == name:
                yield node
            for value in node.values():
                yield from _walk(value)
        elif isinstance(node, list):
            for item in node:
                yield from _walk(item)

    return list(_walk(data))

    
def get_SQL_query(run):
    """Return the parsed function-call arguments of the output-parser run.

    Looks up the first run named ``"PydanticAttrOutputFunctionsParser"``
    nested under ``run`` and parses the string stored at
    ``inputs.input.additional_kwargs.function_call.arguments``.

    Args:
        run: A (nested) run structure of dicts and lists.

    Returns:
        The parsed arguments (presumably a dict holding the generated SQL —
        TODO confirm against the trace schema), or ``None`` when no parser
        run is found.

    Raises:
        KeyError: If the parser run lacks the expected nested keys.
        ValueError / SyntaxError: If the arguments string is neither valid
            JSON nor a Python literal.
    """
    import ast

    res = extract_child_run(run, "PydanticAttrOutputFunctionsParser")
    if not res:
        return None

    arguments = res['inputs']['input']['additional_kwargs']['function_call']['arguments']
    # SECURITY: this used to be ``eval(arguments)``. The string comes from
    # recorded model output, so evaluating it could run arbitrary code, and
    # it also failed on JSON-only tokens (true / false / null). Parse it as
    # data instead: JSON first, then a plain Python literal as a fallback
    # for payloads stored as a dict repr.
    try:
        return json.loads(arguments)
    except (TypeError, ValueError):
        return ast.literal_eval(arguments)


In [3]:
def get_run_by_id(run_id, data):
    """Return the first dict nested in ``data`` whose ``"id"`` equals ``run_id``.

    This is ``extract_child_run_by_id`` with the argument order swapped: the
    same depth-first, pre-order search over dict values and list items.

    Args:
        run_id: Id value to look for.
        data: Arbitrarily nested structure of dicts and lists.

    Returns:
        The matching dict itself, or ``None`` when no dict carries that id.
    """
    return extract_child_run_by_id(data, run_id)

def get_parent_run_ids(run):
    """Return a new list with the entries of ``run['parent_run_ids']``.

    Args:
        run: Mapping that must contain a ``"parent_run_ids"`` key.

    Returns:
        A fresh list of the ids, or ``None`` when the stored value is
        empty or otherwise falsy.

    Raises:
        KeyError: If ``run`` has no ``"parent_run_ids"`` key.
    """
    ids = run['parent_run_ids']
    return list(ids) if ids else None

def get_plan_and_schedule_run(xmodeplanparser_run, data):
    """Look up an ancestor of a plan-parser run inside ``data``.

    The ancestor is the run whose id is the second-to-last entry of the
    parser run's ``parent_run_ids``.
    NOTE(review): presumably that entry is the enclosing plan-and-schedule
    step — confirm the ordering of ``parent_run_ids`` in the trace export.

    Args:
        xmodeplanparser_run: Run dict with a ``"parent_run_ids"`` list of at
            least two entries.
        data: Full nested run structure to search.

    Returns:
        The matching run dict, or ``None`` if no run in ``data`` has that id.
    """
    ancestor_ids = get_parent_run_ids(xmodeplanparser_run)
    target_id = ancestor_ids[-2]
    return get_run_by_id(target_id, data)

In [4]:
def get_sql_details(schedule_tasks_runs):
    """Collect parsed details for every ``"text2SQL"`` run that has outputs.

    For each text2SQL run found under ``schedule_tasks_runs`` that has a
    truthy ``"outputs"`` entry and for which ``get_SQL_query`` returns
    something, the parsed query dict is extended with:

    * ``["output"]["results"]`` - the run's ``outputs["output"]`` value
    * ``["input"]`` - the run's ``inputs["input"]``, parsed into an object

    Args:
        schedule_tasks_runs: A single run dict, a list of run dicts, or
            ``None`` (treated as "no runs").

    Returns:
        A list of detail dicts, one per qualifying text2SQL run.
    """
    import ast

    def _parse_tool_input(raw):
        # SECURITY: this used to be ``eval(raw)`` on text read from trace
        # files. ``ast.literal_eval`` accepts the same literal reprs without
        # executing code. Values that are already parsed pass through.
        return ast.literal_eval(raw) if isinstance(raw, str) else raw

    if not schedule_tasks_runs:
        # Matches get_image_analysis_details, which also yields [] for None.
        return []
    if isinstance(schedule_tasks_runs, dict):
        schedule_tasks_runs = [schedule_tasks_runs]

    text2SQL_runs = []
    for schedule_tasks_run in schedule_tasks_runs:
        for text2SQL_run in extract_child_runs(schedule_tasks_run, "text2SQL"):
            if not text2SQL_run.get("outputs", False):
                continue
            detail_output = get_SQL_query(text2SQL_run)
            if not detail_output:
                continue
            detail_output["output"]["results"] = text2SQL_run["outputs"]["output"]
            detail_output["input"] = _parse_tool_input(text2SQL_run["inputs"]['input'])
            text2SQL_runs.append(detail_output)
    return text2SQL_runs

def get_image_analysis_details(data):
    """Collect outputs (or errors) of ``"image_analysis"`` runs.

    Every ``"schedule_tasks"`` run nested in ``data`` is scanned for
    ``"image_analysis"`` runs. Runs with truthy ``"outputs"`` contribute a
    copy of those outputs plus an ``"input"`` entry holding the parsed
    ``inputs["input"]``. If nothing has been collected by the end of a
    schedule_tasks run, that run's errored image_analysis runs are recorded
    instead as ``{"error", "input", "output"}`` dicts.

    Args:
        data: A run dict or a nested structure of dicts and lists.

    Returns:
        A list of detail dicts; empty when no image_analysis run qualifies.
    """
    import ast

    def _parse_tool_input(raw):
        # SECURITY: this used to be ``eval(raw)`` on text read from trace
        # files. ``ast.literal_eval`` accepts the same literal reprs without
        # executing code. Values that are already parsed pass through.
        return ast.literal_eval(raw) if isinstance(raw, str) else raw

    if isinstance(data, dict):
        data = [data]

    image_analysis_runs = []
    for schedule_tasks_run in extract_child_runs(data, "schedule_tasks"):
        candidate_runs = extract_child_runs(schedule_tasks_run, "image_analysis")

        for candidate in candidate_runs:
            if candidate.get("outputs", False):
                # Copy so that adding "input" does not modify the caller's
                # trace data (the original wrote into run["outputs"]).
                detail_output = dict(candidate["outputs"])
                detail_output["input"] = _parse_tool_input(candidate["inputs"]['input'])
                image_analysis_runs.append(detail_output)

        # Fall back to error records only while no successful output has
        # been collected so far (across all schedule_tasks runs seen).
        if len(image_analysis_runs) == 0:
            for candidate in candidate_runs:
                if candidate.get("error", False):
                    image_analysis_runs.append({
                        "error": candidate["error"],
                        "input": _parse_tool_input(candidate["inputs"]['input']),
                        "output": candidate.get("outputs", None),
                    })

    return image_analysis_runs
    

In [5]:
# extract the functions from M3LXPlanParser['inputs']['input']['content'] with regex


def extract_plan_from_M3LXPlanParser(child_run):
    """Parse the numbered tool calls out of a plan-parser run's input text.

    The text at ``child_run['inputs']['input']['content']`` is scanned line
    by line for ``<id> ... <tool>(...)`` where ``<tool>`` is one of
    ``intent_tables``, ``text2SQL``, ``image_analysis`` or ``join``.

    Args:
        child_run: Run dict holding the plan text at the path above.

    Returns:
        Dict keyed by the step id (as a string). Each value has
        ``"function"`` (the raw call text); ``text2SQL`` / ``intent_tables``
        steps add ``"problem"`` and ``"context"``; ``image_analysis`` steps
        add ``"question"`` and ``"context"``. A call whose arguments cannot
        be parsed is reported with ``print`` and left out. When several calls
        share an id, the one found last wins.

    NOTE(review): the ``context`` capture group is greedy, so a closing quote
    or bracket right before the final ``)`` stays in the captured value
    (``context="x"`` yields ``x"``). Left unchanged here; confirm whether
    downstream consumers rely on it.
    """
    content = child_run['inputs']['input']['content']

    # Same scan order as before: intent_tables, text2SQL, image_analysis, join.
    calls = []
    for tool in ("intent_tables", "text2SQL", "image_analysis", "join"):
        calls += re.findall(r"(\d+).+(" + tool + r"\(.*\))", content)

    # (call prefix, argument regex, key for the first argument, failure label)
    arg_parsers = (
        (
            "text2SQL(",
            r'text2SQL\(problem=["\']?(.+?)["\']?, context=["\']?(\$?.+)["\']?\)',
            "problem",
            "error text2SQL",
        ),
        (
            "intent_tables(",
            r'intent_tables\(problem=["\']?(.+?)["\']?, context=["\']?(\$?.+)["\']?\)',
            "problem",
            "error intent_table",
        ),
        (
            "image_analysis(",
            r'image_analysis\(question=["\']?(.+?)["\']?, context=["\']?\[?(\$?.+)\]?["\']?\)',
            "question",
            "error image_analysis",
        ),
    )

    _result = {}
    for step_id, call in calls:
        for prefix, pattern, first_key, error_label in arg_parsers:
            # Dispatch on how the call *starts*: a substring test would send
            # e.g. an intent_tables call that mentions "text2SQL" in its
            # problem text to the wrong parser and drop the step.
            if not call.startswith(prefix):
                continue
            try:
                # findall(...)[0] raises IndexError when nothing matched.
                first_arg, context = re.findall(pattern, call)[0]
            except IndexError:
                print(call, error_label)
            else:
                _result[str(step_id)] = {
                    "function": call,
                    first_key: first_arg,
                    "context": context,
                }
            break
        else:
            # Calls without parsed arguments (currently only join).
            _result[str(step_id)] = {"function": call}
    return _result






In [6]:
def extract_plan_and_details(data):
    """Build a per-plan summary of a traced run, with tool outputs attached.

    For every ``"M3LXPlanParser"`` run nested in ``data`` the plan steps are
    parsed, and each ``text2SQL`` / ``image_analysis`` step is paired with
    the recorded tool run whose input problem / question text is identical.

    Args:
        data: Root run dict. The question is read from
            ``data["inputs"]["input"][0]["content"][0]["question"]``.

    Returns:
        ``{"plans": [...], "question": <question>}``. Each plan entry holds
        ``"plan"`` (steps sorted by numeric id, each given an integer
        ``"id"`` and, when matched, an ``"outputs"`` dict) plus the flags
        ``"missing_image_analysis"`` / ``"missing_text2SQL"`` set to ``True``
        when no step of that kind was matched.
    """
    question = data["inputs"]["input"][0]['content'][0]["question"]
    res = {"plans": []}

    for child_run in extract_child_runs(data, "M3LXPlanParser"):
        plan_and_schedule_run = get_plan_and_schedule_run(child_run, data)
        plan = extract_plan_from_M3LXPlanParser(child_run)
        text2SQL_runs = get_sql_details(plan_and_schedule_run)
        image_analysis_runs = get_image_analysis_details(plan_and_schedule_run)

        check_text2SQL = 0
        check_image_analysis = 0
        for p in plan.values():
            if "problem" in p and 'text2SQL(' in p['function']:
                for text2SQL_run in text2SQL_runs:
                    if p["problem"] == text2SQL_run["input"]["problem"]:
                        # Shallow copy; if several runs match, the last wins.
                        p["outputs"] = text2SQL_run.copy()
                        check_text2SQL += 1
            # Guard on "question": an intent_tables / join entry whose text
            # merely mentions "image_analysis(" has no such key and used to
            # raise KeyError here.
            elif "question" in p and 'image_analysis(' in p['function']:
                for image_analysis_run in image_analysis_runs:
                    if p["question"] == image_analysis_run["input"]["question"]:
                        p["outputs"] = image_analysis_run.copy()
                        check_image_analysis += 1

        # Order steps by numeric id. Sorting the original keys (instead of
        # converting to int and back to str) keeps zero-padded ids such as
        # "01" addressable.
        result = {"plan": []}
        for key in sorted(plan, key=int):
            plan[key]["id"] = int(key)
            result["plan"].append(plan[key])

        if check_image_analysis == 0:
            result["missing_image_analysis"] = True
        if check_text2SQL == 0:
            result["missing_text2SQL"] = True
        res["plans"].append(result)

    res["question"] = question
    return res



In [10]:
from tqdm import tqdm

# Input folder with the run exports 0.json ... 99.json, and the output
# folder that receives one reduced JSON file per run.
test_run_folder = "experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details"
reduced_run_folder = "experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details-reduced"
Path(reduced_run_folder).mkdir(parents=True, exist_ok=True)

all_plans = []
for i in tqdm(range(100)):
    test_run_path = Path(test_run_folder) / f"{i}.json"
    # JSON is UTF-8; state it explicitly so decoding does not depend on the
    # platform's default locale encoding.
    with open(test_run_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    _result = extract_plan_and_details(data)
    reduced_run_path = Path(reduced_run_folder) / f"{i}.json"
    with open(reduced_run_path, "w", encoding="utf-8") as f:
        json.dump(_result, f, indent=2)
    all_plans.append(_result)

# Combined dump of all reduced runs, written next to the raw exports.
output_path = Path(test_run_folder) / "xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details-plans.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(all_plans, f, indent=2)

100%|██████████| 100/100 [00:06<00:00, 16.05it/s]


In [101]:
import pandas as pd

# One row per processed run, with the columns taken from the result dicts'
# keys; the same table is written as CSV and as an Excel workbook.
df = pd.DataFrame(all_plans)

csv_output_path = Path(test_run_folder).joinpath(
    "xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details-plans.csv"
)
xls_output_path = Path(test_run_folder).joinpath(
    "xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details-plans.xlsx"
)

df.to_csv(csv_output_path, index=False)
df.to_excel(xls_output_path, index=False)

In [102]:
# Re-run the extraction for a single run (index 31) for manual inspection.
i = 31
test_run_folder = "experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details"
test_run_path = Path(test_run_folder) / f"{i}.json"
# JSON is UTF-8; state it explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(test_run_path, "r", encoding="utf-8") as f:
    data31 = json.load(f)
result31 = extract_plan_and_details(data31)



In [103]:
# Display the extracted plan summary for run 31 as the cell output.
result31

{'plans': [{'plan': [{'function': 'intent_tables(problem="Retrieve the image_id for study 55949143 and the image_id for the last study for comparison.", context="\\nCREATE TABLE \\"TB_CXR\\" (\\n\\trow_id INTEGER, \\n\\tsubject_id INTEGER, \\n\\thadm_id REAL, \\n\\tstudy_id INTEGER, \\n\\timage_id TEXT, \\n\\tviewposition TEXT, \\n\\tstudydatetime TEXT, \\n\\tFOREIGN KEY(subject_id) REFERENCES \\"ADMISSIONS\\" (subject_id)\\n)\\n\\n/*\\n5 rows from TB_CXR table:\\nrow_id\\tsubject_id\\thadm_id\\tstudy_id\\timage_id\\tviewposition\\tstudydatetime\\n0\\t10020740\\tNone\\t52268471\\t46ee8707-81386f73-3ff804c3-92e6044b-dce8d6b2\\tap\\t2104-03-22 16:03:20\\n1\\t10020740\\tNone\\t55522869\\t27776756-1d9ef4fc-cd8dd0ca-1453072f-12c0f484\\tap\\t2104-03-23 05:19:57\\n2\\t10020740\\tNone\\t58116104\\td3dbb519-1ea6cf3c-bb4c1fd8-79bb117a-1dc3869f\\tap\\t2104-03-23 11:25:11\\n3\\t10020740\\tNone\\t59646202\\t7b7c6181-fcc7d50e-b65598fc-03cc32e9-4e291b92\\tap\\t2104-03-23 11:57:55\\n4\\t10020740\\tNon

In [104]:
reduced_run_folder = "experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details-reduced"
# Permission bits must be octal: decimal 777 is 0o1411, not rwxrwxrwx.
# The mode applies only when the leaf directory is newly created and is
# still masked by the process umask.
Path(reduced_run_folder).mkdir(mode=0o777, parents=True, exist_ok=True)