In [2]:
from langsmith import Client
import os
client = Client()
project_name=""
def _set_if_undefined(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Please provide your {var}")
_set_if_undefined("OPENAI_API_KEY")
_set_if_undefined("LANGCHAIN_API_KEY")
_set_if_undefined("LANGCHAIN_API_YI_KEY")
os.environ["LANGCHAIN_API_KEY"] = os.environ["LANGCHAIN_API_YI_KEY"]

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "xmode-vqa-gpt_4o-english-100-with-intent"
project_name=os.environ["LANGCHAIN_PROJECT"]
project_runs = list(client.list_runs(project_name=project_name, is_root=True))

In [3]:
# Filter out only successful runs
success_project_runs = [run for run in project_runs if run.status == "success"]

In [4]:
len(success_project_runs)

100

In [None]:
success_project_runs[0].__dict__.keys()

dict_keys(['id', 'name', 'start_time', 'run_type', 'end_time', 'extra', 'error', 'serialized', 'events', 'inputs', 'outputs', 'reference_example_id', 'parent_run_id', 'tags', 'session_id', 'child_run_ids', 'child_runs', 'feedback_stats', 'app_path', 'manifest_id', 'status', 'prompt_tokens', 'completion_tokens', 'total_tokens', 'first_token_time', 'total_cost', 'prompt_cost', 'completion_cost', 'parent_run_ids', 'trace_id', 'dotted_order', 'in_dataset'])

datetime.timedelta(seconds=14, microseconds=442852)

In [9]:
r = [str(r_id)  for r_id in success_project_runs[0].child_run_ids]

In [92]:
from decimal import Decimal
from uuid import UUID
from datetime import datetime
from tqdm import tqdm as tqdm
from time import sleep
import json
import re
def get_step(text):
    pattern_1 = r'\n(\d+)\. '
    pattern_2 = r'^(\d+)\. '
    # join 2 patterns
    pattern = f'({pattern_1}|{pattern_2})'
    # get all numbers
    numbers = re.findall(pattern, text)
    numbers = [n[1] if len(n[1])>0 else n[2] for n in numbers ]
    res = []
    for n in numbers:
        try:
            int_n = int(n)
            res.append(int_n)
        except:
            continue
    return res

def handle_value(value):
    if isinstance(value, UUID):
        return str(value)
    if isinstance(value, Decimal):
        return float(value)
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, dict):
        return {key: handle_value(val) for key, val in value.items()}
    if isinstance(value, list):
        return [handle_value(val) for val in value]
    return value

def _extract_run(run):
    keys = run.__dict__.keys()
    temp =  {key: handle_value(run.__dict__[key]) for key in keys}
    return temp

def extract_plans(run):
    child_runs = [str(r_id)  for r_id in run.child_run_ids]
    c_runs = list(client.list_runs(project_name=project_name, run_ids=child_runs))
    plan_runs = []
    plans = []
    steps = []
    for c_run in c_runs:
        if c_run.name == "M3LXPlanParser":
            plan = _extract_run(c_run)
            plan_runs.append(plan)
            plan_str = plan["inputs"]["input"]["content"]
            plans.append(plan_str)
            if len(plan_str) > 0:
                steps = get_step(plan_str)
    max_step = max(steps)
    last_step = steps[-1]
    total_steps = len(steps)
    res = {
        "plans": plans,
        "plan_runs": plan_runs,
        "steps": steps,
        "max_step": max_step,
        "last_step": last_step,
        "total_steps": total_steps
    }
    return res

def extract_plan_from_runs(runs):
    runs = runs[::-1]
    res_plans = []
    for run in tqdm(runs):
        delta_time = (run.end_time - run.start_time).total_seconds()
        root_run = _extract_run(run)
        question = root_run["inputs"]["input"][0]["content"][0]["question"]
        res_run = extract_plans(run)
        res_run["question"] = question
        res_run["root_run"] = root_run
        res_run["latency_ms"] = delta_time
        res_plans.append(res_run)
    return res_plans

res_plans = extract_plan_from_runs(success_project_runs)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [04:56<00:00,  2.96s/it]


In [94]:
from pathlib import Path
import pandas as pd

def merge_to_eval(eval_file, res_plans, output_file = None):
    with open(eval_file, "r") as f:
        eval_res = json.load(f)
    for eval_r, res_plan in zip(eval_res, res_plans):
        assert eval_r["question"] == res_plan["question"]
        eval_r["plans"] = [p.replace("```python", "").replace("```", "") for p in res_plan["plans"]]
        eval_r["plan_runs"] = res_plan["plan_runs"]
        eval_r["steps"] = res_plan["steps"]
        eval_r["max_step"] = res_plan["max_step"]
        eval_r["last_step"] = res_plan["last_step"]
        eval_r["total_steps"] = res_plan["total_steps"]
        eval_r["latency_s"] = res_plan["latency_ms"]
    if output_file is None:
        path = Path(eval_file).parent
        name_no_ext = Path(eval_file).stem
        ext = Path(eval_file).suffix
        output_file = path / (name_no_ext + "_with_plans" + ext)
    with open(output_file, "w") as f:
        json.dump(eval_res, f, indent=2)
    # save it in csv and xlsx
    df = pd.DataFrame(eval_res)
    csv_file = output_file.with_suffix(".csv")
    xlsx_file = output_file.with_suffix(".xlsx")
    df.to_csv(csv_file, index=False)
    df.to_excel(xlsx_file, index=False)
    
merge_to_eval("experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-with-meta.json", res_plans)

In [None]:
_extract_run(success_project_runs[0])

'for patient 17281190, was bone lesion identified in the chest x-ray within 2 day after the diagnosis of benign prostatic hyperplasia with lower urinary tract symptoms since 11/2101?'

'.json'

In [15]:
c_runs

[Run(id=UUID('fb885977-a3cb-492e-8367-e799192787d4'), name='should_continue', start_time=datetime.datetime(2024, 12, 8, 11, 50, 1, 902121), run_type='chain', end_time=datetime.datetime(2024, 12, 8, 11, 50, 1, 903167), extra={'metadata': {'langgraph_step': 2, 'langgraph_node': 'join', 'langgraph_triggers': ['plan_and_schedule'], 'langgraph_task_idx': 0, 'thread_ts': '1efb55a8-62e6-6919-bffe-73d40f5bcad0', 'revision_id': '8f29e69-dirty'}, 'runtime': {'sdk': 'langsmith-py', 'sdk_version': '0.1.96', 'library': 'langsmith', 'platform': 'Linux-5.15.0-40-generic-x86_64-with-glibc2.35', 'runtime': 'python', 'py_implementation': 'CPython', 'runtime_version': '3.10.8', 'langchain_version': '0.2.12', 'langchain_core_version': '0.2.27', 'thread_count': 10.0, 'mem': {'rss': 224436224.0}, 'cpu': {'time': {'sys': 6.83, 'user': 50.09}, 'ctx_switches': {'voluntary': 326.0, 'involuntary': 18.0}, 'percent': 0.0}}}, error=None, serialized=None, events=[{'name': 'start', 'time': '2024-12-08T11:50:01.902121

In [None]:



def _extract_run(run):
    keys = run.__dict__.keys()
    temp =  {key: handle_value(run.__dict__[key]) for key in keys}
    return temp

def extract_run(run):
    temp = _extract_run(run)
    if isinstance(temp.get('child_run_ids', None), list) and len(temp['child_run_ids']) > 0:
        # print("****", temp['child_run_ids'])
        c_runs_ids = [handle_value(c_id) for c_id in temp['child_run_ids']]
        max_retry = 3
        retry = 0
        try:
            c_runs = list(client.list_runs(project_name=project_name, run_ids=c_runs_ids))
            sleep(1)
        except:
            while len(c_runs) == 0 and retry < max_retry:
                retry += 1
                try:
                    c_runs = list(client.list_runs(project_name=project_name, run_ids=c_runs_ids))
                    sleep(10)
                except:
                    print("retrying...")
                
        # print("add child:", [r.id for r in c_runs])
        res = {
            **temp,
            'child_runs': [_extract_run(_run) for _run in c_runs]
        }
        if not any(res['child_runs']):
            res['child_runs'] = None
        return res
    if temp.get('type','ChatGenerationChunk') == 'ChatGenerationChunk':
        return
    return temp

In [11]:
import json
from time import sleep
from tqdm import tqdm as tqdm
for run in tqdm(success_project_runs[::-1], total=len(success_project_runs)):
    run = _extract_run(run)
    with open("experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract_all.json", "r+") as f:
        # load existing data
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            data = []
        # update data
        data.append(run)
        # save data
        f.seek(0)
        json.dump(data, f, indent=2)
        f.truncate()
    sleep(1)

100%|██████████| 100/100 [2:18:18<00:00, 82.98s/it]  


In [None]:
res = []
for run in success_project_runs:
    res.append(_extract_run(run))

In [None]:
import json
with open("experiments/xmode/en/metadata_xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent.json", "w") as f:
    json.dump(res[::-1], f, indent=2) # reverse the order to match the order of the questions in the dataset

In [38]:
with open("experiments/xmode/en/metadata_xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent.json", "r") as f:
    res = json.load(f)


In [41]:
data_path = "/home/ubuntu/workspace/XMODE-LLMCompiler/experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent.json"
with open(data_path, "r") as f:
    data = json.load(f)

for d,r in zip(data,res):
    assert d["question"]==r["question"], print(d["question"],r["question"])
    d["prompt_tokens"] = r["prompt_tokens"]
    d["completion_tokens"] = r["completion_tokens"]
    d["total_tokens"] = r["total_tokens"]
    d["prompt_cost"] = r["prompt_cost"]
    d["completion_cost"] = r["completion_cost"]
    d["total_cost"] = r["total_cost"]

merged_data_path = "/home/ubuntu/workspace/XMODE-LLMCompiler/experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-with-meta.json"
with open(merged_data_path, "w") as f:
    json.dump(data, f, indent=2)

import pandas as pd
df = pd.DataFrame(data)
df.to_csv("/home/ubuntu/workspace/XMODE-LLMCompiler/experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-with-meta.csv", index=False)


In [42]:
df.to_excel("/home/ubuntu/workspace/XMODE-LLMCompiler/experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-with-meta.xlsx", index=False)

"given the last study of patient 10284038 in 2105, is the cardiac silhouette's width larger than half of the total thorax width?"