In [2]:
from langsmith import Client
import os
# LangSmith client; the cells below call client.list_runs(...) on it.
# NOTE(review): this client is constructed before LANGCHAIN_API_KEY is
# overwritten further down in this cell — confirm it ends up using the
# intended key.
client = Client()
# Placeholder value; reassigned from LANGCHAIN_PROJECT later in this cell.
project_name=""
def _set_if_undefined(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Please provide your {var}")
# Make sure every credential is present, prompting only for the missing ones.
for required_var in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY", "LANGCHAIN_API_YI_KEY"):
    _set_if_undefined(required_var)

# Overwrite LANGCHAIN_API_KEY with the value of LANGCHAIN_API_YI_KEY.
os.environ["LANGCHAIN_API_KEY"] = os.environ["LANGCHAIN_API_YI_KEY"]

# Tracing flag and the project whose runs are analysed in this notebook.
project_name = "xmode-vqa-gpt_4o-english-100-with-intent"
os.environ.update(
    LANGCHAIN_TRACING_V2="true",
    LANGCHAIN_PROJECT=project_name,
)

# Materialise all root runs of the project.
project_runs = list(client.list_runs(project_name=project_name, is_root=True))

In [3]:
# Keep only the root runs whose status is "success".
success_project_runs = list(
    filter(lambda candidate: candidate.status == "success", project_runs)
)

In [4]:
# Id of the first successful run (not referenced again in the cells shown here).
run_id = success_project_runs[0].id

In [5]:
# Show which attributes a run object carries (displayed as the cell output).
vars(success_project_runs[0]).keys()

dict_keys(['id', 'name', 'start_time', 'run_type', 'end_time', 'extra', 'error', 'serialized', 'events', 'inputs', 'outputs', 'reference_example_id', 'parent_run_id', 'tags', 'session_id', 'child_run_ids', 'child_runs', 'feedback_stats', 'app_path', 'manifest_id', 'status', 'prompt_tokens', 'completion_tokens', 'total_tokens', 'first_token_time', 'total_cost', 'prompt_cost', 'completion_cost', 'parent_run_ids', 'trace_id', 'dotted_order', 'in_dataset'])

In [None]:
from decimal import Decimal
from uuid import UUID
from datetime import datetime
from tqdm import tqdm as tqdm
from time import sleep
import json
import re

def handle_value(value):
    """Recursively convert ``value`` into plain built-in types.

    UUIDs become strings, Decimals become floats and datetimes become ISO
    formatted strings. Dicts and lists are rebuilt with their contents
    converted the same way; any other value is returned unchanged.
    """
    # Containers: rebuild with converted contents.
    if isinstance(value, dict):
        return {name: handle_value(item) for name, item in value.items()}
    if isinstance(value, list):
        return list(map(handle_value, value))

    # Scalars: first matching type decides the conversion.
    scalar_converters = (
        (UUID, str),
        (Decimal, float),
        (datetime, lambda moment: moment.isoformat()),
    )
    for scalar_type, convert in scalar_converters:
        if isinstance(value, scalar_type):
            return convert(value)
    return value

def _extract_run(run):
    """Return the attributes of ``run`` as a dict of converted values.

    Every entry of the object's ``__dict__`` is passed through
    ``handle_value``. A ``child_runs`` entry of ``None`` is replaced by an
    empty list so callers can append to it.
    """
    extracted = {name: handle_value(attr) for name, attr in vars(run).items()}
    if extracted['child_runs'] is None:
        extracted['child_runs'] = []
    return extracted

def get_all_child_runs(root_run, filter_str = None):
    """Fetch the runs listed in ``root_run``'s ``child_run_ids`` as dicts.

    Args:
        root_run: A run object, or a dict as produced by ``_extract_run``.
        filter_str: Optional filter expression forwarded to
            ``client.list_runs``.

    Returns:
        A list of dicts (one per fetched run, converted with
        ``_extract_run``); an empty list when the root run has no child ids.
    """
    if not isinstance(root_run, dict):
        root_run = _extract_run(root_run)

    child_run_ids = root_run.get('child_run_ids')
    if not child_run_ids:
        # Nothing to fetch. Without this guard the query below is issued with
        # no ids to narrow it.
        # NOTE(review): presumably that returns unrelated non-root runs of the
        # project — confirm against the LangSmith client.
        return []

    fetched_runs = client.list_runs(
        project_name=project_name,
        run_ids=child_run_ids,
        filter=filter_str,
        is_root=False,
    )
    return [_extract_run(run) for run in fetched_runs]
        
"""
filter_str = 'and(neq(name, "ChatPromptTemplate"), neq(name, "__start__"), neq(name, "RunnableBranch"), neq(name, "ChannelWrite<join,__root__>"), neq(name, "ChannelWrite<plan_and_schedule,__root__>"))'
filtered_child_runs = get_all_child_runs(root_run, filter_str)
"""
    

'\nfilter_str = \'and(neq(name, "ChatPromptTemplate"), neq(name, "__start__"), neq(name, "RunnableBranch"), neq(name, "ChannelWrite<join,__root__>"), neq(name, "ChannelWrite<plan_and_schedule,__root__>"))\'\nfiltered_child_runs = get_all_child_runs(root_run, filter_str)\n'

In [None]:
def child2parent_dict(root_run, child_runs):
    """Build a ``run id -> parent run id`` mapping for a root and its children.

    The root run (object or already-extracted dict) is entered first with a
    parent of ``None``; each child dict then contributes its own
    ``id -> parent_run_id`` entry in the order given.
    """
    root = root_run if isinstance(root_run, dict) else _extract_run(root_run)
    mapping = {root['id']: None}
    mapping.update((child['id'], child['parent_run_id']) for child in child_runs)
    return mapping

def get_paths(parent_ids):
    """Map every id in ``parent_ids`` to its list of ancestors, root first.

    The chain is followed through ``parent_ids`` until a parent of ``None``
    is reached. The id itself is not part of its own path, so an id whose
    parent is ``None`` maps to an empty list.
    """
    paths = {}
    for node_id in parent_ids:
        chain = []
        cursor = node_id
        while cursor is not None:
            chain.append(cursor)
            cursor = parent_ids[cursor]
        # chain runs node -> ... -> root; walk it backwards and stop before
        # index 0 so the node itself is left out.
        paths[node_id] = chain[:0:-1]
    return paths

def extract_child_runs_by_paths(root_run):
    """Fetch the descendants of ``root_run`` and nest them into a run tree.

    Args:
        root_run: A run object whose ``child_run_ids`` name its descendants.

    Returns:
        The extracted root run dict. Every run dict in the tree carries a
        ``child_runs`` list holding its direct children, ordered by
        ``start_time``.

    Raises:
        IndexError: If a child refers to a parent that was not fetched.
    """
    root_run = _extract_run(root_run)
    all_child_runs = get_all_child_runs(root_run)

    # Index the runs by id once instead of scanning the whole list for every
    # child. setdefault keeps the first run seen for an id, matching the
    # "first match" behaviour of the previous list scan.
    runs_by_id = {}
    for run in [root_run] + all_child_runs:
        runs_by_id.setdefault(run['id'], run)

    for run in all_child_runs:
        parent_id = run['parent_run_id']
        if parent_id is None:
            continue
        parent_run = runs_by_id.get(parent_id)
        if parent_run is None:
            # IndexError is kept for backward compatibility: the previous
            # list-based lookup failed with this type for a missing parent.
            raise IndexError(
                f"parent run {parent_id} of run {run['id']} was not fetched"
            )
        parent_run['child_runs'].append(run)

    # Sort the children of EVERY run. Previously only the parent of the last
    # processed run was sorted, and a root without children hit an unbound
    # ``parent_run``. fromisoformat also accepts the strings isoformat() emits
    # without microseconds or with a UTC offset, which the fixed strptime
    # pattern rejected.
    for run in runs_by_id.values():
        run['child_runs'].sort(
            key=lambda child: datetime.fromisoformat(child['start_time'])
        )
    return root_run

def extract_all_child_runs_by_paths(runs):
    """Build the nested run tree for each run in ``runs``.

    Progress is reported through ``tqdm``; the resulting trees are returned
    in the same order as the input.
    """
    return [extract_child_runs_by_paths(root) for root in tqdm(runs)]

# Build the run trees, visiting the successful runs in reverse list order.
res = extract_all_child_runs_by_paths(list(reversed(success_project_runs)))


100%|██████████| 100/100 [03:05<00:00,  1.85s/it]


In [19]:
# sort the child runs by start time recursively
def sort_child_runs(run):
    """Sort ``run['child_runs']`` by start time, recursively, in place.

    Args:
        run: A run dict. When it has a ``child_runs`` key, that list is
            replaced by a copy ordered by each child's ``start_time`` (an ISO
            formatted string) and every child is processed the same way.
            Dicts without the key are left untouched.
    """
    if 'child_runs' in run:
        # fromisoformat parses everything datetime.isoformat() can emit,
        # including timestamps with no fractional seconds or with a UTC
        # offset; the previous fixed "%Y-%m-%dT%H:%M:%S.%f" pattern raised
        # ValueError on those.
        run['child_runs'] = sorted(
            run['child_runs'],
            key=lambda child: datetime.fromisoformat(child['start_time']),
        )
        for child_run in run['child_runs']:
            sort_child_runs(child_run)

# Apply the recursive start-time sort, in place, to every extracted root run.
for run in res:
    sort_child_runs(run)



In [20]:

from pathlib import Path

json_output_path = 'experiments/xmode/en/xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent-langsmith-extract-details.json'

# Per-item files live in a sibling folder named after the JSON file (its path
# without the ".json" suffix), so the folder name is derived, not repeated.
output_dir = Path(json_output_path).with_suffix('')

# Create the folder, and with it every missing parent directory, BEFORE the
# first write. Previously the combined file was opened first and failed with
# FileNotFoundError when the target directory did not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

# All results in one file.
with open(json_output_path, 'w') as f:
    json.dump(res, f, indent=2)

# One file per result, named by its position in ``res``.
for i, item in enumerate(res):
    with open(output_dir / f"{i}.json", 'w') as f:
        json.dump(item, f, indent=2)

In [116]:
def check_chain_of_dict(parent_ids):
    """Return the parent ids that are referenced but have no entry of their own.

    Every non-``None`` value of ``parent_ids`` is expected to also appear as
    a key. The values that do not are returned as a list (empty when the
    mapping is complete); ``None`` is never reported.
    """
    known_ids = set(parent_ids.keys())
    referenced_ids = set(parent_ids.values())
    dangling_ids = referenced_ids - known_ids
    return [parent_id for parent_id in dangling_ids if parent_id is not None]

# if missing values is found, we need to find the missing runs recursively
def find_missing_runs(missing_parent_id, parent_ids):
    """Fill ``parent_ids`` with the ancestor chain of a missing parent run.

    Starting at ``missing_parent_id``, each run is fetched through
    ``client.list_runs`` and recorded as ``id -> parent_run_id``. The walk
    stops at a run whose parent is ``None`` or already a key of
    ``parent_ids``.

    Args:
        missing_parent_id: Id of a run that is referenced as a parent but has
            no entry in ``parent_ids``.
        parent_ids: ``run id -> parent run id`` mapping; updated in place.

    Returns:
        The same ``parent_ids`` mapping, for convenience.

    Raises:
        IndexError: If a run on the chain cannot be fetched.
    """
    current_id = missing_parent_id
    # Stop at the top of the trace (None) instead of querying for a None id,
    # and as soon as the chain joins runs that are already known.
    while current_id is not None and current_id not in parent_ids:
        fetched = list(client.list_runs(project_name=project_name, run_ids=[current_id]))
        if not fetched:
            # Same exception type the previous ``[0]`` lookup produced, but
            # with a message that names the missing run.
            raise IndexError(f"run {current_id} could not be fetched")
        run = _extract_run(fetched[0])
        # Always record the fetched run. Previously it was dropped whenever
        # its parent was already known, so the gap was never closed.
        parent_ids[run['id']] = run['parent_run_id']
        current_id = run['parent_run_id']
    return parent_ids

# List the parent ids that are referenced but have no entry of their own.
# NOTE(review): `parent_ids` is not assigned at top level in the cells shown
# here (only as a local inside functions) — confirm where it comes from, or
# this cell fails on a fresh kernel.
missing_parent_ids =  check_chain_of_dict(parent_ids)
print(missing_parent_ids)

[]
