In [None]:
import os

os.environ["ANTHROPIC_API_KEY"] = ""

In [2]:
import sys

sys.path.append("../../app/")
sys.path.append("../../")

# Metrics

In [3]:
import numpy as np
from sklearn.metrics import mean_absolute_error


def accuracy_at_threshold(gold_scores, pred_scores, threshold):
    gold_scores = np.array(gold_scores)
    pred_scores = np.array(pred_scores)
    absolute_errors = np.abs(gold_scores - pred_scores)
    within_threshold_count = np.sum(absolute_errors <= threshold)
    return within_threshold_count / len(gold_scores)


# Evaluate

In [4]:
import pandas as pd

train_df = pd.read_csv("../../data/train.csv")
val_df = pd.read_csv("../../data/test.csv")

## Kaggle dataset

In [5]:
from langchain_anthropic import ChatAnthropic
from langgraph.graph import END, START, StateGraph

import app.workflow.node as node
from app.agents.evaluation import EvaluationAgent
from app.agents.jd_extractor import JDExtractor
from app.agents.resume_extractor import ResumeExtractor
from app.workflow.state import State

model = ChatAnthropic(model="claude-3-7-sonnet-20250219", temperature=0)
RESUME_EXTRACTOR = ResumeExtractor(llm=model)
EVALUATION_AGENT = EvaluationAgent(llm=model)
JD_EXTRACTOR = JDExtractor(llm=model)

CONFIGS = [
    {"obj": node.ValidateInputNode, "configs": {"name": "VALIDATION_INPUT"}},
    {
        "obj": node.ParseContentNode,
        "configs": {
            "name": "CONTENT_EXTRACTION",
            "jd_extractor": JD_EXTRACTOR,
            "resume_extractor": RESUME_EXTRACTOR,
        },
    },
    {
        "obj": node.EvaluatePairMatchingNode,
        "configs": {"name": "EVALUATION", "evaluation_agent": EVALUATION_AGENT},
    },
]

NODES: list[node.BaseNode] = [value["obj"](**value["configs"]) for value in CONFIGS]


def build_graph():
    graph = StateGraph(state_schema=State)
    for user_node in NODES:
        graph.add_node(user_node.name, user_node)

    graph.add_edge(START, "VALIDATION_INPUT")
    graph.add_edge("VALIDATION_INPUT", "CONTENT_EXTRACTION")
    graph.add_edge("CONTENT_EXTRACTION", "EVALUATION")
    graph.add_edge("EVALUATION", END)
    return graph


In [6]:
graph = build_graph()
app = graph.compile(checkpointer=None)

In [7]:
import time
import traceback

from tqdm import tqdm

gold_scores = []
pred_scores = []

print(f"\n--- Starting Evaluation on {len(val_df)} examples ---")
batch_size = 4
for i in tqdm(
    range(0, len(val_df), batch_size),
    desc="Evaluating LangGraph in Batches",
    total=len(val_df) // batch_size,
):
    batch_df = val_df.iloc[i : i + batch_size]
    batch_gold_scores = batch_df["match_score"].tolist()

    # Run the LangGraph app
    state = State(
        inputs=[
            {
                "job_description": example["job_description"],
                "resume": example["resume"],
            }
            for _, example in batch_df.iterrows()
        ]
    )
    try:
        result = app.invoke(state)
        batch_pred_scores = [
            result["results"][j].get("score") for j in range(len(result["results"]))
        ]
    except Exception as e:
        print(f"An error occurred during graph execution: {e}")
        print(traceback.format_exc())
        batch_pred_scores = [0.0] * len(batch_df)  # Assign a penalty score

    gold_scores.extend(batch_gold_scores)
    pred_scores.extend(batch_pred_scores)

    # To avoid rate limiting
    time.sleep(1)

print("--- Evaluation Complete ---")

# --- Calculate Metrics ---

# 1. Mean Absolute Error (MAE)
final_mae = mean_absolute_error(gold_scores, pred_scores)

# 2. Accuracy@threshold
acc_at_1 = accuracy_at_threshold(gold_scores, pred_scores, threshold=1)
acc_at_2 = accuracy_at_threshold(gold_scores, pred_scores, threshold=2)
acc_at_3 = accuracy_at_threshold(gold_scores, pred_scores, threshold=3)


# --- Print Results ---

print("\n--- LangGraph Evaluation Results ---")
print(f"📊 Mean Absolute Error (MAE): {final_mae:.4f}")
print("---")
print(f"🎯 Accuracy@1 (error <= 1.0): {acc_at_1:.2%}")
print(f"🎯 Accuracy@2 (error <= 2.0): {acc_at_2:.2%}")
print(f"🎯 Accuracy@3 (error <= 3.0): {acc_at_3:.2%}")
print("--------------------------------------")


--- Starting Evaluation on 3000 examples ---


Evaluating LangGraph in Batches:   0%|          | 0/750 [00:00<?, ?it/s]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   0%|          | 1/750 [00:28<5:58:19, 28.70s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   0%|          | 2/750 [00:59<6:11:05, 29.77s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   0%|          | 3/750 [01:22<5:34:33, 26.87s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|          | 4/750 [01:47<5:23:00, 25.98s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|          | 5/750 [02:15<5:30:40, 26.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|          | 6/750 [02:40<5:26:57, 26.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|          | 7/750 [03:04<5:15:05, 25.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|          | 8/750 [03:29<5:13:51, 25.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|          | 9/750 [03:51<5:01:35, 24.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|▏         | 10/750 [04:17<5:03:45, 24.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|▏         | 11/750 [04:39<4:54:13, 23.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 12/750 [05:06<5:05:40, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 13/750 [05:29<4:59:38, 24.39s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 14/750 [05:53<4:56:05, 24.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 15/750 [06:18<5:01:12, 24.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 16/750 [06:43<5:01:08, 24.62s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 17/750 [07:05<4:52:27, 23.94s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 18/750 [07:32<5:01:34, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 19/750 [07:58<5:05:35, 25.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 20/750 [08:24<5:08:33, 25.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 21/750 [08:48<5:04:11, 25.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 22/750 [09:14<5:05:01, 25.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 23/750 [09:37<4:59:24, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 24/750 [10:01<4:55:01, 24.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 25/750 [10:25<4:54:44, 24.39s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 26/750 [10:48<4:49:02, 23.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   4%|▎         | 27/750 [11:11<4:45:26, 23.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   4%|▎         | 28/750 [11:36<4:47:45, 23.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   4%|▍         | 29/750 [12:01<4:51:27, 24.25s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   4%|▍         | 30/750 [12:25<4:49:55, 24.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   4%|▍         | 31/750 [12:48<4:47:27, 23.99s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   4%|▍         | 32/750 [13:12<4:44:21, 23.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   4%|▍         | 33/750 [13:35<4:42:33, 23.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▍         | 34/750 [14:01<4:51:02, 24.39s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▍         | 35/750 [14:26<4:54:03, 24.68s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▍         | 36/750 [14:53<5:00:02, 25.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▍         | 37/750 [15:18<4:58:54, 25.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▌         | 38/750 [15:45<5:03:50, 25.60s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▌         | 39/750 [16:08<4:56:40, 25.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▌         | 40/750 [16:39<5:16:23, 26.74s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   5%|▌         | 41/750 [17:06<5:16:35, 26.79s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▌         | 42/750 [17:32<5:12:15, 26.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▌         | 43/750 [17:55<5:02:21, 25.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▌         | 44/750 [18:20<4:58:00, 25.33s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▌         | 45/750 [18:47<5:03:44, 25.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▌         | 46/750 [19:14<5:05:54, 26.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▋         | 47/750 [19:36<4:53:04, 25.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▋         | 48/750 [20:02<4:56:15, 25.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   7%|▋         | 49/750 [20:28<4:56:13, 25.35s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   7%|▋         | 50/750 [20:52<4:52:59, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   7%|▋         | 51/750 [21:16<4:49:57, 24.89s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:   7%|▋         | 52/750 [21:45<5:02:54, 26.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   7%|▋         | 53/750 [22:10<4:57:18, 25.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   7%|▋         | 54/750 [22:36<4:59:42, 25.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   7%|▋         | 55/750 [23:00<4:50:45, 25.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   7%|▋         | 56/750 [23:25<4:50:34, 25.12s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 57/750 [23:46<4:35:17, 23.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 58/750 [24:11<4:39:17, 24.22s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 59/750 [24:39<4:52:04, 25.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 60/750 [25:04<4:51:48, 25.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 61/750 [25:28<4:45:49, 24.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 62/750 [25:55<4:54:06, 25.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 63/750 [26:18<4:42:49, 24.70s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▊         | 64/750 [26:46<4:53:21, 25.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▊         | 65/750 [27:09<4:43:41, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▉         | 66/750 [27:34<4:45:14, 25.02s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▉         | 67/750 [28:01<4:51:54, 25.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▉         | 68/750 [28:24<4:43:32, 24.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▉         | 69/750 [28:48<4:39:14, 24.60s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▉         | 70/750 [29:13<4:38:01, 24.53s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▉         | 71/750 [29:39<4:44:30, 25.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  10%|▉         | 72/750 [30:05<4:45:31, 25.27s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  10%|▉         | 73/750 [30:30<4:45:07, 25.27s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  10%|▉         | 74/750 [30:56<4:46:25, 25.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  10%|█         | 75/750 [31:22<4:47:54, 25.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  10%|█         | 76/750 [31:47<4:47:46, 25.62s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  10%|█         | 77/750 [32:15<4:53:03, 26.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  10%|█         | 78/750 [32:41<4:54:18, 26.28s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█         | 79/750 [33:06<4:47:18, 25.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█         | 80/750 [33:35<4:57:21, 26.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█         | 81/750 [33:59<4:48:35, 25.88s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█         | 82/750 [34:24<4:45:36, 25.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█         | 83/750 [34:48<4:40:22, 25.22s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█         | 84/750 [35:11<4:33:09, 24.61s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█▏        | 85/750 [35:37<4:36:17, 24.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  11%|█▏        | 86/750 [36:02<4:37:23, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  12%|█▏        | 87/750 [36:26<4:33:49, 24.78s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  12%|█▏        | 88/750 [36:52<4:37:01, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  12%|█▏        | 89/750 [37:18<4:38:10, 25.25s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  12%|█▏        | 90/750 [37:42<4:33:51, 24.90s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  12%|█▏        | 91/750 [38:07<4:32:56, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  12%|█▏        | 92/750 [38:30<4:28:55, 24.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  12%|█▏        | 93/750 [38:53<4:21:18, 23.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 94/750 [39:18<4:24:36, 24.20s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 95/750 [39:42<4:25:37, 24.33s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 96/750 [40:05<4:20:31, 23.90s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 97/750 [40:29<4:19:41, 23.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 98/750 [40:56<4:27:57, 24.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 99/750 [41:20<4:25:49, 24.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 100/750 [41:45<4:26:51, 24.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  13%|█▎        | 101/750 [42:09<4:25:42, 24.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  14%|█▎        | 102/750 [42:32<4:21:06, 24.18s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  14%|█▎        | 103/750 [42:55<4:17:28, 23.88s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  14%|█▍        | 104/750 [43:21<4:23:38, 24.49s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  14%|█▍        | 105/750 [43:49<4:32:44, 25.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  14%|█▍        | 106/750 [44:12<4:24:54, 24.68s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  14%|█▍        | 107/750 [44:36<4:21:33, 24.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  14%|█▍        | 108/750 [44:58<4:15:50, 23.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▍        | 109/750 [45:25<4:23:29, 24.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▍        | 110/750 [45:47<4:15:37, 23.96s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▍        | 111/750 [46:10<4:12:01, 23.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▍        | 112/750 [46:36<4:18:23, 24.30s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▌        | 113/750 [46:58<4:11:53, 23.73s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▌        | 114/750 [47:24<4:18:49, 24.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▌        | 115/750 [47:50<4:21:59, 24.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  15%|█▌        | 116/750 [48:14<4:20:14, 24.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  16%|█▌        | 117/750 [48:39<4:19:10, 24.57s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  16%|█▌        | 118/750 [49:02<4:15:18, 24.24s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  16%|█▌        | 119/750 [49:28<4:19:06, 24.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  16%|█▌        | 120/750 [49:53<4:21:44, 24.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  16%|█▌        | 121/750 [50:19<4:23:30, 25.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  16%|█▋        | 122/750 [50:45<4:25:22, 25.35s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  16%|█▋        | 123/750 [51:12<4:30:27, 25.88s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  17%|█▋        | 124/750 [51:35<4:22:41, 25.18s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  17%|█▋        | 125/750 [52:03<4:28:36, 25.79s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  17%|█▋        | 126/750 [52:30<4:33:43, 26.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  17%|█▋        | 127/750 [52:55<4:28:06, 25.82s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  17%|█▋        | 128/750 [53:18<4:20:49, 25.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  17%|█▋        | 129/750 [53:42<4:16:42, 24.80s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  17%|█▋        | 130/750 [54:08<4:18:17, 25.00s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  17%|█▋        | 131/750 [54:30<4:08:04, 24.05s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  18%|█▊        | 132/750 [54:55<4:11:32, 24.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  18%|█▊        | 133/750 [55:19<4:10:57, 24.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  18%|█▊        | 134/750 [55:46<4:18:49, 25.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  18%|█▊        | 135/750 [56:10<4:14:30, 24.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  18%|█▊        | 136/750 [56:36<4:15:12, 24.94s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  18%|█▊        | 137/750 [57:01<4:17:08, 25.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  18%|█▊        | 138/750 [57:28<4:21:22, 25.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▊        | 139/750 [57:53<4:20:24, 25.57s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▊        | 140/750 [58:19<4:18:38, 25.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▉        | 141/750 [58:44<4:17:01, 25.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▉        | 142/750 [59:06<4:07:43, 24.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▉        | 143/750 [59:31<4:10:30, 24.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▉        | 144/750 [59:56<4:10:42, 24.82s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▉        | 145/750 [1:00:21<4:08:44, 24.67s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  19%|█▉        | 146/750 [1:00:45<4:08:09, 24.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  20%|█▉        | 147/750 [1:01:13<4:15:51, 25.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  20%|█▉        | 148/750 [1:01:37<4:11:44, 25.09s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  20%|█▉        | 149/750 [1:02:04<4:17:02, 25.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  20%|██        | 150/750 [1:02:30<4:18:42, 25.87s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  20%|██        | 151/750 [1:02:57<4:20:08, 26.06s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  20%|██        | 152/750 [1:03:21<4:12:42, 25.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  20%|██        | 153/750 [1:03:47<4:14:38, 25.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██        | 154/750 [1:04:12<4:12:04, 25.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██        | 155/750 [1:04:36<4:09:03, 25.12s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██        | 156/750 [1:05:00<4:04:38, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██        | 157/750 [1:05:26<4:07:21, 25.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██        | 158/750 [1:05:52<4:12:06, 25.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██        | 159/750 [1:06:16<4:07:30, 25.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██▏       | 160/750 [1:06:41<4:04:48, 24.90s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  21%|██▏       | 161/750 [1:07:02<3:53:09, 23.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  22%|██▏       | 162/750 [1:07:26<3:53:18, 23.81s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  22%|██▏       | 163/750 [1:07:51<3:55:21, 24.06s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  22%|██▏       | 164/750 [1:08:16<3:58:00, 24.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  22%|██▏       | 165/750 [1:08:42<4:02:48, 24.90s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  22%|██▏       | 166/750 [1:09:07<4:03:58, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  22%|██▏       | 167/750 [1:09:35<4:10:15, 25.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  22%|██▏       | 168/750 [1:10:02<4:14:04, 26.19s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 169/750 [1:10:27<4:10:33, 25.87s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 170/750 [1:10:51<4:04:12, 25.26s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 171/750 [1:11:16<4:03:18, 25.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 172/750 [1:11:39<3:57:53, 24.70s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 173/750 [1:12:05<3:59:48, 24.94s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 174/750 [1:12:34<4:11:01, 26.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 175/750 [1:12:59<4:07:14, 25.80s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  23%|██▎       | 176/750 [1:13:25<4:07:25, 25.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  24%|██▎       | 177/750 [1:13:49<4:03:13, 25.47s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  24%|██▎       | 178/750 [1:14:14<3:59:11, 25.09s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  24%|██▍       | 179/750 [1:14:39<3:58:36, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  24%|██▍       | 180/750 [1:15:02<3:54:50, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  24%|██▍       | 181/750 [1:15:27<3:54:17, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  24%|██▍       | 182/750 [1:15:52<3:55:05, 24.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  24%|██▍       | 183/750 [1:16:16<3:52:03, 24.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  25%|██▍       | 184/750 [1:16:40<3:50:39, 24.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  25%|██▍       | 185/750 [1:17:04<3:47:59, 24.21s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  25%|██▍       | 186/750 [1:17:29<3:48:18, 24.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  25%|██▍       | 187/750 [1:17:53<3:48:55, 24.40s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  25%|██▌       | 188/750 [1:18:19<3:53:10, 24.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  25%|██▌       | 189/750 [1:18:42<3:45:39, 24.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  25%|██▌       | 190/750 [1:19:07<3:48:37, 24.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  25%|██▌       | 191/750 [1:19:32<3:48:48, 24.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  26%|██▌       | 192/750 [1:19:55<3:44:33, 24.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  26%|██▌       | 193/750 [1:20:19<3:44:57, 24.23s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  26%|██▌       | 194/750 [1:20:43<3:43:19, 24.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  26%|██▌       | 195/750 [1:21:07<3:42:03, 24.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  26%|██▌       | 196/750 [1:21:32<3:44:59, 24.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  26%|██▋       | 197/750 [1:21:57<3:46:01, 24.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  26%|██▋       | 198/750 [1:22:22<3:47:17, 24.71s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7VtJapsrnwsZVnJYM8D'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 199/750 [1:22:44<3:39:40, 23.92s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7VtYP3ViHb6GMBZp8eg'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 200/750 [1:22:47<2:42:20, 17.71s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7Vtn9GrZvtmmvLHKJKU'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 201/750 [1:22:51<2:02:19, 13.37s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7Vu1uW1HWo9Bh7CpScR'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 202/750 [1:22:54<1:34:12, 10.32s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7VuGSNKjEsZAAH8HiNt'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 203/750 [1:22:57<1:15:18,  8.26s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7VuX9QDPyc9PgYm1MrV'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 204/750 [1:23:01<1:01:49,  6.79s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7Vum4ZfSQpiLNDu6Png'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 205/750 [1:23:04<52:02,  5.73s/it]  

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'This request would exceed the rate limit for your organization (b28eb3d3-0d52-4991-aa6d-35c473076012) of 1,000,000 input tokens per minute. For details, refer to: https://docs.claude.com/en/api/rate-limits. You can see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}, 'request_id': 'req_011CU7Vv1MYYBTtpTSu7ZuuK'}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langg

Evaluating LangGraph in Batches:  27%|██▋       | 206/750 [1:23:07<45:38,  5.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  28%|██▊       | 207/750 [1:24:31<4:19:50, 28.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  28%|██▊       | 208/750 [1:24:56<4:08:11, 27.48s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  28%|██▊       | 209/750 [1:25:21<4:02:37, 26.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  28%|██▊       | 210/750 [1:25:45<3:53:11, 25.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  28%|██▊       | 211/750 [1:26:09<3:47:28, 25.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  28%|██▊       | 212/750 [1:26:36<3:52:39, 25.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  28%|██▊       | 213/750 [1:27:03<3:53:24, 26.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  29%|██▊       | 214/750 [1:27:25<3:42:46, 24.94s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  29%|██▊       | 215/750 [1:27:49<3:39:56, 24.67s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  29%|██▉       | 216/750 [1:28:13<3:36:57, 24.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  29%|██▉       | 217/750 [1:28:38<3:37:55, 24.53s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  29%|██▉       | 218/750 [1:29:05<3:45:32, 25.44s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  29%|██▉       | 219/750 [1:29:33<3:50:27, 26.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  29%|██▉       | 220/750 [1:29:56<3:43:23, 25.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  29%|██▉       | 221/750 [1:30:20<3:40:01, 24.96s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  30%|██▉       | 222/750 [1:30:43<3:34:19, 24.35s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  30%|██▉       | 223/750 [1:31:06<3:30:05, 23.92s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  30%|██▉       | 224/750 [1:31:31<3:31:01, 24.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  30%|███       | 225/750 [1:31:56<3:34:19, 24.49s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  30%|███       | 226/750 [1:32:22<3:38:45, 25.05s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  30%|███       | 227/750 [1:32:50<3:44:32, 25.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  30%|███       | 228/750 [1:33:12<3:33:51, 24.58s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███       | 229/750 [1:33:34<3:28:11, 23.98s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███       | 230/750 [1:34:01<3:34:40, 24.77s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███       | 231/750 [1:34:27<3:37:39, 25.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███       | 232/750 [1:34:54<3:43:12, 25.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███       | 233/750 [1:35:19<3:38:32, 25.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███       | 234/750 [1:35:43<3:36:10, 25.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███▏      | 235/750 [1:36:06<3:30:15, 24.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  31%|███▏      | 236/750 [1:36:33<3:35:07, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 237/750 [1:36:56<3:29:05, 24.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 238/750 [1:37:22<3:33:45, 25.05s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 239/750 [1:37:44<3:24:33, 24.02s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 240/750 [1:38:10<3:30:20, 24.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 241/750 [1:38:33<3:24:58, 24.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 242/750 [1:38:56<3:22:21, 23.90s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 243/750 [1:39:22<3:25:45, 24.35s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 244/750 [1:39:47<3:27:05, 24.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 245/750 [1:40:09<3:19:46, 23.74s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 246/750 [1:40:33<3:22:16, 24.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 247/750 [1:40:59<3:26:13, 24.60s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 248/750 [1:41:23<3:24:12, 24.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 249/750 [1:41:50<3:29:35, 25.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 250/750 [1:42:15<3:29:25, 25.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 251/750 [1:42:40<3:27:41, 24.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▎      | 252/750 [1:43:07<3:33:25, 25.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▎      | 253/750 [1:43:32<3:29:46, 25.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▍      | 254/750 [1:43:56<3:26:43, 25.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▍      | 255/750 [1:44:22<3:29:43, 25.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▍      | 256/750 [1:44:51<3:36:41, 26.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▍      | 257/750 [1:45:18<3:38:46, 26.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▍      | 258/750 [1:45:42<3:32:02, 25.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▍      | 259/750 [1:46:06<3:27:24, 25.34s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▍      | 260/750 [1:46:38<3:42:32, 27.25s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▍      | 261/750 [1:47:01<3:32:51, 26.12s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▍      | 262/750 [1:47:28<3:32:46, 26.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▌      | 263/750 [1:47:49<3:20:53, 24.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▌      | 264/750 [1:48:16<3:26:38, 25.51s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▌      | 265/750 [1:48:43<3:27:56, 25.73s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  35%|███▌      | 266/750 [1:49:06<3:22:54, 25.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  36%|███▌      | 267/750 [1:49:33<3:25:44, 25.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  36%|███▌      | 268/750 [1:49:56<3:18:48, 24.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  36%|███▌      | 269/750 [1:50:20<3:17:43, 24.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  36%|███▌      | 270/750 [1:50:43<3:11:57, 23.99s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  36%|███▌      | 271/750 [1:51:07<3:12:05, 24.06s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  36%|███▋      | 272/750 [1:51:32<3:14:06, 24.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  36%|███▋      | 273/750 [1:51:56<3:13:16, 24.31s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 274/750 [1:52:23<3:18:01, 24.96s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 275/750 [1:52:59<3:44:33, 28.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 276/750 [1:53:27<3:42:29, 28.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 277/750 [1:53:51<3:32:22, 26.94s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 278/750 [1:54:16<3:28:52, 26.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 279/750 [1:54:40<3:21:09, 25.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 280/750 [1:55:06<3:22:09, 25.81s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 281/750 [1:55:29<3:15:37, 25.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  38%|███▊      | 282/750 [1:55:54<3:15:04, 25.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  38%|███▊      | 283/750 [1:56:18<3:11:35, 24.62s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  38%|███▊      | 284/750 [1:56:46<3:18:51, 25.60s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  38%|███▊      | 285/750 [1:57:11<3:17:59, 25.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  38%|███▊      | 286/750 [1:57:35<3:13:47, 25.06s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  38%|███▊      | 287/750 [1:58:02<3:17:54, 25.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  38%|███▊      | 288/750 [1:58:26<3:13:48, 25.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▊      | 289/750 [1:58:51<3:11:24, 24.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▊      | 290/750 [1:59:14<3:07:24, 24.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▉      | 291/750 [1:59:38<3:05:02, 24.19s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▉      | 292/750 [2:00:07<3:17:26, 25.87s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▉      | 293/750 [2:00:34<3:17:53, 25.98s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▉      | 294/750 [2:01:01<3:20:29, 26.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▉      | 295/750 [2:01:27<3:19:34, 26.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▉      | 296/750 [2:01:51<3:12:41, 25.47s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  40%|███▉      | 297/750 [2:02:14<3:06:48, 24.74s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  40%|███▉      | 298/750 [2:02:40<3:09:51, 25.20s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  40%|███▉      | 299/750 [2:03:05<3:08:12, 25.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  40%|████      | 300/750 [2:03:27<3:02:48, 24.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  40%|████      | 301/750 [2:03:52<3:02:48, 24.43s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  40%|████      | 302/750 [2:04:16<3:02:11, 24.40s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  40%|████      | 303/750 [2:04:41<3:02:51, 24.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████      | 304/750 [2:05:06<3:03:42, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████      | 305/750 [2:05:32<3:04:41, 24.90s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████      | 306/750 [2:05:59<3:10:28, 25.74s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████      | 307/750 [2:06:23<3:06:05, 25.20s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████      | 308/750 [2:06:49<3:07:11, 25.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████      | 309/750 [2:07:14<3:04:39, 25.12s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████▏     | 310/750 [2:07:39<3:05:10, 25.25s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  41%|████▏     | 311/750 [2:08:03<3:01:39, 24.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  42%|████▏     | 312/750 [2:08:31<3:08:34, 25.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  42%|████▏     | 313/750 [2:08:53<2:58:28, 24.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  42%|████▏     | 314/750 [2:09:17<2:58:49, 24.61s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  42%|████▏     | 315/750 [2:09:43<2:59:30, 24.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  42%|████▏     | 316/750 [2:10:06<2:56:18, 24.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  42%|████▏     | 317/750 [2:10:29<2:53:31, 24.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  42%|████▏     | 318/750 [2:10:56<2:58:48, 24.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 319/750 [2:11:21<2:59:54, 25.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 320/750 [2:11:49<3:03:58, 25.67s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 321/750 [2:12:11<2:56:11, 24.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 322/750 [2:12:35<2:54:33, 24.47s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 323/750 [2:13:00<2:54:24, 24.51s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 324/750 [2:13:23<2:51:36, 24.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 325/750 [2:13:48<2:53:28, 24.49s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  43%|████▎     | 326/750 [2:14:12<2:51:07, 24.22s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  44%|████▎     | 327/750 [2:14:38<2:55:40, 24.92s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  44%|████▎     | 328/750 [2:15:04<2:57:25, 25.23s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  44%|████▍     | 329/750 [2:15:25<2:48:22, 24.00s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  44%|████▍     | 330/750 [2:15:51<2:51:23, 24.48s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  44%|████▍     | 331/750 [2:16:17<2:54:44, 25.02s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  44%|████▍     | 332/750 [2:16:42<2:53:57, 24.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  44%|████▍     | 333/750 [2:17:07<2:52:26, 24.81s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  45%|████▍     | 334/750 [2:17:29<2:46:59, 24.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  45%|████▍     | 335/750 [2:17:55<2:50:56, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  45%|████▍     | 336/750 [2:18:20<2:50:31, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  45%|████▍     | 337/750 [2:18:46<2:53:03, 25.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  45%|████▌     | 338/750 [2:19:11<2:51:50, 25.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  45%|████▌     | 339/750 [2:19:35<2:49:36, 24.76s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  45%|████▌     | 340/750 [2:20:02<2:53:38, 25.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  45%|████▌     | 341/750 [2:20:26<2:51:35, 25.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  46%|████▌     | 342/750 [2:20:52<2:51:25, 25.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  46%|████▌     | 343/750 [2:21:16<2:48:38, 24.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  46%|████▌     | 344/750 [2:21:43<2:53:31, 25.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  46%|████▌     | 345/750 [2:22:09<2:53:08, 25.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  46%|████▌     | 346/750 [2:22:37<2:57:07, 26.31s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  46%|████▋     | 347/750 [2:23:01<2:52:52, 25.74s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  46%|████▋     | 348/750 [2:23:25<2:49:19, 25.27s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  47%|████▋     | 349/750 [2:23:50<2:47:31, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  47%|████▋     | 350/750 [2:24:15<2:47:42, 25.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  47%|████▋     | 351/750 [2:24:39<2:45:00, 24.81s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  47%|████▋     | 352/750 [2:25:05<2:47:19, 25.23s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  47%|████▋     | 353/750 [2:25:30<2:45:33, 25.02s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  47%|████▋     | 354/750 [2:25:57<2:48:28, 25.53s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  47%|████▋     | 355/750 [2:26:24<2:51:29, 26.05s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  47%|████▋     | 356/750 [2:26:47<2:45:52, 25.26s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  48%|████▊     | 357/750 [2:27:19<2:58:42, 27.28s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  48%|████▊     | 358/750 [2:27:47<2:59:05, 27.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  48%|████▊     | 359/750 [2:28:15<2:58:46, 27.43s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  48%|████▊     | 360/750 [2:28:41<2:56:28, 27.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  48%|████▊     | 361/750 [2:29:05<2:48:56, 26.06s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  48%|████▊     | 362/750 [2:29:31<2:48:57, 26.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  48%|████▊     | 363/750 [2:29:57<2:47:53, 26.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▊     | 364/750 [2:30:19<2:41:09, 25.05s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▊     | 365/750 [2:30:44<2:39:50, 24.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▉     | 366/750 [2:31:09<2:39:02, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▉     | 367/750 [2:31:35<2:40:48, 25.19s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▉     | 368/750 [2:32:00<2:40:49, 25.26s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▉     | 369/750 [2:32:24<2:38:18, 24.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▉     | 370/750 [2:32:50<2:38:30, 25.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  49%|████▉     | 371/750 [2:33:16<2:41:12, 25.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  50%|████▉     | 372/750 [2:33:42<2:41:44, 25.67s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  50%|████▉     | 373/750 [2:34:07<2:39:11, 25.34s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  50%|████▉     | 374/750 [2:34:34<2:43:07, 26.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  50%|█████     | 375/750 [2:35:02<2:44:58, 26.40s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  50%|█████     | 376/750 [2:35:26<2:39:46, 25.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  50%|█████     | 377/750 [2:35:54<2:45:12, 26.58s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  50%|█████     | 378/750 [2:36:19<2:42:02, 26.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████     | 379/750 [2:36:44<2:37:51, 25.53s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████     | 380/750 [2:37:08<2:35:58, 25.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████     | 381/750 [2:37:32<2:31:51, 24.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████     | 382/750 [2:37:55<2:28:31, 24.22s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████     | 383/750 [2:38:19<2:29:08, 24.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████     | 384/750 [2:38:43<2:27:24, 24.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████▏    | 385/750 [2:39:09<2:30:30, 24.74s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  51%|█████▏    | 386/750 [2:39:33<2:29:07, 24.58s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  52%|█████▏    | 387/750 [2:39:56<2:25:44, 24.09s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  52%|█████▏    | 388/750 [2:40:22<2:28:31, 24.62s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  52%|█████▏    | 389/750 [2:40:45<2:24:59, 24.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  52%|█████▏    | 390/750 [2:41:10<2:25:57, 24.33s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  52%|█████▏    | 391/750 [2:41:34<2:25:19, 24.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  52%|█████▏    | 392/750 [2:41:58<2:24:35, 24.23s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  52%|█████▏    | 393/750 [2:42:21<2:22:09, 23.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 394/750 [2:42:44<2:20:13, 23.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 395/750 [2:43:09<2:21:49, 23.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 396/750 [2:43:34<2:23:00, 24.24s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 397/750 [2:44:03<2:30:21, 25.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 398/750 [2:44:29<2:31:50, 25.88s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 399/750 [2:44:54<2:29:40, 25.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 400/750 [2:45:19<2:27:25, 25.27s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 401/750 [2:45:50<2:38:00, 27.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▎    | 402/750 [2:46:13<2:30:07, 25.88s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▎    | 403/750 [2:46:39<2:29:50, 25.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▍    | 404/750 [2:47:07<2:32:05, 26.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▍    | 405/750 [2:47:34<2:33:21, 26.67s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▍    | 406/750 [2:48:06<2:42:49, 28.40s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▍    | 407/750 [2:48:33<2:38:57, 27.81s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▍    | 408/750 [2:48:59<2:35:14, 27.23s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▍    | 409/750 [2:49:24<2:31:42, 26.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▍    | 410/750 [2:49:50<2:30:31, 26.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▍    | 411/750 [2:50:16<2:28:42, 26.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▍    | 412/750 [2:50:42<2:26:58, 26.09s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▌    | 413/750 [2:51:07<2:24:39, 25.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▌    | 414/750 [2:51:33<2:24:27, 25.80s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▌    | 415/750 [2:51:59<2:25:30, 26.06s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▌    | 416/750 [2:52:26<2:25:30, 26.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  56%|█████▌    | 417/750 [2:52:48<2:18:50, 25.02s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  56%|█████▌    | 418/750 [2:53:13<2:17:55, 24.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  56%|█████▌    | 419/750 [2:53:37<2:17:06, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  56%|█████▌    | 420/750 [2:54:03<2:17:25, 24.99s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  56%|█████▌    | 421/750 [2:54:27<2:15:12, 24.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  56%|█████▋    | 422/750 [2:54:52<2:15:45, 24.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  56%|█████▋    | 423/750 [2:55:16<2:13:42, 24.53s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 424/750 [2:55:40<2:12:21, 24.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 425/750 [2:56:06<2:14:30, 24.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 426/750 [2:56:31<2:14:51, 24.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 427/750 [2:56:53<2:10:37, 24.26s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 428/750 [2:57:17<2:09:20, 24.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 429/750 [2:57:43<2:11:20, 24.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 430/750 [2:58:04<2:05:31, 23.54s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 431/750 [2:58:29<2:08:12, 24.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  58%|█████▊    | 432/750 [2:58:55<2:09:21, 24.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  58%|█████▊    | 433/750 [2:59:18<2:07:37, 24.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  58%|█████▊    | 434/750 [2:59:42<2:07:24, 24.19s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  58%|█████▊    | 435/750 [3:00:09<2:10:46, 24.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  58%|█████▊    | 436/750 [3:00:36<2:13:16, 25.47s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  58%|█████▊    | 437/750 [3:00:58<2:08:31, 24.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  58%|█████▊    | 438/750 [3:01:24<2:10:07, 25.02s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▊    | 439/750 [3:01:52<2:13:08, 25.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▊    | 440/750 [3:02:16<2:11:29, 25.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▉    | 441/750 [3:02:43<2:12:16, 25.68s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▉    | 442/750 [3:03:07<2:10:11, 25.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▉    | 443/750 [3:03:33<2:10:17, 25.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▉    | 444/750 [3:03:59<2:10:03, 25.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▉    | 445/750 [3:04:24<2:09:12, 25.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  59%|█████▉    | 446/750 [3:04:49<2:08:33, 25.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  60%|█████▉    | 447/750 [3:05:12<2:04:50, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  60%|█████▉    | 448/750 [3:05:38<2:06:23, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  60%|█████▉    | 449/750 [3:06:03<2:05:49, 25.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  60%|██████    | 450/750 [3:06:28<2:04:11, 24.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  60%|██████    | 451/750 [3:06:50<2:00:47, 24.24s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  60%|██████    | 452/750 [3:07:15<2:00:37, 24.29s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  60%|██████    | 453/750 [3:07:43<2:06:13, 25.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████    | 454/750 [3:08:09<2:05:36, 25.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████    | 455/750 [3:08:33<2:04:01, 25.22s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████    | 456/750 [3:08:57<2:01:19, 24.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████    | 457/750 [3:09:22<2:00:54, 24.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████    | 458/750 [3:09:47<2:01:20, 24.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████    | 459/750 [3:10:11<1:59:40, 24.68s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████▏   | 460/750 [3:10:39<2:04:32, 25.77s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████▏   | 461/750 [3:11:04<2:03:10, 25.57s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  62%|██████▏   | 462/750 [3:11:32<2:04:56, 26.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  62%|██████▏   | 463/750 [3:11:57<2:04:17, 25.98s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  62%|██████▏   | 464/750 [3:12:24<2:05:20, 26.30s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  62%|██████▏   | 465/750 [3:12:50<2:04:19, 26.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  62%|██████▏   | 466/750 [3:13:14<2:00:56, 25.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  62%|██████▏   | 467/750 [3:13:40<2:00:42, 25.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  62%|██████▏   | 468/750 [3:14:06<2:01:00, 25.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 469/750 [3:14:34<2:03:42, 26.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 470/750 [3:14:59<2:00:40, 25.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 471/750 [3:15:26<2:02:14, 26.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 472/750 [3:15:54<2:03:38, 26.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 473/750 [3:16:18<2:00:22, 26.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 474/750 [3:16:43<1:58:12, 25.70s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 475/750 [3:17:09<1:57:59, 25.74s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 476/750 [3:17:33<1:55:25, 25.28s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▎   | 477/750 [3:17:58<1:53:55, 25.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▎   | 478/750 [3:18:22<1:52:42, 24.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▍   | 479/750 [3:18:47<1:52:41, 24.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▍   | 480/750 [3:19:13<1:53:15, 25.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▍   | 481/750 [3:19:39<1:53:51, 25.39s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▍   | 482/750 [3:20:03<1:51:43, 25.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▍   | 483/750 [3:20:28<1:51:15, 25.00s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▍   | 484/750 [3:20:55<1:53:08, 25.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▍   | 485/750 [3:21:19<1:51:04, 25.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▍   | 486/750 [3:21:43<1:48:32, 24.67s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▍   | 487/750 [3:22:08<1:49:35, 25.00s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▌   | 488/750 [3:22:35<1:51:07, 25.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▌   | 489/750 [3:23:02<1:53:09, 26.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▌   | 490/750 [3:23:27<1:50:37, 25.53s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  65%|██████▌   | 491/750 [3:23:52<1:49:51, 25.45s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  66%|██████▌   | 492/750 [3:24:20<1:52:43, 26.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  66%|██████▌   | 493/750 [3:24:41<1:45:59, 24.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  66%|██████▌   | 494/750 [3:25:08<1:48:42, 25.48s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  66%|██████▌   | 495/750 [3:25:33<1:47:30, 25.30s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  66%|██████▌   | 496/750 [3:26:00<1:48:42, 25.68s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  66%|██████▋   | 497/750 [3:26:24<1:46:11, 25.18s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  66%|██████▋   | 498/750 [3:26:51<1:48:45, 25.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 499/750 [3:27:19<1:49:57, 26.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 500/750 [3:27:45<1:49:18, 26.23s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 501/750 [3:28:10<1:47:35, 25.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 502/750 [3:28:39<1:50:50, 26.82s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 503/750 [3:29:03<1:47:25, 26.09s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 504/750 [3:29:30<1:48:25, 26.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 505/750 [3:29:54<1:44:23, 25.57s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  67%|██████▋   | 506/750 [3:30:19<1:43:06, 25.35s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  68%|██████▊   | 507/750 [3:30:43<1:41:35, 25.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  68%|██████▊   | 508/750 [3:31:06<1:38:48, 24.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  68%|██████▊   | 509/750 [3:31:33<1:41:15, 25.21s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  68%|██████▊   | 510/750 [3:32:00<1:42:48, 25.70s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  68%|██████▊   | 511/750 [3:32:26<1:42:25, 25.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  68%|██████▊   | 512/750 [3:32:51<1:40:56, 25.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  68%|██████▊   | 513/750 [3:33:17<1:41:52, 25.79s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▊   | 514/750 [3:33:41<1:39:08, 25.20s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▊   | 515/750 [3:34:09<1:41:24, 25.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▉   | 516/750 [3:34:35<1:41:26, 26.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▉   | 517/750 [3:35:00<1:40:19, 25.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▉   | 518/750 [3:35:25<1:39:06, 25.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▉   | 519/750 [3:35:50<1:37:52, 25.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▉   | 520/750 [3:36:15<1:36:37, 25.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  69%|██████▉   | 521/750 [3:36:42<1:37:46, 25.62s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|██████▉   | 522/750 [3:37:07<1:37:26, 25.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|██████▉   | 523/750 [3:37:30<1:33:57, 24.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|██████▉   | 524/750 [3:37:54<1:32:14, 24.49s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|███████   | 525/750 [3:38:19<1:32:00, 24.53s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|███████   | 526/750 [3:38:42<1:30:28, 24.23s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|███████   | 527/750 [3:39:06<1:29:06, 23.98s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|███████   | 528/750 [3:39:28<1:27:01, 23.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████   | 529/750 [3:39:52<1:27:15, 23.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████   | 530/750 [3:40:17<1:28:35, 24.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████   | 531/750 [3:40:42<1:29:12, 24.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████   | 532/750 [3:41:06<1:28:09, 24.26s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████   | 533/750 [3:41:33<1:30:48, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████   | 534/750 [3:41:58<1:30:20, 25.09s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  71%|███████▏  | 535/750 [3:42:25<1:31:26, 25.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████▏  | 536/750 [3:42:51<1:31:19, 25.61s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  72%|███████▏  | 537/750 [3:43:13<1:27:45, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  72%|███████▏  | 538/750 [3:43:42<1:31:51, 26.00s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  72%|███████▏  | 539/750 [3:44:07<1:29:49, 25.54s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  72%|███████▏  | 540/750 [3:44:29<1:25:35, 24.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  72%|███████▏  | 541/750 [3:44:52<1:24:21, 24.22s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  72%|███████▏  | 542/750 [3:45:14<1:21:33, 23.53s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  72%|███████▏  | 543/750 [3:45:40<1:23:19, 24.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  73%|███████▎  | 544/750 [3:46:04<1:22:52, 24.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  73%|███████▎  | 545/750 [3:46:27<1:21:27, 23.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  73%|███████▎  | 546/750 [3:46:53<1:22:41, 24.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  73%|███████▎  | 547/750 [3:47:22<1:27:37, 25.90s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  73%|███████▎  | 548/750 [3:47:50<1:28:56, 26.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  73%|███████▎  | 549/750 [3:48:14<1:26:29, 25.82s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  73%|███████▎  | 550/750 [3:48:38<1:24:20, 25.30s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  73%|███████▎  | 551/750 [3:49:03<1:22:44, 24.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▎  | 552/750 [3:49:25<1:20:04, 24.26s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▎  | 553/750 [3:49:49<1:19:17, 24.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▍  | 554/750 [3:50:16<1:21:10, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▍  | 555/750 [3:50:41<1:21:41, 25.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▍  | 556/750 [3:51:04<1:18:53, 24.40s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▍  | 557/750 [3:51:29<1:19:24, 24.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▍  | 558/750 [3:51:54<1:19:14, 24.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▍  | 559/750 [3:52:20<1:19:22, 24.94s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▍  | 560/750 [3:52:46<1:20:12, 25.33s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▍  | 561/750 [3:53:11<1:19:50, 25.35s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▍  | 562/750 [3:53:38<1:20:50, 25.80s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▌  | 563/750 [3:54:00<1:16:53, 24.67s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▌  | 564/750 [3:54:26<1:17:01, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▌  | 565/750 [3:54:48<1:14:49, 24.27s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▌  | 566/750 [3:55:12<1:13:31, 23.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▌  | 567/750 [3:55:39<1:16:17, 25.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▌  | 568/750 [3:56:03<1:14:47, 24.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▌  | 569/750 [3:56:26<1:13:12, 24.27s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▌  | 570/750 [3:56:52<1:14:08, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▌  | 571/750 [3:57:17<1:14:09, 24.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▋  | 572/750 [3:57:39<1:10:38, 23.81s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▋  | 573/750 [3:58:03<1:10:55, 24.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 574/750 [3:58:28<1:11:14, 24.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 575/750 [3:58:52<1:10:41, 24.24s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 576/750 [3:59:17<1:10:46, 24.40s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 577/750 [3:59:42<1:10:54, 24.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 578/750 [4:00:09<1:12:41, 25.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 579/750 [4:00:36<1:13:30, 25.79s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 580/750 [4:01:02<1:13:08, 25.82s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  77%|███████▋  | 581/750 [4:01:26<1:11:36, 25.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 582/750 [4:01:51<1:10:52, 25.31s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 583/750 [4:02:17<1:10:26, 25.31s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 584/750 [4:02:40<1:08:22, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 585/750 [4:03:05<1:07:58, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 586/750 [4:03:29<1:07:33, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 587/750 [4:03:53<1:06:16, 24.40s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 588/750 [4:04:15<1:03:58, 23.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▊  | 589/750 [4:04:37<1:02:19, 23.22s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▊  | 590/750 [4:05:05<1:05:28, 24.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▉  | 591/750 [4:05:29<1:04:20, 24.28s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▉  | 592/750 [4:05:52<1:02:57, 23.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▉  | 593/750 [4:06:18<1:04:11, 24.53s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▉  | 594/750 [4:06:41<1:02:51, 24.18s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▉  | 595/750 [4:07:07<1:04:05, 24.81s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  79%|███████▉  | 596/750 [4:07:33<1:04:04, 24.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  80%|███████▉  | 597/750 [4:07:58<1:03:41, 24.97s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  80%|███████▉  | 598/750 [4:08:24<1:04:40, 25.53s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  80%|███████▉  | 599/750 [4:08:52<1:05:34, 26.06s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  80%|████████  | 600/750 [4:09:17<1:04:39, 25.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  80%|████████  | 601/750 [4:09:42<1:03:34, 25.60s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  80%|████████  | 602/750 [4:10:07<1:02:15, 25.24s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  80%|████████  | 603/750 [4:10:34<1:03:35, 25.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████  | 604/750 [4:10:59<1:02:31, 25.70s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████  | 605/750 [4:11:25<1:02:06, 25.70s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████  | 606/750 [4:11:50<1:01:15, 25.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████  | 607/750 [4:12:15<1:00:05, 25.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████  | 608/750 [4:12:38<58:40, 24.80s/it]  

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████  | 609/750 [4:13:04<59:00, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████▏ | 610/750 [4:13:27<57:02, 24.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  81%|████████▏ | 611/750 [4:13:51<56:01, 24.19s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 612/750 [4:14:15<56:02, 24.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 613/750 [4:14:42<57:28, 25.17s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 614/750 [4:15:08<57:12, 25.24s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 615/750 [4:15:33<56:23, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 616/750 [4:15:59<57:13, 25.62s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 617/750 [4:16:26<57:11, 25.80s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 618/750 [4:16:51<56:18, 25.60s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 619/750 [4:17:14<54:11, 24.82s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 620/750 [4:17:38<53:04, 24.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 621/750 [4:18:03<53:04, 24.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 622/750 [4:18:26<51:35, 24.19s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 623/750 [4:18:50<51:19, 24.25s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 624/750 [4:19:14<50:38, 24.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 625/750 [4:19:40<51:21, 24.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  83%|████████▎ | 626/750 [4:20:05<51:26, 24.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▎ | 627/750 [4:20:30<50:53, 24.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▎ | 628/750 [4:20:54<49:47, 24.49s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▍ | 629/750 [4:21:20<50:38, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▍ | 630/750 [4:21:42<47:56, 23.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▍ | 631/750 [4:22:09<49:26, 24.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▍ | 632/750 [4:22:32<48:21, 24.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▍ | 633/750 [4:22:59<48:59, 25.12s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▍ | 634/750 [4:23:29<51:15, 26.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▍ | 635/750 [4:23:54<50:05, 26.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▍ | 636/750 [4:24:20<49:28, 26.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▍ | 637/750 [4:24:44<48:10, 25.58s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▌ | 638/750 [4:25:07<46:05, 24.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▌ | 639/750 [4:25:33<46:32, 25.16s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▌ | 640/750 [4:25:59<46:22, 25.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▌ | 641/750 [4:26:22<45:03, 24.80s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▌ | 642/750 [4:26:48<45:07, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▌ | 643/750 [4:27:13<44:30, 24.96s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▌ | 644/750 [4:27:36<42:59, 24.33s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▌ | 645/750 [4:28:01<43:07, 24.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▌ | 646/750 [4:28:25<42:15, 24.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▋ | 647/750 [4:28:50<42:08, 24.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▋ | 648/750 [4:29:16<42:25, 24.96s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 649/750 [4:29:39<41:18, 24.54s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 650/750 [4:30:03<40:45, 24.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 651/750 [4:30:26<39:33, 23.98s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 652/750 [4:30:49<38:35, 23.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 653/750 [4:31:12<37:39, 23.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 654/750 [4:31:37<38:24, 24.00s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 655/750 [4:32:04<39:08, 24.72s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  87%|████████▋ | 656/750 [4:32:30<39:44, 25.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  88%|████████▊ | 657/750 [4:32:55<38:52, 25.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  88%|████████▊ | 658/750 [4:33:20<38:32, 25.14s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  88%|████████▊ | 659/750 [4:33:47<38:44, 25.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  88%|████████▊ | 660/750 [4:34:09<36:57, 24.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  88%|████████▊ | 661/750 [4:34:34<36:30, 24.61s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  88%|████████▊ | 662/750 [4:35:01<37:20, 25.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  88%|████████▊ | 663/750 [4:35:27<37:12, 25.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▊ | 664/750 [4:35:50<35:40, 24.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▊ | 665/750 [4:36:16<35:32, 25.08s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▉ | 666/750 [4:36:46<37:05, 26.50s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▉ | 667/750 [4:37:09<35:20, 25.55s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▉ | 668/750 [4:37:33<34:19, 25.11s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▉ | 669/750 [4:37:56<33:07, 24.54s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▉ | 670/750 [4:38:22<33:09, 24.87s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▉ | 671/750 [4:38:46<32:14, 24.48s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  90%|████████▉ | 672/750 [4:39:09<31:22, 24.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  90%|████████▉ | 673/750 [4:39:35<31:54, 24.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  90%|████████▉ | 674/750 [4:40:00<31:13, 24.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  90%|█████████ | 675/750 [4:40:26<31:20, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  90%|█████████ | 676/750 [4:40:51<30:51, 25.02s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  90%|█████████ | 677/750 [4:41:15<30:09, 24.79s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  90%|█████████ | 678/750 [4:41:39<29:26, 24.54s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████ | 679/750 [4:42:03<28:46, 24.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████ | 680/750 [4:42:27<28:22, 24.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████ | 681/750 [4:42:53<28:35, 24.86s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████ | 682/750 [4:43:20<28:43, 25.35s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████ | 683/750 [4:43:41<27:09, 24.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████ | 684/750 [4:44:07<27:09, 24.69s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████▏| 685/750 [4:44:31<26:28, 24.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████▏| 686/750 [4:44:57<26:40, 25.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 687/750 [4:45:24<26:54, 25.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 688/750 [4:45:50<26:31, 25.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 689/750 [4:46:15<25:57, 25.54s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 690/750 [4:46:39<25:01, 25.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 691/750 [4:47:04<24:37, 25.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 692/750 [4:47:28<23:50, 24.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 693/750 [4:47:54<23:44, 25.00s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 694/750 [4:48:20<23:40, 25.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 695/750 [4:48:46<23:25, 25.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 696/750 [4:49:11<22:53, 25.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 697/750 [4:49:37<22:28, 25.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 698/750 [4:50:04<22:34, 26.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 699/750 [4:50:29<21:44, 25.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 700/750 [4:50:52<20:41, 24.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  93%|█████████▎| 701/750 [4:51:17<20:29, 25.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  94%|█████████▎| 702/750 [4:51:43<20:14, 25.31s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  94%|█████████▎| 703/750 [4:52:07<19:33, 24.96s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  94%|█████████▍| 704/750 [4:52:29<18:25, 24.03s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  94%|█████████▍| 705/750 [4:52:55<18:28, 24.64s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  94%|█████████▍| 706/750 [4:53:19<17:50, 24.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  94%|█████████▍| 707/750 [4:53:45<17:50, 24.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  94%|█████████▍| 708/750 [4:54:11<17:38, 25.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▍| 709/750 [4:54:36<17:06, 25.04s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▍| 710/750 [4:55:00<16:38, 24.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▍| 711/750 [4:55:28<16:38, 25.60s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  95%|█████████▍| 712/750 [4:55:54<16:27, 25.99s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▌| 713/750 [4:56:20<15:52, 25.75s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▌| 714/750 [4:56:43<15:04, 25.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▌| 715/750 [4:57:09<14:46, 25.33s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▌| 716/750 [4:57:33<14:04, 24.83s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}, 'request_id': None}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/main.py", line 3085, in invoke
    for chunk in self.stream(
                 ^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/main.py", line 2674, in stream
    for _ in runner.tick(
             ^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/_runner.py", line 162, in tick
    run_wit

Evaluating LangGraph in Batches:  96%|█████████▌| 717/750 [4:57:53<12:50, 23.35s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}, 'request_id': None}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/main.py", line 3085, in invoke
    for chunk in self.stream(
                 ^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/main.py", line 2674, in stream
    for _ in runner.tick(
             ^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/_runner.py", line 162, in tick
    run_wit

Evaluating LangGraph in Batches:  96%|█████████▌| 718/750 [4:58:05<10:38, 19.96s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}, 'request_id': None}
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 30, in <module>
    result = app.invoke(state)
             ^^^^^^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/main.py", line 3085, in invoke
    for chunk in self.stream(
                 ^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/main.py", line 2674, in stream
    for _ in runner.tick(
             ^^^^^^^^^^^^
  File "/Users/nhannguyen/Documents/HCMUS/LLM/Code/llm-resume-evaluation/.env/lib/python3.12/site-packages/langgraph/pregel/_runner.py", line 162, in tick
    run_wit

Evaluating LangGraph in Batches:  96%|█████████▌| 719/750 [4:58:06<07:27, 14.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  96%|█████████▌| 720/750 [4:58:33<08:59, 17.99s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  96%|█████████▌| 721/750 [4:58:57<09:38, 19.95s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  96%|█████████▋| 722/750 [4:59:22<10:00, 21.46s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  96%|█████████▋| 723/750 [4:59:44<09:44, 21.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 724/750 [5:00:10<09:56, 22.93s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 725/750 [5:00:37<10:06, 24.25s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 726/750 [5:01:00<09:33, 23.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 727/750 [5:01:24<09:09, 23.89s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 728/750 [5:01:49<08:50, 24.13s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 729/750 [5:02:14<08:30, 24.29s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 730/750 [5:02:38<08:08, 24.42s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  97%|█████████▋| 731/750 [5:03:04<07:49, 24.73s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  98%|█████████▊| 732/750 [5:03:30<07:31, 25.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  98%|█████████▊| 733/750 [5:03:56<07:12, 25.44s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  98%|█████████▊| 734/750 [5:04:22<06:50, 25.65s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  98%|█████████▊| 735/750 [5:04:45<06:11, 24.79s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  98%|█████████▊| 736/750 [5:05:10<05:48, 24.92s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  98%|█████████▊| 737/750 [5:05:34<05:20, 24.68s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  98%|█████████▊| 738/750 [5:06:00<05:00, 25.07s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▊| 739/750 [5:06:28<04:44, 25.87s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▊| 740/750 [5:06:56<04:23, 26.37s/it]

[VALIDATION_INPUT] Validating input...
An error occurred during graph execution: 0
Traceback (most recent call last):
  File "/var/folders/pk/36_zplxd57l142tqyrqqmmkh0000gp/T/ipykernel_34509/3647782116.py", line 32, in <module>
    result["results"][j].get("score") for j in range(len(result["results"]))
    ~~~~~~~~~~~~~~~~~^^^
KeyError: 0



Evaluating LangGraph in Batches:  99%|█████████▉| 741/750 [5:07:21<03:55, 26.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▉| 742/750 [5:07:45<03:22, 25.30s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▉| 743/750 [5:08:09<02:54, 24.92s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▉| 744/750 [5:08:32<02:26, 24.43s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▉| 745/750 [5:08:57<02:03, 24.79s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▉| 746/750 [5:09:22<01:38, 24.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches: 100%|█████████▉| 747/750 [5:09:46<01:13, 24.63s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches: 100%|█████████▉| 748/750 [5:10:12<00:49, 24.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches: 100%|█████████▉| 749/750 [5:10:38<00:25, 25.15s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches: 100%|██████████| 750/750 [5:11:01<00:00, 24.88s/it]

--- Evaluation Complete ---

--- LangGraph Evaluation Results ---
📊 Mean Absolute Error (MAE): 1.2687
---
🎯 Accuracy@1 (error <= 1.0): 72.97%
🎯 Accuracy@2 (error <= 2.0): 92.43%
🎯 Accuracy@3 (error <= 3.0): 96.17%
--------------------------------------





In [8]:
import json

with open("preds/preds_ee_anthropic_kaggle.json", "w") as f:
    json.dump(pred_scores, f)

with open("preds/golds_kaggle.json", "w") as f:
    json.dump(gold_scores, f)

In [12]:
import json

with open("preds/preds_ee_anthropic_kaggle.json", "r") as f:
    pred_scores = json.load(f)
with open("preds/golds_kaggle.json", "r") as f:
    gold_scores = json.load(f)

# 1. Mean Absolute Error (MAE)
final_mae = mean_absolute_error(gold_scores, pred_scores)

# 2. Accuracy@threshold
acc_at_1 = accuracy_at_threshold(gold_scores, pred_scores, threshold=1)
acc_at_2 = accuracy_at_threshold(gold_scores, pred_scores, threshold=2)
acc_at_3 = accuracy_at_threshold(gold_scores, pred_scores, threshold=3)


# --- Print Results ---

print("\n--- LangGraph Evaluation Results ---")
print(f"📊 Mean Absolute Error (MAE): {final_mae:.4f}")
print("---")
print(f"🎯 Accuracy@1 (error <= 1.0): {acc_at_1:.2%}")
print(f"🎯 Accuracy@2 (error <= 2.0): {acc_at_2:.2%}")
print(f"🎯 Accuracy@3 (error <= 3.0): {acc_at_3:.2%}")
print("--------------------------------------")


--- LangGraph Evaluation Results ---
📊 Mean Absolute Error (MAE): 1.2687
---
🎯 Accuracy@1 (error <= 1.0): 72.97%
🎯 Accuracy@2 (error <= 2.0): 92.43%
🎯 Accuracy@3 (error <= 3.0): 96.17%
--------------------------------------


## Collected dataset

In [9]:
import json
import time

from tqdm import tqdm

collected_folder = "/Users/nhannguyen/ngtuthanhan@gmail.com - Google Drive/My Drive/HCMUS/LLM/Data_Collector_30_8"
matching_json = os.path.join(collected_folder, "Matching_Result.json")

with open(matching_json, "r") as f:
    matching_data = json.load(f)

pred_scores = []
gold_scores = []

batch_size = 1
for i in tqdm(
    range(0, len(matching_data), batch_size),
    desc="Evaluating LangGraph in Batches",
    total=len(matching_data) // batch_size,
):
    batch_examples = matching_data[i : i + batch_size]
    batch_gold_scores = [example["Score"] for example in batch_examples]
    batch_jd = []
    batch_resume = []
    for example in batch_examples:
        jd_path = example["JD"]
        resume_path = example["CV"]
        gold_score = example["Score"]
        with open(os.path.join(collected_folder, "JD", jd_path) + ".txt", "r") as f:
            jd = f.read()
        with open(os.path.join(collected_folder, "CV", resume_path) + ".txt", "r") as f:
            resume = f.read()
        batch_jd.append(jd)
        batch_resume.append(resume)

    state = State(
        inputs=[
            {
                "job_description": jd,
                "resume": resume,
            }
            for jd, resume in zip(batch_jd, batch_resume)
        ]
    )
    try:
        result = app.invoke(state)
        pred_score = result["results"][0].get("score")
    except Exception as e:
        print(f"An error occurred during graph execution: {e}")
        pred_score = 0.0  # Assign a penalty score

    gold_scores.append(gold_score)
    pred_scores.append(pred_score)

    # To avoid rate limiting
    time.sleep(1)


print("--- Evaluation Complete ---")

# --- Calculate Metrics ---

# 1. Mean Absolute Error (MAE)
final_mae = mean_absolute_error(gold_scores, pred_scores)

# 2. Accuracy@threshold
acc_at_1 = accuracy_at_threshold(gold_scores, pred_scores, threshold=1)
acc_at_2 = accuracy_at_threshold(gold_scores, pred_scores, threshold=2)
acc_at_3 = accuracy_at_threshold(gold_scores, pred_scores, threshold=3)


# --- Print Results ---

print("\n--- LangGraph Evaluation Results ---")
print(f"📊 Mean Absolute Error (MAE): {final_mae:.4f}")
print("---")
print(f"🎯 Accuracy@1 (error <= 1.0): {acc_at_1:.2%}")
print(f"🎯 Accuracy@2 (error <= 2.0): {acc_at_2:.2%}")
print(f"🎯 Accuracy@3 (error <= 3.0): {acc_at_3:.2%}")
print("--------------------------------------")

Evaluating LangGraph in Batches:   0%|          | 0/87 [00:00<?, ?it/s]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   1%|          | 1/87 [00:26<37:54, 26.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   2%|▏         | 2/87 [00:55<39:27, 27.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   3%|▎         | 3/87 [01:18<35:53, 25.63s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:   5%|▍         | 4/87 [01:47<37:10, 26.88s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   6%|▌         | 5/87 [02:13<36:35, 26.78s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:   7%|▋         | 6/87 [02:41<36:44, 27.21s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   8%|▊         | 7/87 [03:11<37:28, 28.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:   9%|▉         | 8/87 [03:40<37:09, 28.23s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  10%|█         | 9/87 [04:06<36:00, 27.69s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  11%|█▏        | 10/87 [04:36<36:21, 28.33s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  13%|█▎        | 11/87 [05:09<37:38, 29.71s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  14%|█▍        | 12/87 [05:36<36:14, 28.99s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  15%|█▍        | 13/87 [06:13<38:37, 31.32s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  16%|█▌        | 14/87 [06:42<37:12, 30.58s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  17%|█▋        | 15/87 [07:16<38:01, 31.69s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  18%|█▊        | 16/87 [07:45<36:40, 30.99s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  20%|█▉        | 17/87 [08:16<36:09, 30.99s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  21%|██        | 18/87 [08:52<37:15, 32.40s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  22%|██▏       | 19/87 [09:18<34:23, 30.35s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  23%|██▎       | 20/87 [09:48<33:49, 30.29s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  24%|██▍       | 21/87 [10:13<31:33, 28.69s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  25%|██▌       | 22/87 [10:47<32:46, 30.25s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  26%|██▋       | 23/87 [11:14<31:18, 29.36s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  28%|██▊       | 24/87 [11:47<31:56, 30.43s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  29%|██▊       | 25/87 [12:14<30:36, 29.63s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  30%|██▉       | 26/87 [12:43<29:52, 29.39s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  31%|███       | 27/87 [13:15<30:00, 30.01s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  32%|███▏      | 28/87 [13:37<27:05, 27.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  33%|███▎      | 29/87 [14:04<26:27, 27.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  34%|███▍      | 30/87 [14:25<24:26, 25.74s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  36%|███▌      | 31/87 [14:53<24:29, 26.24s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  37%|███▋      | 32/87 [15:18<23:37, 25.77s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  38%|███▊      | 33/87 [15:43<23:00, 25.56s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  39%|███▉      | 34/87 [16:08<22:31, 25.51s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  40%|████      | 35/87 [16:33<21:52, 25.25s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  41%|████▏     | 36/87 [16:58<21:32, 25.35s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  43%|████▎     | 37/87 [17:26<21:39, 25.99s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  44%|████▎     | 38/87 [17:58<22:43, 27.82s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  45%|████▍     | 39/87 [18:25<22:02, 27.56s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  46%|████▌     | 40/87 [19:00<23:29, 30.00s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  47%|████▋     | 41/87 [19:30<22:47, 29.73s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  48%|████▊     | 42/87 [20:04<23:22, 31.17s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  49%|████▉     | 43/87 [20:36<22:55, 31.26s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  51%|█████     | 44/87 [21:06<22:07, 30.87s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  52%|█████▏    | 45/87 [21:40<22:19, 31.90s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  53%|█████▎    | 46/87 [22:00<19:21, 28.32s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  54%|█████▍    | 47/87 [22:26<18:23, 27.59s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  55%|█████▌    | 48/87 [22:48<16:48, 25.86s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  56%|█████▋    | 49/87 [23:13<16:21, 25.83s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  57%|█████▋    | 50/87 [23:38<15:46, 25.58s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  59%|█████▊    | 51/87 [24:03<15:12, 25.36s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  60%|█████▉    | 52/87 [24:26<14:18, 24.51s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  61%|██████    | 53/87 [24:49<13:38, 24.08s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  62%|██████▏   | 54/87 [25:12<13:10, 23.97s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  63%|██████▎   | 55/87 [25:39<13:14, 24.84s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  64%|██████▍   | 56/87 [26:07<13:20, 25.82s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  66%|██████▌   | 57/87 [26:33<12:51, 25.73s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  67%|██████▋   | 58/87 [27:02<12:58, 26.85s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  68%|██████▊   | 59/87 [27:29<12:29, 26.78s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  69%|██████▉   | 60/87 [27:59<12:28, 27.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  70%|███████   | 61/87 [28:25<11:44, 27.10s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  71%|███████▏  | 62/87 [28:50<11:07, 26.70s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  72%|███████▏  | 63/87 [29:16<10:36, 26.52s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  74%|███████▎  | 64/87 [29:36<09:20, 24.37s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  75%|███████▍  | 65/87 [30:00<08:53, 24.25s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  76%|███████▌  | 66/87 [30:20<08:02, 22.97s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  77%|███████▋  | 67/87 [30:44<07:49, 23.45s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  78%|███████▊  | 68/87 [31:06<07:16, 22.96s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  79%|███████▉  | 69/87 [31:30<07:00, 23.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  80%|████████  | 70/87 [31:54<06:36, 23.34s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  82%|████████▏ | 71/87 [32:16<06:06, 22.92s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  83%|████████▎ | 72/87 [32:40<05:51, 23.41s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  84%|████████▍ | 73/87 [33:01<05:17, 22.66s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  85%|████████▌ | 74/87 [33:28<05:09, 23.78s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  86%|████████▌ | 75/87 [33:48<04:33, 22.75s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  87%|████████▋ | 76/87 [34:14<04:20, 23.71s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  89%|████████▊ | 77/87 [34:37<03:55, 23.56s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  90%|████████▉ | 78/87 [35:02<03:35, 23.99s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  91%|█████████ | 79/87 [35:27<03:15, 24.38s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  92%|█████████▏| 80/87 [35:50<02:48, 24.02s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  93%|█████████▎| 81/87 [36:17<02:27, 24.64s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  94%|█████████▍| 82/87 [36:42<02:04, 24.91s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  95%|█████████▌| 83/87 [37:10<01:43, 25.85s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  97%|█████████▋| 84/87 [37:38<01:19, 26.42s/it]

[VALIDATION_INPUT] Validating input...
Pass id 0
An error occurred during graph execution: 0


Evaluating LangGraph in Batches:  98%|█████████▊| 85/87 [38:09<00:55, 27.78s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches:  99%|█████████▉| 86/87 [38:40<00:28, 28.76s/it]

[VALIDATION_INPUT] Validating input...


Evaluating LangGraph in Batches: 100%|██████████| 87/87 [39:04<00:00, 26.95s/it]

--- Evaluation Complete ---

--- LangGraph Evaluation Results ---
📊 Mean Absolute Error (MAE): 4.3563
---
🎯 Accuracy@1 (error <= 1.0): 24.14%
🎯 Accuracy@2 (error <= 2.0): 36.78%
🎯 Accuracy@3 (error <= 3.0): 43.68%
--------------------------------------





In [10]:
import json

with open("preds/preds_ee_anthropic_collected.json", "w") as f:
    json.dump(pred_scores, f)

In [11]:
import json

with open("preds/golds_collected.json", "w") as f:
    json.dump(gold_scores, f)