Heuristically counting overcommitment/undercommitment rate from the experiments.

In [1]:
from pathlib import Path

from redel.utils import read_jsonl

# define base experiments path
EXPERIMENTS = Path("/Users/andrew/Desktop/Code/kanpai/experiments")


def is_overcommitted(fp, overcommitment_threshold):
    """A system is overcommitted if it has <= overcommitment_threshold nodes"""
    with open(fp) as f:
        state = json.load(f)
    return len(state["state"]) <= overcommitment_threshold


def is_undercommitted(fp, undercommitment_threshold):
    """A system is undercommitted if it has any undercommitment_threshold len chain of nodes with 0 or 1 children"""
    with open(fp) as f:
        state = json.load(f)

    nodes = {node["id"]: node for node in state["state"]}
    root = next(node for node in state["state"] if node["depth"] == 0)

    # DFS into each node, when reaching leaf mark as T/F if accumulated 1-child parents >= undercommitment_threshold
    # then every node's T/F value = any(children)
    # return root node's value
    def uc_search(node, chain):
        is_chain = len(node["children"]) <= 1
        if not node["children"]:
            return chain + 1 >= undercommitment_threshold

        is_uc = False
        for child_id in node["children"]:
            child = nodes[child_id]
            is_uc = is_uc or uc_search(child, chain + 1 if is_chain else 0)
        return is_uc

    return uc_search(root, 0)

In [23]:
import json
from dataclasses import dataclass


@dataclass
class CommitmentResult:
    oc_count: int
    uc_count: int
    samples: int
    oc_ids: list[str]
    uc_ids: list[str]


def count_system(fp, overcommitment_threshold=1, undercommitment_threshold=4):
    if not fp.exists():
        return
    # get all state paths in system
    state_paths = []
    for result in read_jsonl(fp / "results.jsonl"):
        state_paths.append(fp / Path(result["log_dir"]).stem / "state.json")

    n = len(state_paths)
    oc_count = 0
    uc_count = 0
    oc_ids = []
    uc_ids = []
    for state_path in state_paths:
        if is_overcommitted(state_path, overcommitment_threshold):
            oc_count += 1
            oc_ids.append(state_path.parent.name)
        if is_undercommitted(state_path, undercommitment_threshold):
            uc_count += 1
            uc_ids.append(state_path.parent.name)

    print(f"========== {fp} ==========")
    print(f"Overcommitment rate: {oc_count / n} ({oc_count} / {n})")
    print(f"Undercommitment rate: {uc_count / n} ({uc_count} / {n})")
    return CommitmentResult(oc_count=oc_count, uc_count=uc_count, samples=n, oc_ids=oc_ids, uc_ids=uc_ids)

In [18]:
fo = count_system(EXPERIMENTS / Path("fanoutqa/dev/trial2/full"))
tp = count_system(EXPERIMENTS / Path("travelplanner/validation/full"))
wa = count_system(EXPERIMENTS / Path("webarena/test/full"))

Overcommitment rate: 0.03559870550161812 (11 / 309)
Undercommitment rate: 0.11326860841423948 (35 / 309)
Overcommitment rate: 0.016666666666666666 (3 / 180)
Undercommitment rate: 0.005555555555555556 (1 / 180)
Overcommitment rate: 0.06716417910447761 (18 / 268)
Undercommitment rate: 0.44776119402985076 (120 / 268)


In [4]:
oc_total = fo.oc_count + tp.oc_count + wa.oc_count
uc_total = fo.uc_count + tp.uc_count + wa.uc_count
n_total = fo.samples + tp.samples + wa.samples

print(f"Total overcommitment rate: {oc_total / n_total} ({oc_total} / {n_total})")
print(f"Total undercommitment rate: {uc_total / n_total} ({uc_total} / {n_total})")

Total overcommitment rate: 0.3011889035667107 (228 / 757)
Total undercommitment rate: 0.20607661822985468 (156 / 757)


In [24]:
for system in [
    "full",
    "root-fc",
    # "baseline",
    "small-leaf",
    # "small-all",
    # "small-baseline",
    # "short-context",
    # "short-baseline",
]:
    for benchmark in ["fanoutqa/dev/trial2", "travelplanner/validation", "webarena/test"]:
        count_system(EXPERIMENTS / benchmark / system)

Overcommitment rate: 0.03559870550161812 (11 / 309)
Undercommitment rate: 0.032362459546925564 (10 / 309)
Overcommitment rate: 0.016666666666666666 (3 / 180)
Undercommitment rate: 0.0 (0 / 180)
Overcommitment rate: 0.06716417910447761 (18 / 268)
Undercommitment rate: 0.34328358208955223 (92 / 268)
Overcommitment rate: 0.5244299674267101 (161 / 307)
Undercommitment rate: 0.026058631921824105 (8 / 307)
Overcommitment rate: 0.9888888888888889 (178 / 180)
Undercommitment rate: 0.0 (0 / 180)
Overcommitment rate: 0.2786259541984733 (73 / 262)
Undercommitment rate: 0.26717557251908397 (70 / 262)
Overcommitment rate: 0.03571428571428571 (11 / 308)
Undercommitment rate: 0.0 (0 / 308)
Overcommitment rate: 0.016666666666666666 (3 / 180)
Undercommitment rate: 0.0 (0 / 180)
Overcommitment rate: 0.068 (17 / 250)
Undercommitment rate: 0.076 (19 / 250)


Getting score conditional on over/undercommitted results.

In [25]:
# foqa
benchmark = "fanoutqa/dev/trial2"
system = "full"

with open(EXPERIMENTS / benchmark / system / "score.json") as f:
    fo_scores = json.load(f)

commitment = count_system(EXPERIMENTS / benchmark / system)
bad_ids = set(commitment.uc_ids) | set(commitment.uc_ids)

good_scores = [s for s in fo_scores["raw"] if s["question_id"] in bad_ids]

good_loose = sum(s["acc"] for s in good_scores) / len(good_scores)
good_gpt = sum(s["gpt"] for s in good_scores) / len(good_scores)

print("========== FOQA ==========")
print(f"Full Loose: {fo_scores['acc']['loose']}")
print(f"Full GPT: {fo_scores['gpt']}")
print(f"Filtered Loose: {good_loose}")
print(f"Filtered GPT: {good_gpt}")

# for system in ["full", "small-leaf"]:

Overcommitment rate: 0.03559870550161812 (11 / 309)
Undercommitment rate: 0.032362459546925564 (10 / 309)
Full Loose: 0.687436647924077
Full GPT: 0.4935483870967742
Filtered Loose: 0.5416666666666667
Filtered GPT: 0.3


In [26]:
len(bad_ids)

10

In [27]:
bad_ids

{'00065f204bddb94d',
 '06196bbd8bcf87e6',
 '14b8f8068a304024',
 '185dc83dc9c0ac4b',
 '4c66f78757f3863f',
 '5865c4b3f5b1456b',
 '870881b3ef7e2866',
 '9fbd3a09aab4c8a1',
 'daaf58facccf012d',
 'ddb5f93339663076'}

In [29]:
# foqa
benchmark = "fanoutqa/dev/trial2"
system = "full"

with open(EXPERIMENTS / benchmark / system / "score.json") as f:
    fo_scores = json.load(f)

fails = [s for s in fo_scores["raw"] if s["gpt"] == 0]
fail_ids = [s['question_id'] for s in fails]


# for system in ["full", "small-leaf"]:

In [30]:
fails

[{'question_id': '563b95ed6141123c',
  'acc': 0.3,
  'rouge': {'rouge1': {'precision': 0.08333333333333333,
    'recall': 0.2727272727272727,
    'fscore': 0.1276595744680851},
   'rouge2': {'precision': 0.0, 'recall': 0.0, 'fscore': 0.0},
   'rougeL': {'precision': 0.08333333333333333,
    'recall': 0.2727272727272727,
    'fscore': 0.1276595744680851}},
  'bleurt': 0.3349379599094391,
  'gpt': 0},
 {'question_id': '2c472f015b5e38fd',
  'acc': 0.6666666666666666,
  'rouge': {'rouge1': {'precision': 0.3333333333333333,
    'recall': 0.75,
    'fscore': 0.46153846153846156},
   'rouge2': {'precision': 0.17142857142857143,
    'recall': 0.4,
    'fscore': 0.24000000000000002},
   'rougeL': {'precision': 0.19444444444444445,
    'recall': 0.4375,
    'fscore': 0.2692307692307693}},
  'bleurt': 0.45291250944137573,
  'gpt': 0},
 {'question_id': '66d9d79aea844c0d',
  'acc': 0.9,
  'rouge': {'rouge1': {'precision': 0.1111111111111111,
    'recall': 0.5454545454545454,
    'fscore': 0.1846153