In [1]:
from datasets import load_from_disk
# Load the previously saved results dataset from a local directory.
# "cord_v2_VLM" is a placeholder path: replace VLM with one of
# (gemma, llava, qwen2.5-vl-7b).
ds = load_from_disk("cord_v2_VLM")
# Bare expression so the notebook renders the dataset summary as cell output.
ds


Dataset({
    features: ['id', 'ground_truth', 'predicted', 'raw', 'accuracy', 'correct', 'total'],
    num_rows: 100
})

In [2]:
# Pull out the two columns that are compared field by field further down.
groundTruth, predicted = ds['ground_truth'], ds['predicted']

In [None]:
import json
import math

import pandas as pd


def flatten_dict(d, parent_key='', sep='.'):
    """Collapse a nested dict into a single-level dict keyed by path.

    Keys of nested dicts are joined to their parent with ``sep``. List
    elements get an ``[index]`` suffix on the parent key; dict elements inside
    a list are flattened recursively, while any other element (including a
    nested list) is stored unchanged under its indexed key. Empty dicts and
    empty lists contribute no entries.

    Args:
        d: Mapping to flatten.
        parent_key: Path prefix for every produced key (empty at the top level).
        sep: Separator placed between path segments.

    Returns:
        A new flat dict mapping each path to its leaf value, in traversal order.
    """
    flat = {}
    for name, value in d.items():
        path = name if not parent_key else f"{parent_key}{sep}{name}"
        if isinstance(value, dict):
            flat.update(flatten_dict(value, path, sep=sep))
        elif isinstance(value, list):
            for idx, element in enumerate(value):
                indexed = f"{path}[{idx}]"
                if isinstance(element, dict):
                    flat.update(flatten_dict(element, indexed, sep=sep))
                else:
                    flat[indexed] = element
        else:
            flat[path] = value
    return flat

def normalize(val):
    """Canonicalize a value so ground truth and prediction compare fairly.

    Args:
        val: Raw field value; may be ``None``, a string, or any other type.

    Returns:
        ``""`` for ``None``. For a string: leading/trailing whitespace is
        stripped and all commas and spaces are removed; if what remains parses
        as a finite number it is returned as a ``float``, otherwise it is
        returned lowercased. Any non-string value is returned unchanged.
    """
    if val is None:
        return ""
    if not isinstance(val, str):
        return val

    compact = val.strip().replace(",", "").replace(" ", "")
    try:
        number = float(compact)
    except ValueError:
        # Not numeric text. Only ValueError is caught so that interrupts and
        # unrelated errors are not silently swallowed.
        return compact.lower()

    # float() also accepts "nan"/"inf"/"infinity". NaN never equals itself, so
    # identical "nan" texts would be scored as a mismatch; treat every
    # non-finite parse as plain text instead.
    if not math.isfinite(number):
        return compact.lower()
    return number


# Compare each ground-truth field with the prediction field of the same key
# and collect one record per (entry, key) pair.
all_results = []

for i, (gt_json, pred_json) in enumerate(zip(groundTruth, predicted)):
    gt = flatten_dict(json.loads(gt_json))
    pred = flatten_dict(json.loads(pred_json))

    # Only ground-truth keys are scored; extra keys in the prediction are
    # ignored. Iterate the dict itself (insertion order) instead of a set:
    # set order over strings varies between interpreter runs, which made the
    # row order of the CSV non-reproducible.
    for key in gt:
        gt_val = normalize(gt[key])
        pred_val = normalize(pred.get(key))  # missing key → None → ""
        match = gt_val == pred_val
        all_results.append({
            "entry": i,
            "key": key,
            "ground_truth": gt_val,
            "predicted": pred_val,
            "match": match
        })

df = pd.DataFrame(all_results)
df.to_csv("match_VLM.csv", index=False) # replace VLM with one of (gemma, llava, qwen2.5-vl-7b)

