In [30]:
import json
import re


def normalize_text(s):
    """Return a lower-cased, punctuation-free, whitespace-tidied copy of ``s``.

    Args:
        s: The raw answer text. Anything that is not a ``str`` (``None``,
            numbers, ...) is treated as missing.

    Returns:
        A string containing only ``a-z``, ``0-9`` and single spaces, with no
        leading or trailing whitespace. Returns ``""`` for non-string input.
    """
    if not isinstance(s, str):
        return ""
    # Remove punctuation first and tidy whitespace afterwards. Stripping
    # before the substitution leaves stray spaces behind when the text starts
    # or ends with punctuation (e.g. "- Yes" -> " yes"), which breaks
    # prefix checks on the result.
    cleaned = re.sub(r"[^a-z0-9\s]", "", s.lower())
    # split()/join trims both ends and collapses every whitespace run
    # (including newlines and tabs) to one space, so multi-line answers and
    # gaps left by deleted punctuation still match space-separated phrases.
    return " ".join(cleaned.split())


def normalize_gt(gt):
    """Map a ground-truth answer onto ``"yes"``, ``"no"``, ``"uncertain"`` or ``"other"``.

    Args:
        gt: The raw ground-truth answer. Non-string values are treated as
            empty text.

    Returns:
        ``"yes"`` / ``"no"`` when the first word of the answer is exactly that
        word; ``"uncertain"`` when the normalised text contains "cannot",
        "not provided" or "not enough information"; otherwise ``"other"``.
    """
    gt_n = normalize_text(gt)

    # Decide yes/no from the first *whole word* of the raw answer. A plain
    # prefix test such as startswith("no") also fires on "not provided",
    # "not enough information", "normal", "none", ... which made the
    # "uncertain" branch below unreachable for two of its three phrases.
    # The raw text is used (leading punctuation skipped) so that answers like
    # "No.The lungs are clear" still count as "no" even though punctuation
    # removal would glue the first two words together.
    lead = re.match(r"[\W_]*([a-z0-9]+)", gt.lower()) if isinstance(gt, str) else None
    first_word = lead.group(1) if lead else ""

    if first_word == "yes":
        return "yes"
    if first_word == "no":
        return "no"
    if "cannot" in gt_n or "not provided" in gt_n or "not enough information" in gt_n:
        return "uncertain"
    return "other"


def normalize_pred(pred):
    """Classify a model prediction as ``"yes"``, ``"no"`` or ``"other"``.

    The prediction is normalised with ``normalize_text`` and then checked in
    two stages; the affirmative check runs first, so it wins when both an
    affirmative and a negative cue are present.

    Args:
        pred: The raw predicted answer.

    Returns:
        ``"yes"`` if the normalised text starts with "yes" or contains
        " yes"; ``"no"`` if any negation cue matches; ``"other"`` otherwise.
    """
    text = normalize_text(pred)

    says_yes = text.startswith("yes") or " yes" in text
    if says_yes:
        return "yes"

    # Regex cues that indicate a negative answer. "doesnt" has no apostrophe
    # because the pattern r"[^a-z0-9\s]" in normalize_text removes it.
    negation_cues = (
        r"\bno\b",
        r"does not",
        r"doesnt",
        r"not show",
        r"no evidence",
        r"no signs",
        r"absent",
    )
    for cue in negation_cues:
        if re.search(cue, text):
            return "no"

    return "other"



# Load the raw predictions, attach normalised GT / prediction labels to every
# record, and write the augmented records to a new JSON file.
input_path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/iuxray_vqa_advance_predictions.json"

# The encoding is pinned to UTF-8 so that reading free-text answers does not
# depend on the platform's locale default.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

print("Loaded:", len(data), "records")


# Each output record is a shallow copy of the input record plus two extra
# keys; the input records themselves are left unmodified.
output_data = [
    {
        **item,
        "gt_norm": normalize_gt(item.get("gt_answer", "")),
        "pred_norm": normalize_pred(item.get("pred_answer", "")),
    }
    for item in data
]


output_path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/iu_vqa_predictions_normalized.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2)



Loaded: 2573 records


In [31]:
import pandas as pd
# Read the normalised predictions back from disk and keep only the rows
# where both the ground truth and the prediction resolved to yes/no.
path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/iu_vqa_predictions_normalized.json"

with open(path, "r") as f:
    data = json.load(f)

print("Loaded:", len(data), "records")

df = pd.DataFrame(data)

binary_labels = ["yes", "no"]
gt_is_binary = df["gt_norm"].isin(binary_labels)
pred_is_binary = df["pred_norm"].isin(binary_labels)
valid = df[gt_is_binary & pred_is_binary]

print("Valid yes/no samples:", len(valid))

# Persist the binary subset as a list of JSON records.
output_path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/valid.json"

valid.to_json(output_path, orient="records", indent=2)

print("Saved valid samples to:", output_path)


Loaded: 2573 records
Valid yes/no samples: 1836
Saved valid samples to: /Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/valid.json


In [32]:
# Collect every record whose normalised GT or prediction is not yes/no and
# save it, without the two normalised-label keys, to invalid.json.
input_path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/iu_vqa_predictions_normalized.json"

with open(input_path, "r") as f:
    data = json.load(f)

print("Loaded:", len(data), "records")

valid_norm_labels = {"yes", "no"}
dropped_keys = ("gt_norm", "pred_norm")

# A record is kept here unless *both* labels are binary; the normalised
# labels are removed from the saved copy.
invalid = [
    {key: value for key, value in record.items() if key not in dropped_keys}
    for record in data
    if not (
        record.get("gt_norm", "") in valid_norm_labels
        and record.get("pred_norm", "") in valid_norm_labels
    )
]

print("Invalid samples:", len(invalid))


output_path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/invalid.json"

with open(output_path, "w") as f:
    json.dump(invalid, f, indent=2)


Loaded: 2573 records
Invalid samples: 737


In [33]:
# Merge the yes/no-valid records with the records from trans.json into a
# single result file.
#
# This cell loads valid.json (the binary yes/no subset saved by an earlier
# cell), so the variables and the log line are named "valid". Calling them
# "invalid" mislabelled the printed count and overwrote the real `invalid`
# list built in the previous cell.
path_valid = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/valid.json"
with open(path_valid, "r", encoding="utf-8") as f:
    valid_records = json.load(f)

print("Valid records:", len(valid_records))

# NOTE(review): trans.json is not produced anywhere in this notebook;
# presumably it holds the invalid samples after manual re-labelling with
# gt_norm / pred_norm filled in -- confirm. UTF-8 is pinned because an
# externally edited file may contain non-ASCII text.
path_trans = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/trans.json"
with open(path_trans, "r", encoding="utf-8") as f:
    trans = json.load(f)

print("Trans records:", len(trans))


combined = valid_records + trans
print("Total combined records:", len(combined))


output_path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/full_result.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=2)

print("Saved combined file to:", output_path)


Invalid records: 1836
Trans records: 737
Total combined records: 2573
Saved combined file to: /Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/full_result.json


In [35]:
import json
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd


# Score the combined result file: confusion matrix, per-class report and
# overall accuracy over the rows whose labels are both yes/no.
path = "/Users/ducanh0503/Desktop/capstone/eval_result/vqa-vu/iu/full_result.json"

with open(path, "r") as f:
    data = json.load(f)

print("Loaded:", len(data), "records")


df = pd.DataFrame(data)

# Both normalised-label columns must be present before scoring.
assert {"gt_norm", "pred_norm"} <= set(df.columns)


# The same list fixes the row/column order of the confusion matrix and the
# row order of the classification report.
label_order = ["yes", "no"]
binary_mask = df["gt_norm"].isin(label_order) & df["pred_norm"].isin(label_order)
valid = df[binary_mask]

print("Valid yes/no samples:", len(valid))

y_true = valid["gt_norm"].values
y_pred = valid["pred_norm"].values

cm = confusion_matrix(y_true, y_pred, labels=label_order)

print("\n===== CONFUSION MATRIX =====")
print(cm)

report = classification_report(y_true, y_pred, labels=label_order)
print("\n===== CLASSIFICATION REPORT =====")
print(report)

# Fraction of rows where the prediction equals the ground truth.
accuracy = valid["gt_norm"].eq(valid["pred_norm"]).mean()
print("Accuracy:", round(accuracy * 100, 2), "%")


Loaded: 2573 records
Valid yes/no samples: 2573

===== CONFUSION MATRIX =====
[[ 427  248]
 [ 167 1731]]

===== CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

         yes       0.72      0.63      0.67       675
          no       0.87      0.91      0.89      1898

    accuracy                           0.84      2573
   macro avg       0.80      0.77      0.78      2573
weighted avg       0.83      0.84      0.84      2573

Accuracy: 83.87 %
