In [None]:
# For Shuchang's evaluation: collect the IDs of the evaluated affiliation entries.
import json

with open("final_affiliations_2000_parallel.json", mode='r', encoding='utf-8') as f:
    final_affiliations = json.loads(f.read())

# Unique "File ID" values across all entries, e.g. '2311.00001'.
target_file_ids = {entry['File ID'] for entry in final_affiliations}

In [1]:
import csv
import json
from collections import defaultdict

# Load VIP RORs: the set of non-empty `rorId` values found in the dashboard
# institutions CSV.
vip_rors = set()
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks inside quoted fields are parsed correctly.
with open("dashboard_institutions_2023_2024-06-28.csv", encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # DictReader fills fields missing from a short row with None; treat
        # that like an empty cell instead of failing on None.strip().
        # A missing `rorId` column still raises KeyError, as before.
        ror_id = (row['rorId'] or '').strip()
        if ror_id:
            vip_rors.add(ror_id)

# Read the evaluation set (2000 papers, per the original note): each entry's
# "File ID" identifies one evaluated paper.
with open("final_affiliations_2000_parallel.json", mode='r', encoding='utf-8') as f:
    final_affiliations = json.loads(f.read())
eval_paper_ids = {entry["File ID"] for entry in final_affiliations}

# Load ground-truth RORs per paper. Only rows whose paper is in the evaluation
# set AND whose ROR is a VIP ROR are kept, so papers without any VIP
# affiliation never get a key in this mapping.
paper_to_true_rors = defaultdict(set)
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks inside quoted fields are parsed correctly.
with open("data/2311_with_ror.csv", encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # DictReader fills fields missing from a short row with None; fall
        # back to '' so such rows are skipped by the membership tests below
        # instead of raising on None.strip().
        paper_id = (row['paper_id'] or '').strip()
        ror_id = (row['ROR ID'] or '').strip()
        if paper_id in eval_paper_ids and ror_id in vip_rors:
            paper_to_true_rors[paper_id].add(ror_id)

# Predicted RORs per paper; looked up by paper ID further down.
# Per the original note, this file is already filtered to VIP RORs only.
with open("vip_result_combined_v2.json", mode='r', encoding='utf-8') as f:
    predictions = json.loads(f.read())

# Collect confusion matrix values per VIP institution.
#
# For every evaluated paper, each VIP ROR falls into exactly one of:
#   TP - in the ground truth and predicted
#   FP - predicted but not in the ground truth
#   FN - in the ground truth but not predicted
#   TN - in neither
#
# The dict is built from sorted(vip_rors): iterating a set of strings directly
# gives an order that changes between interpreter runs (string hashes are
# randomized per process), which would reshuffle the printed report and any
# export built from this dict on every kernel restart.
confusion_matrix = {ror: {"TP": 0, "FP": 0, "FN": 0, "TN": 0} for ror in sorted(vip_rors)}

for paper_id in eval_paper_ids:
    # Restrict both sides to VIP RORs so every key below exists in confusion_matrix.
    true_rors = paper_to_true_rors.get(paper_id, set()) & vip_rors
    # Papers may be missing from the predictions (or map to null): treat as "no predictions".
    pred_rors = set(predictions.get(paper_id) or ()) & vip_rors

    # Only institutions mentioned for this paper need touching; all others
    # are true negatives and are accounted for after the loop.
    for vip_ror in true_rors & pred_rors:
        confusion_matrix[vip_ror]["TP"] += 1
    for vip_ror in pred_rors - true_rors:
        confusion_matrix[vip_ror]["FP"] += 1
    for vip_ror in true_rors - pred_rors:
        confusion_matrix[vip_ror]["FN"] += 1

# Each paper contributes exactly one outcome per institution, so TN is simply
# whatever remains after TP, FP and FN.
total_papers = len(eval_paper_ids)
for counts in confusion_matrix.values():
    counts["TN"] = total_papers - counts["TP"] - counts["FP"] - counts["FN"]

# Emit one summary line per institution, following the dict's iteration order.
for ror in confusion_matrix:
    stats = confusion_matrix[ror]
    print(f"{ror}: TP={stats['TP']}, FP={stats['FP']}, FN={stats['FN']}, TN={stats['TN']}")


https://ror.org/02k7v4d05: TP=3, FP=1, FN=7, TN=1989
https://ror.org/01jmqcy63: TP=0, FP=0, FN=0, TN=2000
https://ror.org/04zmssz18: TP=3, FP=2, FN=6, TN=1989
https://ror.org/04pp8hn57: TP=6, FP=2, FN=2, TN=1990
https://ror.org/05fe7ax82: TP=0, FP=0, FN=0, TN=2000
https://ror.org/01xtthb56: TP=2, FP=1, FN=2, TN=1995
https://ror.org/042aqky30: TP=5, FP=0, FN=0, TN=1995
https://ror.org/03cve4549: TP=26, FP=7, FN=4, TN=1963
https://ror.org/05gq02987: TP=3, FP=1, FN=1, TN=1995
https://ror.org/05wvpxv85: TP=2, FP=0, FN=0, TN=1998
https://ror.org/019pzjm43: TP=0, FP=0, FN=0, TN=2000
https://ror.org/00fn7gb05: TP=2, FP=0, FN=0, TN=1998
https://ror.org/00cyydd11: TP=0, FP=0, FN=0, TN=2000
https://ror.org/04mhzgx49: TP=9, FP=0, FN=2, TN=1989
https://ror.org/013bkhk48: TP=0, FP=0, FN=0, TN=2000
https://ror.org/05vt9qd57: TP=3, FP=3, FN=1, TN=1993
https://ror.org/03c0ach84: TP=0, FP=1, FN=0, TN=1999
https://ror.org/04etcj997: TP=1, FP=0, FN=0, TN=1999
https://ror.org/00wfvh315: TP=0, FP=0, FN=0, 

In [2]:
import pandas as pd

# Tabulate the counts: one row per outer key of confusion_matrix (the ROR),
# one column per inner key. The index is labelled "ROR ID" so the written CSV
# gets that header for its first column.
df = pd.DataFrame.from_dict(confusion_matrix, orient='index').rename_axis("ROR ID")
df.to_csv("vip_confusion_matrix.csv")
