In [14]:
import csv
import json
from collections import defaultdict

# Input locations for the VIP-institution evaluation.
vip_csv = "dashboard_institutions2023_2025-01-21.csv"
unfiltered_result_from_llm = "final_affiliations_2000_parallel.json"
filtered_cascaded_result = "result_combined.json"
ground_truth = "data/2311_with_ror.csv"

# Gather the ROR id of every VIP institution. Rows whose id is blank are
# reported on stdout and left out of the set.
vip_rors = set()
with open(vip_csv, encoding='utf-8') as vip_file:
    for vip_row in csv.DictReader(vip_file):
        cleaned_id = vip_row['ror_id'].strip()
        if not cleaned_id:
            print("no ror id for ", vip_row['name'])
            continue
        vip_rors.add(cleaned_id)

# The unfiltered LLM output is read only to learn which papers were examined.
with open(unfiltered_result_from_llm, encoding='utf-8') as llm_file:
    final_affiliations = json.load(llm_file)
eval_paper_ids = {record["File ID"] for record in final_affiliations}

# Ground truth: paper id -> set of ROR ids, restricted to examined papers
# and to RORs that belong to the VIP set.
paper_to_true_rors = defaultdict(set)
with open(ground_truth, encoding='utf-8') as truth_file:
    for truth_row in csv.DictReader(truth_file):
        truth_paper = truth_row['paper_id'].strip()
        truth_ror = truth_row['ROR ID'].strip()
        if truth_paper not in eval_paper_ids or truth_ror not in vip_rors:
            continue
        paper_to_true_rors[truth_paper].add(truth_ror)

# Predictions to evaluate. Per the original author's note, this result was
# already filtered (by Yi) to include only VIP RORs.
with open(filtered_cascaded_result, encoding='utf-8') as pred_file:
    predictions = json.load(pred_file)


# One counter dict per VIP ROR; each (paper, VIP ROR) pair increments exactly
# one of the four counters.
confusion_matrix = {ror: {"TP": 0, "FP": 0, "FN": 0, "TN": 0} for ror in vip_rors}

# Counter name keyed by (present in ground truth, present in prediction).
outcome_by_flags = {
    (True, True): "TP",
    (False, True): "FP",
    (True, False): "FN",
    (False, False): "TN",
}

for paper_id in eval_paper_ids:
    true_rors = paper_to_true_rors.get(paper_id, set())
    pred_rors = set(predictions.get(paper_id, []))  # paper may have no prediction entry

    for vip_ror in vip_rors:
        counter = outcome_by_flags[(vip_ror in true_rors, vip_ror in pred_rors)]
        confusion_matrix[vip_ror][counter] += 1

# Ratio helper used by the metric computations.
def safe_divide(num, denom):
    """Return ``num / denom``, or ``0.0`` when ``denom`` equals zero."""
    if denom == 0:
        return 0.0
    return num / denom

# Per-institution precision / recall / F1, stored next to the raw counts.
metrics_per_vip = {}
for ror, counts in confusion_matrix.items():
    hits, false_alarms, misses = counts["TP"], counts["FP"], counts["FN"]

    precision = safe_divide(hits, hits + false_alarms)
    recall = safe_divide(hits, hits + misses)
    # safe_divide already yields 0.0 when precision + recall is zero.
    f1 = safe_divide(2 * precision * recall, precision + recall)

    metrics_per_vip[ror] = dict(counts, Precision=precision, Recall=recall, F1=f1)

# Pool TP/FP/FN over all VIPs for the micro-averaged scores.
total_tp, total_fp, total_fn = [
    sum(entry[key] for entry in metrics_per_vip.values())
    for key in ("TP", "FP", "FN")
]

micro_precision = safe_divide(total_tp, total_tp + total_fp)
micro_recall = safe_divide(total_tp, total_tp + total_fn)
micro_f1 = safe_divide(2 * micro_precision * micro_recall, micro_precision + micro_recall)


print(f"Precision: {micro_precision:.4f}")
print(f"Recall:    {micro_recall:.4f}")
print(f"F1 Score:  {micro_f1:.4f}")





no ror id for  Austrian Science Fund
no ror id for  CSIC - Estación Experimental La Mayora (EELM)
no ror id for  CSIC-UZA - Instituto de Ciencia de Materiales de Aragon (ICMA)
no ror id for  Campus de Excelencia Internacional UAM+CSIC
no ror id for  Centre pour la Communication Scientifique Directe
no ror id for  FinElib Consortium
no ror id for  NII Japan Consortium
no ror id for  Niels Bohr Institute
no ror id for  UCLouvain
no ror id for  arXiv-DH and Helmholtz Association of German Research Centres (HGF) (DHHGF)
Precision: 0.8002
Recall:    0.7819
F1 Score:  0.7910


In [15]:
import pandas as pd
import csv

# Step 1: Build the ROR ID -> InstitutionName lookup, skipping rows with a blank rorId
with open("dashboard_institutions_2023_2024-06-28.csv", encoding='utf-8') as names_file:
    ror_to_name = {
        record["rorId"].strip(): record["InstitutionName"].strip()
        for record in csv.DictReader(names_file)
        if record["rorId"].strip()
    }

# Step 2: One row per ROR ID, with the confusion-matrix counters as columns
df = pd.DataFrame.from_dict(confusion_matrix, orient="index").rename_axis("ROR ID")

# Step 3: Attach the institution name (missing when the ROR ID is not in the lookup)
df["InstitutionName"] = df.index.map(ror_to_name)

# Turn the index into a regular column and fix the column order:
# ROR ID first, then the counters, InstitutionName last
df = df.reset_index()[["ROR ID", "TP", "FP", "FN", "TN", "InstitutionName"]]

# Step 4: Save to CSV
df.to_csv("vip_confusion_matrix.csv", index=False)


In [16]:
import pandas as pd

# Read the VIP dashboard, keeping these four columns as strings
# (isConsortium was marked optional in the original).
dashboard_path = "dashboard_institutions_2023_2024-06-28.csv"
string_columns = ("salsaId", "orgId", "sId", "isConsortium")
dashboard_df = pd.read_csv(dashboard_path, dtype={name: str for name in string_columns})

# One evaluation record per VIP ROR; a ROR absent from confusion_matrix
# falls back to all-zero counters.
count_columns = ["TP", "FP", "FN", "TN"]
eval_rows = [
    {"rorId": ror_id, **confusion_matrix.get(ror_id, dict.fromkeys(count_columns, 0))}
    for ror_id in vip_rors
]
eval_df = pd.DataFrame(eval_rows)

# Left-merge onto the dashboard by rorId; dashboard rows without a match get
# zero counters, and the counters are stored as integers.
merged_df = (
    dashboard_df
    .merge(eval_df, on="rorId", how="left")
    .fillna({column: 0 for column in count_columns})
    .astype({column: int for column in count_columns})
)

# Save to new CSV
merged_df.to_csv("dashboard_institutions_with_raw_counts.csv", index=False)

In [17]:
# Paper-level evaluation restricted to VIP RORs: pooled TP/FP/FN for the micro
# metrics, plus a per-paper Jaccard-style score (|true & pred| / |true | pred|).
# NOTE: this reassigns total_tp / total_fp / total_fn and the micro_* variables.
total_tp, total_fp, total_fn = 0, 0, 0
paper_accuracies = []

for paper_id in eval_paper_ids:
    true_rors = paper_to_true_rors.get(paper_id, set())
    pred_rors = set(predictions.get(paper_id, []))  # paper may have no prediction entry

    # Only keep VIP RORs
    true_vip = true_rors & vip_rors
    pred_vip = pred_rors & vip_rors

    tp = len(true_vip & pred_vip)
    fp = len(pred_vip - true_vip)
    fn = len(true_vip - pred_vip)

    total_tp += tp
    total_fp += fp
    total_fn += fn

    denom = len(true_vip | pred_vip)
    acc = tp / denom if denom > 0 else 1.0  # If both sets empty, consider fully accurate
    paper_accuracies.append(acc)

# Micro metrics
micro_precision = safe_divide(total_tp, total_tp + total_fp)
micro_recall = safe_divide(total_tp, total_tp + total_fn)
micro_f1 = safe_divide(2 * micro_precision * micro_recall, micro_precision + micro_recall)

# Average paper-level accuracy. Routed through safe_divide so an empty
# evaluation set reports 0.0 (like the micro metrics) instead of raising
# ZeroDivisionError.
avg_paper_acc = safe_divide(sum(paper_accuracies), len(paper_accuracies))

print("\n=== Paper-Level VIP Prediction Evaluation ===")
print(f"Total TP: {total_tp}, FP: {total_fp}, FN: {total_fn}")
print(f"Micro Precision: {micro_precision:.4f}")
print(f"Micro Recall:    {micro_recall:.4f}")
print(f"Micro F1 Score:  {micro_f1:.4f}")
print(f"Avg Paper Accuracy (Jaccard-style): {avg_paper_acc:.4f}")



=== Paper-Level VIP Prediction Evaluation ===
Total TP: 1434, FP: 358, FN: 400
Micro Precision: 0.8002
Micro Recall:    0.7819
Micro F1 Score:  0.7910
Avg Paper Accuracy (Jaccard-style): 0.8289
