In [None]:
# For Shuchang's evaluation
import json

# Read the affiliation records produced for the evaluated papers
with open("final_affiliations_2000_parallel.json", 'r', encoding='utf-8') as f:
    final_affiliations = json.load(f)

# Distinct "File ID" values of those records, e.g. '2311.00001'
target_file_ids = {entry['File ID'] for entry in final_affiliations}

In [15]:
import csv
import json
from collections import defaultdict

# VIP RORs: every non-blank, whitespace-stripped 'rorId' in the dashboard export
vip_rors = set()
with open("dashboard_institutions_2023_2024-06-28.csv", encoding='utf-8') as f:
    vip_rors.update(
        rid
        for rid in (record['rorId'].strip() for record in csv.DictReader(f))
        if rid
    )

# Evaluated paper IDs: the "File ID" of each entry in the affiliations file
with open("final_affiliations_2000_parallel.json", encoding='utf-8') as f:
    final_affiliations = json.load(f)
eval_paper_ids = {entry["File ID"] for entry in final_affiliations}

# Ground-truth RORs per paper, keeping only evaluated papers and VIP RORs
paper_to_true_rors = defaultdict(set)
with open("data/2311_with_ror.csv", encoding='utf-8') as f:
    for record in csv.DictReader(f):
        pid = record['paper_id'].strip()
        rid = record['ROR ID'].strip()
        if pid not in eval_paper_ids or rid not in vip_rors:
            continue
        paper_to_true_rors[pid].add(rid)

# Predicted RORs per paper (already filtered to only include VIP RORs)
with open("vip_result_combined_v2.json", encoding='utf-8') as f:
    predictions = json.load(f)

# Collect confusion matrix values per VIP institution.
# TP/FP/FN are tallied only for the RORs that actually occur in a paper's truth
# or prediction set; TN is derived afterwards as "all remaining papers". This
# yields the same counts as testing every (paper, institution) pair without
# scanning all VIP RORs for each paper.
confusion_matrix = {ror: {"TP": 0, "FP": 0, "FN": 0, "TN": 0} for ror in vip_rors}

for paper_id in eval_paper_ids:
    # Restrict both sides to VIP RORs: only those have a confusion-matrix entry.
    true_rors = paper_to_true_rors.get(paper_id, set()) & vip_rors
    pred_rors = set(predictions.get(paper_id, [])) & vip_rors  # may be missing

    for vip_ror in true_rors & pred_rors:
        confusion_matrix[vip_ror]["TP"] += 1
    for vip_ror in pred_rors - true_rors:
        confusion_matrix[vip_ror]["FP"] += 1
    for vip_ror in true_rors - pred_rors:
        confusion_matrix[vip_ror]["FN"] += 1

# Each iterated paper lands in exactly one of TP/FP/FN/TN for an institution,
# so TN is the number of iterated papers minus the other three counts.
n_eval_papers = len(eval_paper_ids)
for stats in confusion_matrix.values():
    stats["TN"] = n_eval_papers - stats["TP"] - stats["FP"] - stats["FN"]

# Division helper used by the metric computations below
def safe_divide(num, denom):
    """Return ``num / denom``, or ``0.0`` when ``denom`` equals zero."""
    if denom == 0:
        return 0.0
    return num / denom

# Add per-institution metrics: precision, recall and F1 from the raw counts.
metrics_per_vip = {}
for ror, stats in confusion_matrix.items():
    tp = stats["TP"]
    fp = stats["FP"]
    fn = stats["FN"]

    precision = safe_divide(tp, tp + fp)
    recall = safe_divide(tp, tp + fn)
    # safe_divide already returns 0.0 when precision + recall == 0,
    # so no extra guard is needed here.
    f1 = safe_divide(2 * precision * recall, precision + recall)

    metrics_per_vip[ror] = {
        **stats,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }

# Aggregate macro-average (mean over all institutions).
# Routed through safe_divide so an empty institution set yields 0.0, matching
# how the micro averages below behave, instead of raising ZeroDivisionError.
# NOTE(review): institutions with TP + FP + FN == 0 contribute 0.0 to every
# macro average, which pulls the macro scores down — confirm this is intended.
n_vips = len(metrics_per_vip)
macro_precision = safe_divide(sum(m["Precision"] for m in metrics_per_vip.values()), n_vips)
macro_recall = safe_divide(sum(m["Recall"] for m in metrics_per_vip.values()), n_vips)
macro_f1 = safe_divide(sum(m["F1"] for m in metrics_per_vip.values()), n_vips)

# Aggregate micro-average (total TP/FP/FN over all VIPs)
total_tp = sum(m["TP"] for m in metrics_per_vip.values())
total_fp = sum(m["FP"] for m in metrics_per_vip.values())
total_fn = sum(m["FN"] for m in metrics_per_vip.values())

micro_precision = safe_divide(total_tp, total_tp + total_fp)
micro_recall = safe_divide(total_tp, total_tp + total_fn)
micro_f1 = safe_divide(2 * micro_precision * micro_recall, micro_precision + micro_recall)

# Print macro and micro average scores
print("\n=== Macro-Averaged Metrics ===")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall:    {macro_recall:.4f}")
print(f"Macro F1 Score:  {macro_f1:.4f}")

print("\n=== Micro-Averaged Metrics ===")
print(f"Micro Precision: {micro_precision:.4f}")
print(f"Micro Recall:    {micro_recall:.4f}")
print(f"Micro F1 Score:  {micro_f1:.4f}")




=== Macro-Averaged Metrics ===
Macro Precision: 0.5242
Macro Recall:    0.5081
Macro F1 Score:  0.5004

=== Micro-Averaged Metrics ===
Micro Precision: 0.7960
Micro Recall:    0.7372
Micro F1 Score:  0.7654


In [20]:
import pandas as pd

# Load the VIP dashboard into a DataFrame.
# The listed columns are read as text so pandas does not infer another dtype.
dashboard_path = "dashboard_institutions_2023_2024-06-28.csv"
dashboard_df = pd.read_csv(dashboard_path,dtype={
        "salsaId": str,
        "orgId": str,
        "sId": str,
        "isConsortium": str,  # optional
        "rorId": str,  # merge key; kept as text so .str.strip() below applies
    })

# Prepare evaluation results as a DataFrame: one row of raw counts per VIP ROR
count_cols = ["TP", "FP", "FN", "TN"]
eval_rows = []
for ror_id in vip_rors:
    row = confusion_matrix.get(ror_id, dict.fromkeys(count_cols, 0))
    eval_rows.append({"rorId": ror_id, **row})

# Explicit columns keep the frame mergeable even when there are no VIP RORs
eval_df = pd.DataFrame(eval_rows, columns=["rorId", *count_cols])

# Merge with dashboard on rorId.
# vip_rors holds whitespace-stripped IDs, so the match is done on a stripped
# copy of the dashboard's rorId; otherwise a padded ID in the CSV would never
# match and would silently receive zero counts. The dashboard's own rorId
# column is written out exactly as it was read.
merge_key = "_rorId_stripped"
merged_df = (
    dashboard_df
    .assign(**{merge_key: dashboard_df["rorId"].str.strip()})
    .merge(eval_df.rename(columns={"rorId": merge_key}), on=merge_key, how="left")
    .drop(columns=merge_key)
)

# Fill NaNs with 0 for institutions not matched
for col in count_cols:
    merged_df[col] = merged_df[col].fillna(0).astype(int)

# Save to new CSV
merged_df.to_csv("dashboard_institutions_with_raw_counts.csv", index=False)
print("\n✅ Raw count results (TP/FP/FN/TN) written to 'dashboard_institutions_with_raw_counts.csv'")



✅ Raw count results (TP/FP/FN/TN) written to 'dashboard_institutions_with_raw_counts.csv'


In [21]:
def safe_divide(num, denom):
    """Divide ``num`` by ``denom``; a zero ``denom`` gives ``0.0`` instead of an error."""
    return 0.0 if denom == 0 else num / denom

# Add per-institution metrics: precision, recall and F1 from the raw counts.
metrics_per_vip = {}
for ror, stats in confusion_matrix.items():
    tp = stats["TP"]
    fp = stats["FP"]
    fn = stats["FN"]

    precision = safe_divide(tp, tp + fp)
    recall = safe_divide(tp, tp + fn)
    # safe_divide already returns 0.0 when precision + recall == 0,
    # so no extra guard is needed here.
    f1 = safe_divide(2 * precision * recall, precision + recall)

    metrics_per_vip[ror] = {
        **stats,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }

# Aggregate macro-average (mean over all institutions).
# Routed through safe_divide so an empty institution set yields 0.0, matching
# how the micro averages below behave, instead of raising ZeroDivisionError.
# NOTE(review): institutions with TP + FP + FN == 0 contribute 0.0 to every
# macro average, which pulls the macro scores down — confirm this is intended.
n_vips = len(metrics_per_vip)
macro_precision = safe_divide(sum(m["Precision"] for m in metrics_per_vip.values()), n_vips)
macro_recall = safe_divide(sum(m["Recall"] for m in metrics_per_vip.values()), n_vips)
macro_f1 = safe_divide(sum(m["F1"] for m in metrics_per_vip.values()), n_vips)

# Aggregate micro-average (total TP/FP/FN over all VIPs)
total_tp = sum(m["TP"] for m in metrics_per_vip.values())
total_fp = sum(m["FP"] for m in metrics_per_vip.values())
total_fn = sum(m["FN"] for m in metrics_per_vip.values())

micro_precision = safe_divide(total_tp, total_tp + total_fp)
micro_recall = safe_divide(total_tp, total_tp + total_fn)
micro_f1 = safe_divide(2 * micro_precision * micro_recall, micro_precision + micro_recall)

# Print macro and micro average scores
print("\n=== Macro-Averaged Metrics ===")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall:    {macro_recall:.4f}")
print(f"Macro F1 Score:  {macro_f1:.4f}")

print("\n=== Micro-Averaged Metrics ===")
print(f"Micro Precision: {micro_precision:.4f}")
print(f"Micro Recall:    {micro_recall:.4f}")
print(f"Micro F1 Score:  {micro_f1:.4f}")


=== Macro-Averaged Metrics ===
Macro Precision: 0.5242
Macro Recall:    0.5081
Macro F1 Score:  0.5004

=== Micro-Averaged Metrics ===
Micro Precision: 0.7960
Micro Recall:    0.7372
Micro F1 Score:  0.7654


In [2]:
import pandas as pd

# Export the confusion counts: one row per ROR ID, one column per count type
df = pd.DataFrame.from_dict(data=confusion_matrix, orient='index')
df.index.names = ["ROR ID"]
df.to_csv("vip_confusion_matrix.csv")


In [10]:
# Paper-level evaluation: pool TP/FP/FN over all evaluated papers and compute a
# per-paper Jaccard-style accuracy |true ∩ pred| / |true ∪ pred|.
total_tp, total_fp, total_fn = 0, 0, 0
paper_accuracies = []

for paper_id in eval_paper_ids:
    true_rors = paper_to_true_rors.get(paper_id, set())
    pred_rors = set(predictions.get(paper_id, []))

    # Only keep VIP RORs
    true_vip = true_rors & vip_rors
    pred_vip = pred_rors & vip_rors

    tp = len(true_vip & pred_vip)
    fp = len(pred_vip - true_vip)
    fn = len(true_vip - pred_vip)

    total_tp += tp
    total_fp += fp
    total_fn += fn

    # The union splits into exactly these three disjoint parts,
    # so its size is tp + fp + fn.
    denom = tp + fp + fn
    acc = tp / denom if denom > 0 else 1.0  # If both sets empty, consider fully accurate
    paper_accuracies.append(acc)

# Micro metrics
micro_precision = safe_divide(total_tp, total_tp + total_fp)
micro_recall = safe_divide(total_tp, total_tp + total_fn)
micro_f1 = safe_divide(2 * micro_precision * micro_recall, micro_precision + micro_recall)

# Average paper-level accuracy.
# Goes through safe_divide like the ratios above, so an empty evaluation set
# reports 0.0 instead of raising ZeroDivisionError.
avg_paper_acc = safe_divide(sum(paper_accuracies), len(paper_accuracies))

print("\n=== Paper-Level VIP Prediction Evaluation ===")
print(f"Total TP: {total_tp}, FP: {total_fp}, FN: {total_fn}")
print(f"Micro Precision: {micro_precision:.4f}")
print(f"Micro Recall:    {micro_recall:.4f}")
print(f"Micro F1 Score:  {micro_f1:.4f}")
print(f"Avg Paper Accuracy (Jaccard-style): {avg_paper_acc:.4f}")



=== Paper-Level VIP Prediction Evaluation ===
Total TP: 1338, FP: 343, FN: 477
Micro Precision: 0.7960
Micro Recall:    0.7372
Micro F1 Score:  0.7654
Avg Paper Accuracy (Jaccard-style): 0.8216
