In [11]:
import pandas as pd
import evaluate
import torch
import gc

In [12]:
# ========== Load and Prepare Data ==========
# Replace these paths if needed
# Both paths are relative, so they resolve against the current working directory.
# CSV with the gold-standard rows; read into df_gold below.
gold_path = "../dataset/task_b+c/data/challenge_data/clinicalnlp_taskB_test1.csv"
# CSV with the system outputs to be scored; read into df_pred below.
system_output_path = "../outputs/taskB_run1.csv"

In [13]:
# Load both CSV files into DataFrames: gold references first, then system outputs
df_gold = pd.read_csv(filepath_or_buffer=gold_path)
df_pred = pd.read_csv(filepath_or_buffer=system_output_path)

In [14]:
# Rename and align columns
# Gold file: "note" becomes "reference" (the text scored against) and
# "encounter_id" becomes "EncounterID" so it matches the join key used in the merge.
# System file: "SystemOutput" becomes "prediction".
# rename() returns a new frame, so the result is reassigned instead of mutating the
# loaded frames with inplace=True. Note that rename() skips labels that are absent,
# so a wrong source column name only surfaces later, at the merge.
df_gold = df_gold.rename(columns={"note": "reference", "encounter_id": "EncounterID"})
df_pred = df_pred.rename(columns={"SystemOutput": "prediction"})

In [5]:
# Check that the aligned column names are present before merging.
# This cell used to rename columns on sys_df / gold_df via text_column_sys /
# text_column_gold. None of those names are defined in the visible cells, so the
# cell raised NameError on a fresh kernel, and the rename itself is already done on
# df_gold / df_pred. It now verifies the columns the merge depends on.
for frame_name, frame, required_columns in (
    ("df_gold", df_gold, ("EncounterID", "reference")),
    ("df_pred", df_pred, ("EncounterID", "prediction")),
):
    missing_columns = [col for col in required_columns if col not in frame.columns]
    if missing_columns:
        raise KeyError(f"{frame_name} is missing expected column(s): {missing_columns}")

In [15]:
# Merge
# Inner join of gold rows with their predictions on EncounterID.
# validate="one_to_one" makes pandas raise MergeError when an ID is repeated on
# either side, instead of silently multiplying rows and skewing the averages.
df = df_gold.merge(
    df_pred[["EncounterID", "prediction"]],
    on="EncounterID",
    validate="one_to_one",
)

# An inner join drops gold encounters that have no system output; report that
# rather than quietly scoring a smaller set.
n_unmatched = len(df_gold) - len(df)
if n_unmatched:
    print(f"⚠️ {n_unmatched} gold encounter(s) have no matching prediction and were dropped.")

# read_csv turns blank cells into NaN (a float), but the metrics below expect
# strings, so blank texts are passed on as empty strings.
references = df["reference"].fillna("").astype(str).tolist()
predictions = df["prediction"].fillna("").astype(str).tolist()

In [16]:
# Report how many rows the merge produced; this is the number of pairs scored below.
print(f"✅ Loaded {len(df)} prediction-reference pairs.")


✅ Loaded 40 prediction-reference pairs.


In [17]:
# ========== Metrics ==========
# Free memory before the metric models are loaded: run a garbage-collection pass,
# then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [18]:
# ROUGE
# Score every prediction against its reference; the result is indexed by
# "rouge1", "rouge2", "rougeL" and "rougeLsum" in the results cell.
print("🔍 Computing ROUGE...")
rouge = evaluate.load("rouge")
rouge_inputs = {"predictions": predictions, "references": references}
rouge_scores = rouge.compute(**rouge_inputs)
print("✅ ROUGE computed.")

🔍 Computing ROUGE...
✅ ROUGE computed.


In [19]:
# BERTScore (lighter model, CPU)
print("🔍 Computing BERTScore with bert-base-uncased on CPU...")
# evaluate.load() only builds the metric object; the model and device are chosen
# in compute(). The device="cpu" that used to be passed to load() had no effect,
# so it is given to compute() only.
bertscore = evaluate.load("bertscore")
# The result holds per-pair lists under "precision", "recall" and "f1", which the
# results cell averages.
# NOTE(review): bert-base-uncased accepts at most 512 tokens; presumably longer
# notes are truncated before scoring — confirm against the bert_score docs.
bertscore_scores = bertscore.compute(
    predictions=predictions,
    references=references,
    model_type="bert-base-uncased",
    device="cpu",
)
print("✅ BERTScore computed.")

🔍 Computing BERTScore with bert-base-uncased on CPU...
✅ BERTScore computed.


In [20]:
# BLEURT (on CPU)
# NOTE(review): nothing in this cell selects a device, so "on CPU" holds only if
# the TensorFlow backend has no GPU available to it — confirm.
print("🔍 Computing BLEURT (on CPU)...")
# Load the metric with the BLEURT-20 checkpoint configuration.
bleurt = evaluate.load("bleurt", config_name="BLEURT-20")
# The result's "scores" entry is averaged in the results cell.
bleurt_scores = bleurt.compute(predictions=predictions, references=references)
print("✅ BLEURT computed.")

🔍 Computing BLEURT (on CPU)...
INFO:tensorflow:Reading checkpoint C:\Users\stoic\.cache\huggingface\metrics\bleurt\BLEURT-20\downloads\extracted\c55bb45fd2f4421f7460be96cbe7202b11151d877dd0e707e65c138aa101c4e7\BLEURT-20.


INFO:tensorflow:Reading checkpoint C:\Users\stoic\.cache\huggingface\metrics\bleurt\BLEURT-20\downloads\extracted\c55bb45fd2f4421f7460be96cbe7202b11151d877dd0e707e65c138aa101c4e7\BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\stoic\.cache\huggingface\metrics\bleurt\BLEURT-20\downloads\extracted\c55bb45fd2f4421f7460be96cbe7202b11151d877dd0e707e65c138aa101c4e7\BLEURT-20\sent_piece.model.


INFO:tensorflow:Will load model: C:\Users\stoic\.cache\huggingface\metrics\bleurt\BLEURT-20\downloads\extracted\c55bb45fd2f4421f7460be96cbe7202b11151d877dd0e707e65c138aa101c4e7\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


✅ BLEURT computed.


In [21]:
# ========== Display Results ==========
def average(metric_list):
    """Return the arithmetic mean of ``metric_list`` rounded to 4 decimal places.

    Args:
        metric_list: Iterable of numeric per-example scores. Any iterable is
            accepted (list, tuple, generator, ...); it is consumed once.

    Returns:
        The mean of the scores, rounded to 4 decimals.

    Raises:
        ValueError: If ``metric_list`` contains no scores. An empty input would
            otherwise surface as an unexplained ZeroDivisionError.
    """
    # Materialise first so generators work and the length is known.
    values = list(metric_list)
    if not values:
        raise ValueError("average() requires at least one score; got an empty sequence")
    return round(sum(values) / len(values), 4)

# Summary report: ROUGE values are rounded directly, while the per-pair
# BERTScore and BLEURT lists are reduced with average().
print("\n📊 Evaluation Results:")
for rouge_label, rouge_key in (
    ("ROUGE-1:", "rouge1"),
    ("ROUGE-2:", "rouge2"),
    ("ROUGE-L:", "rougeL"),
    ("ROUGE-Lsum:", "rougeLsum"),
):
    print(rouge_label, round(rouge_scores[rouge_key], 4))

print("\nBERTScore (bert-base-uncased):")
for bert_label, bert_key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1:", "f1"),
):
    print(bert_label, average(bertscore_scores[bert_key]))

print("\nBLEURT:")
print("BLEURT score:", average(bleurt_scores["scores"]))


📊 Evaluation Results:
ROUGE-1: 0.1521
ROUGE-2: 0.0519
ROUGE-L: 0.1012
ROUGE-Lsum: 0.1336

BERTScore (bert-base-uncased):
Precision: 0.5688
Recall: 0.4428
F1: 0.4969

BLEURT:
BLEURT score: 0.3082
