In [1]:
from typing import Dict, List, Tuple
import math
import re

# Optional: NLTK BLEU (more standard than re-implementing)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# import nltk; nltk.download("punkt")  # <- run once if needed

# BERTScore
from bert_score import score as bert_score

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import json, re, math, sys, pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

def norm_key(s: str) -> str:
    """Return a canonical form of a question string, used to match dict keys.

    Whitespace runs are collapsed to a single space, the text is lowercased,
    and punctuation at either end is removed, so ``"Is it  OK?"`` and
    ``"is it ok"`` map to the same key. Punctuation inside the text is kept.

    Args:
        s: Raw question text.

    Returns:
        The normalized key (possibly empty if ``s`` holds no word characters).
    """
    # collapse whitespace & lowercase
    s = re.sub(r"\s+", " ", s).strip().lower()
    # strip punctuation noise at ends (e.g. a trailing "?" present in only one
    # of the two files); [\W_] also swallows whitespace exposed by the removal
    s = re.sub(r"^[\W_]+|[\W_]+$", "", s)
    return s

def normalize_text(t: str) -> str:
    """Trim and lowercase ``t``, then squeeze each whitespace run into one space."""
    trimmed_lower = t.strip().lower()
    return re.sub(r"\s+", " ", trimmed_lower)

def flatten(d: Dict) -> Dict[str, str]:
    """Merge the per-section question/answer dicts of ``d`` into one mapping.

    ``d`` is expected to look like ``{section: {question: answer}}`` (the
    sections being e.g. Patients/Doctors/Manufacturer/Regulators). Top-level
    values that are not dicts are ignored. When the same question occurs in
    more than one section, the answer from the later section is kept.
    """
    return {
        question: answer
        for qa in d.values()
        if isinstance(qa, dict)
        for question, answer in qa.items()
    }

def bleu_per_item(preds: List[str], refs: List[str]) -> List[float]:
    """Compute one sentence-level BLEU score per (prediction, reference) pair.

    Pairs are formed positionally with ``zip``. Both texts are tokenized by
    whitespace, each prediction is scored against its single reference, and
    NLTK's ``method1`` smoothing is applied.
    """
    smoothing = SmoothingFunction().method1
    return [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        for pred, ref in zip(preds, refs)
    ]

def bertscore_lists(preds: List[str], refs: List[str]):
    """Run BERTScore (``lang="en"``, non-verbose) on ``preds`` vs ``refs``.

    Returns:
        A ``(precision, recall, f1)`` tuple in which each element is the
        ``.tolist()`` conversion of the corresponding scorer output.
    """
    raw_scores = bert_score(preds, refs, lang="en", verbose=False)
    precision, recall, f1 = (component.tolist() for component in raw_scores)
    return precision, recall, f1




In [5]:
# ---------------- configure paths ----------------
# Model answers to be evaluated.
PRED_PATH = Path("llm_query_answer_topten.json")
# Gold reference answers the predictions are compared against.
REF_PATH = Path("answer_ref.json")
# CSV file that receives the per-question scores.
OUT_CSV = Path("scores.csv")

# ---------------- load ----------------
# Predictions are read first, then references; both files are UTF-8 JSON.
pred_raw, ref_raw = (
    json.loads(path.read_text(encoding="utf-8")) for path in (PRED_PATH, REF_PATH)
)

preds_dict, refs_dict = flatten(pred_raw), flatten(ref_raw)

# Map each normalized question back to its original wording in that file
pred_map = {norm_key(question): question for question in preds_dict}
ref_map = {norm_key(question): question for question in refs_dict}

# Normalized keys present in both files, kept in reference order
common_norm = [key for key in ref_map if key in pred_map]
# Original wording of the questions that exist on one side only
missing_in_preds = [question for key, question in ref_map.items() if key not in pred_map]
missing_in_refs = [question for key, question in pred_map.items() if key not in ref_map]

# Report unmatched questions so they can be fixed in the JSON files
if missing_in_preds:
    print("These reference questions mistmatch with preds (after normalization):")
    for m in missing_in_preds:
        print(" -", m)
if missing_in_refs:
    print(" These prediction questions were not found in refs (after normalization):")
    for m in missing_in_refs:
        print(" -", m)

 These prediction questions were not found in refs (after normalization):
 - What does the report say about the retainer ring being cracked or loose?


In [6]:

# Parallel lists over the matched questions: the reference wording of each
# question plus the normalized reference and predicted answers.
questions = [ref_map[k] for k in common_norm]  # use canonical (ref) wording
y_ref = [normalize_text(refs_dict[ref_map[k]]) for k in common_norm]
y_pred = [normalize_text(preds_dict[pred_map[k]]) for k in common_norm]

# BLEU
bleu_scores = bleu_per_item(y_pred, y_ref)
bleu_avg = sum(bleu_scores)/len(bleu_scores) if bleu_scores else float("nan")

# BERTScore
# Only run the scorer when at least one question matched, and guard the means
# the same way as the BLEU mean above: with no matches the averages are NaN
# instead of a ZeroDivisionError from dividing by len(P) == 0.
if y_pred:
    P, R, F1 = bertscore_lists(y_pred, y_ref)
else:
    print("No questions matched between predictions and references; nothing to score.")
    P, R, F1 = [], [], []
Pm, Rm, Fm = (sum(v)/len(v) if v else float("nan") for v in (P, R, F1))

# Report: one row per matched question, ordered alphabetically by question
df = pd.DataFrame({
    "question": questions,
    "bleu": bleu_scores,
    "bert_P": P,
    "bert_R": R,
    "bert_F1": F1
}).sort_values("question")

# The figure labelled "Corpus BLEU" is the mean of the sentence-level BLEU
# scores computed above, not a corpus-level BLEU.
print(f"\nCorpus BLEU (mean sentence BLEU): {bleu_avg:.4f}")
print(f"Mean BERTScore  P={Pm:.4f}  R={Rm:.4f}  F1={Fm:.4f}\n")
print(df.to_string(index=False, max_colwidth=100))

df.to_csv(OUT_CSV, index=False)
print(f"\nSaved per-item scores to {OUT_CSV.resolve()}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Corpus BLEU (mean sentence BLEU): 0.0035
Mean BERTScore  P=0.8288  R=0.8549  F1=0.8415

                                                                                            question     bleu   bert_P   bert_R  bert_F1
                                   Are adhesive failures or peeling overlays reported in pump usage? 0.001843 0.824287 0.824589 0.824438
         Are cosmetic damages like cracked battery tubes or retainer rings discussed in any reports? 0.011980 0.845396 0.884026 0.864280
                          Are cracks in the device case mentioned in relation to patient complaints? 0.001780 0.837654 0.881060 0.858809
      Are injury or malfunction related keywords present in reports marked where adverse event is No 0.001654 0.822655 0.866879 0.844188
Are there any cases where the adverse event flag is marked as No but the predicted Adverse event ... 0.001659 0.820946 0.853353 0.836836
                 Are there any reports describing nausea, vomiting, or hyperglycemia afte

### Scores against long-form reference answers

In [10]:
# ---------------- configure paths ----------------
# Model answers to be evaluated (same predictions file as the previous run).
PRED_PATH = Path("llm_query_answer_topten.json")
# Reference answers used for this run.
REF_PATH = Path("long_answer_ref.json")
# CSV file that receives the per-question scores for this run.
OUT_CSV = Path("scores_long_ans.csv")

# ---------------- load ----------------
# Predictions are read first, then references; both files are UTF-8 JSON.
pred_raw, ref_raw = (
    json.loads(path.read_text(encoding="utf-8")) for path in (PRED_PATH, REF_PATH)
)

preds_dict, refs_dict = flatten(pred_raw), flatten(ref_raw)

# Map each normalized question back to its original wording in that file
pred_map = {norm_key(question): question for question in preds_dict}
ref_map = {norm_key(question): question for question in refs_dict}

# Normalized keys present in both files, kept in reference order
common_norm = [key for key in ref_map if key in pred_map]
# Original wording of the questions that exist on one side only
missing_in_preds = [question for key, question in ref_map.items() if key not in pred_map]
missing_in_refs = [question for key, question in pred_map.items() if key not in ref_map]

# Report unmatched questions so they can be fixed in the JSON files
if missing_in_preds:
    print("These reference questions mistmatch with preds (after normalization):")
    for m in missing_in_preds:
        print(" -", m)
if missing_in_refs:
    print(" These prediction questions were not found in refs (after normalization):")
    for m in missing_in_refs:
        print(" -", m)

In [11]:

# Parallel lists over the matched questions: the reference wording of each
# question plus the normalized reference and predicted answers.
questions = [ref_map[k] for k in common_norm]  # use canonical (ref) wording
y_ref = [normalize_text(refs_dict[ref_map[k]]) for k in common_norm]
y_pred = [normalize_text(preds_dict[pred_map[k]]) for k in common_norm]

# BLEU
bleu_scores = bleu_per_item(y_pred, y_ref)
bleu_avg = sum(bleu_scores)/len(bleu_scores) if bleu_scores else float("nan")

# BERTScore
# Only run the scorer when at least one question matched, and guard the means
# the same way as the BLEU mean above: with no matches the averages are NaN
# instead of a ZeroDivisionError from dividing by len(P) == 0.
if y_pred:
    P, R, F1 = bertscore_lists(y_pred, y_ref)
else:
    print("No questions matched between predictions and references; nothing to score.")
    P, R, F1 = [], [], []
Pm, Rm, Fm = (sum(v)/len(v) if v else float("nan") for v in (P, R, F1))

# Report: one row per matched question, ordered alphabetically by question
df = pd.DataFrame({
    "question": questions,
    "bleu": bleu_scores,
    "bert_P": P,
    "bert_R": R,
    "bert_F1": F1
}).sort_values("question")

# The figure labelled "Corpus BLEU" is the mean of the sentence-level BLEU
# scores computed above, not a corpus-level BLEU.
print(f"\nCorpus BLEU (mean sentence BLEU): {bleu_avg:.4f}")
print(f"Mean BERTScore  P={Pm:.4f}  R={Rm:.4f}  F1={Fm:.4f}\n")
print(df.to_string(index=False, max_colwidth=100))

df.to_csv(OUT_CSV, index=False)
print(f"\nSaved per-item scores to {OUT_CSV.resolve()}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Corpus BLEU (mean sentence BLEU): 0.0104
Mean BERTScore  P=0.8437  R=0.8755  F1=0.8591

                                                                                            question     bleu   bert_P   bert_R  bert_F1
                                   Are adhesive failures or peeling overlays reported in pump usage? 0.002998 0.827543 0.846378 0.836855
         Are cosmetic damages like cracked battery tubes or retainer rings discussed in any reports? 0.018523 0.859474 0.902566 0.880493
                          Are cracks in the device case mentioned in relation to patient complaints? 0.005208 0.845445 0.902300 0.872948
      Are injury or malfunction related keywords present in reports marked where adverse event is No 0.012243 0.829888 0.868389 0.848702
Are there any cases where the adverse event flag is marked as No\n                  but the predi... 0.015436 0.843640 0.889254 0.865847
                 Are there any reports describing nausea, vomiting, or hyperglycemia afte

In [14]:
import json
import pandas as pd
from pathlib import Path
from sacrebleu.metrics import CHRF
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# ---------------- CONFIG ----------------
# Model answers under evaluation
PRED_PATH = Path("llm_query_answer_topten.json")
# Reference answers to compare against
REF_PATH = Path("long_answer_ref.json")
# Output file for the full per-question metrics table
OUT_CSV = Path("score_analyse_long_ans.csv")
# Number of lowest-scoring questions listed for each metric
TOP_K_LOW = 5

# ---------------- LOAD ----------------
# Predictions are read first, then references; both files are UTF-8 JSON.
pred_raw, ref_raw = (
    json.loads(path.read_text(encoding="utf-8")) for path in (PRED_PATH, REF_PATH)
)

# Flatten nested structure
def flatten(d):
    """Collapse ``{section: {question: answer}}`` into ``{question: answer}``.

    NOTE: this redefines the ``flatten`` helper from an earlier cell; unlike
    that version, it strips surrounding whitespace from every answer.

    Top-level values that are not dicts are skipped. If a question appears in
    several sections, the answer from the last one wins. Answers that are not
    strings (e.g. JSON numbers, lists or ``null``) are converted to text
    instead of raising ``AttributeError`` on ``.strip()``; ``None`` becomes
    the empty string.

    Args:
        d: Parsed JSON object keyed by section name.

    Returns:
        Dict mapping each question to its stripped answer text.
    """
    flat = {}
    for qa in d.values():
        if not isinstance(qa, dict):
            continue
        for q, a in qa.items():
            if a is None:
                a = ""
            elif not isinstance(a, str):
                # downstream metrics need text, so coerce instead of crashing
                a = str(a)
            flat[q] = a.strip()
    return flat

preds = flatten(pred_raw)
refs = flatten(ref_raw)

# Align keys exactly: only questions spelled identically in both files are scored
common_qs = sorted(preds.keys() & refs.keys())
if len(common_qs) != len(refs):
    print(f"⚠ Missing keys — only {len(common_qs)}/{len(refs)} matched between preds and refs.")

# Predicted and reference answers in the same (sorted-question) order
y_pred = [preds[question] for question in common_qs]
y_ref = [refs[question] for question in common_qs]

# ---------------- METRICS ----------------
# chrF++ (CHRF with word_order=2), one sentence-level score per question
chrf = CHRF(word_order=2)
chrf_scores = [
    chrf.sentence_score(candidate, [gold]).score
    for candidate, gold in zip(y_pred, y_ref)
]

# ROUGE-L F-measure with stemming; the scorer is called as score(reference, prediction)
rouger = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
rougeL_f = [
    rouger.score(gold, candidate)["rougeL"].fmeasure
    for candidate, gold in zip(y_pred, y_ref)
]

# BERTScore (roberta-large, rescaled with baseline), converted to plain lists
P, R, F1 = (
    component.tolist()
    for component in bert_score(
        y_pred,
        y_ref,
        lang="en",
        verbose=True,
        rescale_with_baseline=True,
        model_type="roberta-large",
    )
)

# ---------------- SAVE & ANALYZE ----------------
# One row per question, best BERT F1 first
df = pd.DataFrame({
    "question": common_qs,
    "bert_P": P,
    "bert_R": R,
    "bert_F1": F1,
    "chrF++": chrf_scores,
    "ROUGE-L_F1": rougeL_f,
    "prediction": y_pred,
    "reference": y_ref
}).sort_values(by="bert_F1", ascending=False)

df.to_csv(OUT_CSV, index=False)
print(f"Saved full metrics to {OUT_CSV.resolve()}")

# For each metric in turn, list the TOP_K_LOW questions with the lowest score
report_cols = ["question", "bert_F1", "chrF++", "ROUGE-L_F1"]
for heading, sort_col in (
    ("\n=== Lowest BERT F1 questions ===", "bert_F1"),
    ("\n=== Lowest chrF++ questions ===", "chrF++"),
    ("\n=== Lowest ROUGE-L questions ===", "ROUGE-L_F1"),
):
    print(heading)
    print(df.sort_values(sort_col).head(TOP_K_LOW)[report_cols])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:01<00:00,  1.51s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 74.60it/s]

done in 1.54 seconds, 16.83 sentences/sec
Saved full metrics to C:\vaish\score_analyse_long_ans.csv

=== Lowest BERT F1 questions ===
                                             question   bert_F1     chrF++  \
0   Are adhesive failures or peeling overlays repo...  0.061337  19.074895   
10  Are there examples where predicted adverse eve...  0.071462  19.218829   
24  What problems are described with insulin deliv...  0.072407  23.676587   
25  Which kinds of malfunctions are mentioned for ...  0.082178  29.687894   
20  What does the FOI_TEXT say about the retainer ...  0.095517  24.636944   

    ROUGE-L_F1  
0     0.083333  
10    0.108108  
24    0.141026  
25    0.212766  
20    0.128000  

=== Lowest chrF++ questions ===
                                             question   bert_F1     chrF++  \
18  Was a patient hospitalized due to pump malfunc...  0.102463  18.552426   
0   Are adhesive failures or peeling overlays repo...  0.061337  19.074895   
10  Are there examples where




In [13]:
!pip install sacrebleu rouge-score bert-score pandas

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting lxml (from sacrebleu)
  Downloading lxml-6.0.0-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Collecting absl-py (from rouge-score)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Using cached absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading lxml-6.0.0-cp310-cp310-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 4.0/4.0 MB 21.7 MB/s eta 0:00:00
Building wheels for collected packages: rouge-score
  Building wheel 

  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334
