# BERTScore Similarity Analysis

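This notebook computes BERTScore similarity metrics (precision, recall, and F1) for pairs of generated SOAP notes, comparing each adversarial note against its clean counterpart. It reads the notes from the `../data` directory and saves the results to `../results/bertscore_results.csv` (both paths are relative to the notebook's working directory).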

In [None]:
import os
import pandas as pd
from bert_score import score as bertscore

# Where the input notes live and where outputs are written
BASE_FOLDER = '../data'
RESULTS_FOLDER = '../results'

# (clean, adversarial) file names -- one tuple per comparison; extend as pairs are added
note_pairs = [
    ("1_CLEAN_soap_notes.txt", "1_ADV_soap_notes.txt"),
    ("2_CLEAN_soap_notes.txt", "2_ADV_soap_notes.txt"),
]


def _read_note(file_name):
    """Return the full UTF-8 text of `file_name` located under BASE_FOLDER."""
    with open(os.path.join(BASE_FOLDER, file_name), 'r', encoding='utf-8') as handle:
        return handle.read()


# Read pair by pair so that index i of both lists refers to the same note pair
clean_texts = []
adversarial_texts = []
for clean_name, adv_name in note_pairs:
    clean_texts.append(_read_note(clean_name))
    adversarial_texts.append(_read_note(adv_name))

print(f"Loaded {len(clean_texts)} clean notes and {len(adversarial_texts)} adversarial notes.")


In [None]:
# Compute BERTScore metrics.
# The adversarial notes are passed as candidates and the clean notes as references,
# so row i scores adversarial note i against clean note i.
MODEL_TYPE = 'emilyalsentzer/Bio_ClinicalBERT'

# When `num_layers` is omitted, bert_score looks the value up in its built-in
# per-model table; checkpoints outside that table (such as this one, as far as we
# can tell) fail with a KeyError, so the layer is passed explicitly.
# NOTE(review): 9 mirrors the library default for bert-base models and has not been
# tuned for Bio_ClinicalBERT -- confirm or adjust.
NUM_LAYERS = 9

P, R, F1 = bertscore(
    adversarial_texts,
    clean_texts,
    model_type=MODEL_TYPE,
    num_layers=NUM_LAYERS,
    device='cpu',
)

# One row per note pair, labelled in the order the pairs were loaded
results_df = pd.DataFrame({
    'pair': [f'Pair {i+1}' for i in range(len(clean_texts))],
    'precision': P.tolist(),
    'recall': R.tolist(),
    'f1': F1.tolist()
})

# Display results (last expression of the cell is rendered as a table)
results_df


In [None]:
# Write the score table to a CSV file inside the results folder
csv_name = 'bertscore_results.csv'
output_path = os.path.join(RESULTS_FOLDER, csv_name)

# exist_ok=True makes this a no-op when the folder is already there
os.makedirs(RESULTS_FOLDER, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file
results_df.to_csv(output_path, index=False)
print(f'Saved BERTScore results to {output_path}')
