# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score)
from tqdm import tqdm
from sklearn.preprocessing import label_binarize

# Load the two workbooks: df_model is later used as the predictions and
# df_annotated as the reference labels.
# NOTE(review): both paths are the same placeholder here - presumably they
# should point at two different files; confirm before running.
model_xlsx = Path(r"data_source\xxx.xlsx")
annotated_xlsx = Path(r"data_source\xxx.xlsx")

df_model = pd.read_excel(model_xlsx)
df_annotated = pd.read_excel(annotated_xlsx)

# Binary categories that are evaluated one by one further down.
binary_categories = [
    "Practical barriers to vaccination (-)",
    "Perceived barriers to accepting vaccines (-)",
    "Behavior",
    "Perceived benefits (+)",
    "Misinformation (-)",
    "Perceived Disease Risk (+)",
    "Social norms  cues to action (+)",
]

# Single source of truth for the columns kept in both frames. Deriving it
# from binary_categories guarantees that every evaluated category (including
# "Behavior", which the previous hand-written lists left out) survives the
# selection instead of raising a KeyError in the middle of the evaluation.
selected_columns = binary_categories + ["Attitude"]

# Report missing columns up front, naming the offending frame.
for frame_name, frame in (("df_model", df_model), ("df_annotated", df_annotated)):
    missing_columns = [col for col in selected_columns if col not in frame.columns]
    if missing_columns:
        raise KeyError(
            f"{frame_name} is missing required column(s): {missing_columns}"
        )

# Keep only the evaluated columns (this selects columns; nothing is renamed).
df_model = df_model.loc[:, selected_columns]
df_annotated = df_annotated.loc[:, selected_columns]

# Rows are compared position by position, so both frames need the same length.
# An explicit raise is used because assert statements are skipped under -O.
if len(df_model) != len(df_annotated):
    raise ValueError("DataFrames have different lengths")

# Column layouts of the three result tables. Every column starts as its own
# empty list and receives one entry per appended row.
results = {
    column: []
    for column in (
        'Category',
        'Accuracy',
        'Precision',
        'Recall',
        'F1',
        'AUC',
        'Model Pos/Neg Count',
        'Annotated Pos/Neg Count',
    )
}

attitude_results = {
    column: []
    for column in (
        'Class',
        'Accuracy',
        'Precision',
        'Recall',
        'F1',
        'AUC (OvR)',
        'Model Count',
        'Annotated Count',
    )
}

attitude_macro_results = {
    column: []
    for column in (
        'Metric',
        'Value',
        'Model Class Distribution',
        'Annotated Class Distribution',
    )
}

# Bootstrap settings
alpha = 0.05  # two-sided level, i.e. a 95% CI
n_bootstraps = 10_000  # number of resamples per bootstrap loop

def format_result(point, lower, upper):
    """Render a point estimate with its interval bounds as ``"p (lo, hi)"``.

    All three numbers are printed with four decimal places.
    """
    template = "{:.4f} ({:.4f}, {:.4f})"
    return template.format(point, lower, upper)

def format_counts(count_dict):
    """Return one ``"key: value"`` line per dictionary entry, newline-joined.

    Entries appear in the dictionary's iteration order; an empty dictionary
    yields an empty string.
    """
    entries = (f"{label}: {count}" for label, count in count_dict.items())
    return "\n".join(entries)

def calculate_ci(data, significance=None):
    """Return the two-sided percentile interval of a set of replicates.

    Args:
        data: Sequence of replicate values (for example one metric computed
            on every bootstrap resample).
        significance: Two-sided significance level. ``None`` (the default)
            uses the module-level ``alpha``, so existing one-argument calls
            keep their behaviour.

    Returns:
        Tuple ``(lower, upper)`` holding the ``100 * significance / 2`` and
        ``100 * (1 - significance / 2)`` percentiles. NaN replicates are
        ignored; when no replicate is left, ``(nan, nan)`` is returned
        instead of failing or propagating NaN.
    """
    level = alpha if significance is None else significance

    values = np.asarray(data, dtype=float)
    # Undefined replicates must not poison the whole interval.
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan, np.nan

    lower = np.percentile(values, 100 * level / 2)
    upper = np.percentile(values, 100 * (1 - level / 2))
    return lower, upper

# Binary category evaluation: point estimates plus percentile-bootstrap CIs.

# Fail fast: a missing column would otherwise only surface as a KeyError
# after the preceding categories have already been bootstrapped.
missing_categories = [
    name
    for name in binary_categories
    if name not in df_annotated.columns or name not in df_model.columns
]
if missing_categories:
    raise KeyError(f"Binary categories missing from the data: {missing_categories}")

# NOTE: resampling draws from NumPy's global RNG; seed it beforehand
# (np.random.seed) if the intervals have to be reproducible.
for category in binary_categories:
    print(f"\nEvaluating binary category: {category}")

    # Any value > 0 counts as positive; everything else (NaN included, since
    # NaN > 0 is False) counts as negative.
    y_true = (df_annotated[category].values > 0).astype(int)
    y_pred = (df_model[category].values > 0).astype(int)
    n_samples = len(y_true)

    # class counting
    model_pos = int(y_pred.sum())
    model_neg = len(y_pred) - model_pos
    annotated_pos = int(y_true.sum())
    annotated_neg = len(y_true) - annotated_pos

    model_counts = {"Model Pos": model_pos, "Model Neg": model_neg}
    annotated_counts = {"Annotated Pos": annotated_pos, "Annotated Neg": annotated_neg}

    # initialize bootstrap storage
    boot_acc, boot_prec, boot_rec, boot_f1, boot_auc = [], [], [], [], []

    # bootstrap sampling (rows are resampled with replacement)
    for _ in tqdm(range(n_bootstraps), desc=f"Bootstrapping {category}"):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        sample_true = y_true[indices]
        sample_pred = y_pred[indices]

        # These four metrics are always recorded. Previously a failing AUC
        # discarded the whole resample, which biased (or emptied) their CIs.
        boot_acc.append(accuracy_score(sample_true, sample_pred))
        boot_prec.append(precision_score(sample_true, sample_pred, zero_division=0))
        boot_rec.append(recall_score(sample_true, sample_pred, zero_division=0))
        boot_f1.append(f1_score(sample_true, sample_pred, zero_division=0))

        # AUC is undefined when the resample holds a single true class
        # (ValueError, or NaN depending on the scikit-learn version); only
        # the AUC replicate is skipped in that case. The AUC is computed from
        # the hard 0/1 predictions, not from scores.
        try:
            auc = roc_auc_score(sample_true, sample_pred)
        except ValueError:
            auc = np.nan
        if not np.isnan(auc):
            boot_auc.append(auc)

    # point estimation on the full data
    point_acc = accuracy_score(y_true, y_pred)
    point_prec = precision_score(y_true, y_pred, zero_division=0)
    point_rec = recall_score(y_true, y_pred, zero_division=0)
    point_f1 = f1_score(y_true, y_pred, zero_division=0)

    try:
        point_auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        point_auc = np.nan

    # CI estimation
    acc_lower, acc_upper = calculate_ci(boot_acc)
    prec_lower, prec_upper = calculate_ci(boot_prec)
    rec_lower, rec_upper = calculate_ci(boot_rec)
    f1_lower, f1_upper = calculate_ci(boot_f1)
    auc_lower, auc_upper = calculate_ci(boot_auc) if boot_auc else (np.nan, np.nan)

    # save results
    results['Category'].append(category)
    results['Accuracy'].append(format_result(point_acc, acc_lower, acc_upper))
    results['Precision'].append(format_result(point_prec, prec_lower, prec_upper))
    results['Recall'].append(format_result(point_rec, rec_lower, rec_upper))
    results['F1'].append(format_result(point_f1, f1_lower, f1_upper))
    results['AUC'].append(
        format_result(point_auc, auc_lower, auc_upper) if not np.isnan(point_auc) else "N/A"
    )
    results['Model Pos/Neg Count'].append(format_counts(model_counts))
    results['Annotated Pos/Neg Count'].append(format_counts(annotated_counts))

# Three-class evaluation of the "Attitude" column.
print("\nEvaluating multiclass category: Attitude")

# NOTE(review): assumes Attitude is coded as the integers 0, 1, 2 - confirm.
attitude_labels = [0, 1, 2]

y_true_att = df_annotated["Attitude"].values.astype(int)
y_pred_att = df_model["Attitude"].values.astype(int)

# class counting
model_class_counts = {
    f"Model Class {i}": int(np.sum(y_pred_att == i)) for i in attitude_labels
}
annotated_class_counts = {
    f"Annotated Class {i}": int(np.sum(y_true_att == i)) for i in attitude_labels
}


def _one_vs_rest_auc(true_bin, pred):
    """Return one one-vs-rest AUC per class; NaN where it is undefined.

    ``true_bin`` is the binarized truth (one column per class) and ``pred``
    the hard class predictions. The AUC of a class is undefined when its
    truth column holds a single value.
    """
    scores = []
    for column, label in enumerate(attitude_labels):
        try:
            score = roc_auc_score(true_bin[:, column], (pred == label).astype(int))
        except ValueError:
            score = np.nan
        scores.append(score)
    return scores


# initialize bootstrap storage
boot_acc_att, boot_prec_macro_att, boot_rec_macro_att, boot_f1_macro_att = [], [], [], []
boot_prec_per_class, boot_rec_per_class, boot_f1_per_class = [], [], []
boot_auc_macro_att, boot_auc_per_class = [], []

y_true_bin = label_binarize(y_true_att, classes=attitude_labels)

# bootstrap sampling (rows are resampled with replacement)
# NOTE: resampling draws from NumPy's global RNG; seed it beforehand
# (np.random.seed) if the intervals have to be reproducible.
for _ in tqdm(range(n_bootstraps), desc="Bootstrapping Attitude"):
    indices = np.random.choice(len(y_true_att), len(y_true_att), replace=True)
    sample_true = y_true_att[indices]
    sample_pred = y_pred_att[indices]
    sample_true_bin = y_true_bin[indices]

    try:
        acc = accuracy_score(sample_true, sample_pred)

        prec_macro = precision_score(sample_true, sample_pred, average='macro', zero_division=0)
        rec_macro = recall_score(sample_true, sample_pred, average='macro', zero_division=0)
        f1_macro = f1_score(sample_true, sample_pred, average='macro', zero_division=0)

        # labels= pins the per-class arrays to length 3 in class order.
        # Without it, a resample that lacks a class returns a shorter array
        # and positional indexing would report values under the wrong class.
        prec_per_class = precision_score(
            sample_true, sample_pred, labels=attitude_labels, average=None, zero_division=0
        )
        rec_per_class = recall_score(
            sample_true, sample_pred, labels=attitude_labels, average=None, zero_division=0
        )
        f1_per_class = f1_score(
            sample_true, sample_pred, labels=attitude_labels, average=None, zero_division=0
        )
    except Exception as e:
        print(f"Error in bootstrap: {e}")
        continue

    sample_auc_scores = _one_vs_rest_auc(sample_true_bin, sample_pred)
    auc_macro = np.nanmean(sample_auc_scores) if not all(np.isnan(sample_auc_scores)) else np.nan

    boot_acc_att.append(acc)
    boot_prec_macro_att.append(prec_macro)
    boot_rec_macro_att.append(rec_macro)
    boot_f1_macro_att.append(f1_macro)
    boot_prec_per_class.append(prec_per_class)
    boot_rec_per_class.append(rec_per_class)
    boot_f1_per_class.append(f1_per_class)
    boot_auc_macro_att.append(auc_macro)
    boot_auc_per_class.append(sample_auc_scores)

# point estimation on the full data
point_acc_att = accuracy_score(y_true_att, y_pred_att)
point_prec_macro_att = precision_score(y_true_att, y_pred_att, average='macro', zero_division=0)
point_rec_macro_att = recall_score(y_true_att, y_pred_att, average='macro', zero_division=0)
point_f1_macro_att = f1_score(y_true_att, y_pred_att, average='macro', zero_division=0)
point_prec_per_class = precision_score(
    y_true_att, y_pred_att, labels=attitude_labels, average=None, zero_division=0
)
point_rec_per_class = recall_score(
    y_true_att, y_pred_att, labels=attitude_labels, average=None, zero_division=0
)
point_f1_per_class = f1_score(
    y_true_att, y_pred_att, labels=attitude_labels, average=None, zero_division=0
)

# auc_scores holds the per-class point estimates used for the per-class table.
auc_scores = _one_vs_rest_auc(y_true_bin, y_pred_att)
point_auc_macro_att = np.nanmean(auc_scores) if not all(np.isnan(auc_scores)) else np.nan

# Percentile CIs for the overall accuracy and the macro-averaged metrics
acc_lower_att, acc_upper_att = calculate_ci(boot_acc_att)
prec_macro_lower, prec_macro_upper = calculate_ci(boot_prec_macro_att)
rec_macro_lower, rec_macro_upper = calculate_ci(boot_rec_macro_att)
f1_macro_lower, f1_macro_upper = calculate_ci(boot_f1_macro_att)
if boot_auc_macro_att:
    auc_macro_lower, auc_macro_upper = calculate_ci(boot_auc_macro_att)
else:
    auc_macro_lower, auc_macro_upper = np.nan, np.nan

# An undefined macro AUC is reported as "N/A" instead of a formatted value
if np.isnan(point_auc_macro_att):
    auc_macro_text = "N/A"
else:
    auc_macro_text = format_result(point_auc_macro_att, auc_macro_lower, auc_macro_upper)

# One (metric label, formatted value) pair per row of the macro table
macro_rows = [
    ('Accuracy', format_result(point_acc_att, acc_lower_att, acc_upper_att)),
    ('Precision (macro)', format_result(point_prec_macro_att, prec_macro_lower, prec_macro_upper)),
    ('Recall (macro)', format_result(point_rec_macro_att, rec_macro_lower, rec_macro_upper)),
    ('F1 (macro)', format_result(point_f1_macro_att, f1_macro_lower, f1_macro_upper)),
    ('AUC (macro)', auc_macro_text),
]

# Save the macro results; only the first row carries the class
# distributions, the remaining rows leave those two columns blank.
for row_number, (metric_label, value_text) in enumerate(macro_rows):
    is_first_row = row_number == 0
    attitude_macro_results['Metric'].append(metric_label)
    attitude_macro_results['Value'].append(value_text)
    attitude_macro_results['Model Class Distribution'].append(
        format_counts(model_class_counts) if is_first_row else ""
    )
    attitude_macro_results['Annotated Class Distribution'].append(
        format_counts(annotated_class_counts) if is_first_row else ""
    )

# Per-class rows of the Attitude table
for class_idx in range(3):
    # Collect this class's value from every bootstrap replicate that has an
    # entry at this position; NaN AUC replicates are left out.
    class_prec = [scores[class_idx] for scores in boot_prec_per_class if class_idx < len(scores)]
    class_rec = [scores[class_idx] for scores in boot_rec_per_class if class_idx < len(scores)]
    class_f1 = [scores[class_idx] for scores in boot_f1_per_class if class_idx < len(scores)]
    class_auc = [
        scores[class_idx]
        for scores in boot_auc_per_class
        if class_idx < len(scores) and not np.isnan(scores[class_idx])
    ]

    # CI estimation
    prec_lower, prec_upper = calculate_ci(class_prec)
    rec_lower, rec_upper = calculate_ci(class_rec)
    f1_lower, f1_upper = calculate_ci(class_f1)
    if class_auc:
        auc_lower, auc_upper = calculate_ci(class_auc)
    else:
        auc_lower, auc_upper = np.nan, np.nan

    # class counting
    model_count = model_class_counts[f"Model Class {class_idx}"]
    annotated_count = annotated_class_counts[f"Annotated Class {class_idx}"]

    # Build the whole row first, then append it column by column.
    # The Accuracy cell is the overall accuracy, repeated on every class row.
    point_auc_class = auc_scores[class_idx]
    if np.isnan(point_auc_class):
        auc_text = "N/A"
    else:
        auc_text = format_result(point_auc_class, auc_lower, auc_upper)

    class_row = {
        'Class': f"Class {class_idx}",
        'Accuracy': format_result(point_acc_att, acc_lower_att, acc_upper_att),
        'Precision': format_result(point_prec_per_class[class_idx], prec_lower, prec_upper),
        'Recall': format_result(point_rec_per_class[class_idx], rec_lower, rec_upper),
        'F1': format_result(point_f1_per_class[class_idx], f1_lower, f1_upper),
        'AUC (OvR)': auc_text,
        'Model Count': f"Model: {model_count}",
        'Annotated Count': f"Annotated: {annotated_count}",
    }
    for column_name, cell in class_row.items():
        attitude_results[column_name].append(cell)

# Turn the accumulated column dictionaries into tables
binary_results_df = pd.DataFrame(results)
attitude_results_df = pd.DataFrame(attitude_results)
attitude_macro_df = pd.DataFrame(attitude_macro_results)

# Console report: a heading followed by the table, for each result set
for heading, table in (
    ("\nBinary Categories Results:", binary_results_df),
    ("\nAttitude (Multiclass) Results - Macro Average:", attitude_macro_df),
    ("\nAttitude (Multiclass) Results - Per Class:", attitude_results_df),
):
    print(heading)
    print(table)

# Excel export: one sheet per table, without the index column.
# NOTE(review): this placeholder path is identical to the input paths used
# for loading - presumably the real output file differs; confirm so the
# inputs are not overwritten.
output_path = Path(r"data_source\xxx.xlsx")
sheets = {
    "Binary Categories": binary_results_df,
    "Attitude (Macro)": attitude_macro_df,
    "Attitude (Per Class)": attitude_results_df,
}
with pd.ExcelWriter(output_path) as writer:
    for sheet_title, table in sheets.items():
        table.to_excel(writer, sheet_name=sheet_title, index=False)
print(f"\nResults saved to: {output_path}")