Evaluation of Mistral Large Injury Verification

This notebook evaluates the LLM outputs stored in the CSV file generated by
the pipeline notebook (its path is set in `CSV_PATH` below).

Expected columns:

- `human_injury`, `mistral_injury`
- `human_surgery`, `mistral_surgery`
- `human_partial`, `mistral_partial`
- `human_associated_injuries`, `mistral_associated_injuries`
- `human_text_class`, `mistral_text_class`

We compute accuracy, precision, recall (sensitivity), specificity,
F1, confusion matrices, and a multi-class report for text classification.


In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

import matplotlib.pyplot as plt

# Path to the CSV produced by the pipeline notebook
CSV_PATH = "appraisal_results_01.csv" # or "results/appraisal_results_01.csv"

# Load the results table; the first CSV column becomes the DataFrame index.
df = pd.read_csv(CSV_PATH, index_col=0)
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()


In [None]:
# Compute metrics for a binary task. (Assumes labels 0/1 for both human and model columns.)
def binary_metrics(df, human_col, mistral_col):
    """Score one binary task by comparing model predictions with human labels.

    Rows whose value in ``mistral_col`` is not one of 0, 1, "0" or "1" are
    dropped before scoring. The remaining values of both columns are cast
    to ``int``.

    Args:
        df: DataFrame containing both label columns.
        human_col: Name of the column holding the reference labels.
        mistral_col: Name of the column holding the model predictions.

    Returns:
        A ``(metrics, filtered)`` tuple. ``metrics`` is a dict with the
        sample count, accuracy, precision, recall, specificity, F1, the
        four confusion-matrix cells and the 2x2 matrix itself; ``filtered``
        is the copy of ``df`` restricted to the scored rows.

    Raises:
        ValueError: If no row has a usable prediction.
    """
    usable = df[mistral_col].isin([0, 1, "0", "1"])
    filtered = df.loc[usable].copy()

    if filtered.empty:
        raise ValueError(f"No valid rows to evaluate for {mistral_col} (all 'Skipped' or missing).")

    # Both label columns are scored as plain integer arrays.
    y_true = filtered[human_col].astype(int).to_numpy()
    y_pred = filtered[mistral_col].astype(int).to_numpy()

    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall_sensitivity": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }

    # labels=[0, 1] pins the layout to [[tn, fp], [fn, tp]] even when one
    # class is absent from the data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    (tn, fp), (fn, tp) = cm

    # Specificity is reported as 0.0 when there are no negative samples.
    negatives = tn + fp
    specificity = tn / negatives if negatives > 0 else 0.0

    metrics = {
        "n_samples": len(filtered),
        "accuracy": scores["accuracy"],
        "precision": scores["precision"],
        "recall_sensitivity": scores["recall_sensitivity"],
        "specificity": specificity,
        "f1": scores["f1"],
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
        "confusion_matrix": cm,
    }
    return metrics, filtered


In [None]:
def plot_confusion_matrix(cm, title="Confusion matrix", labels=("0", "1")):
    """Draw a confusion matrix as an annotated heat map and display it.

    Args:
        cm: 2-D array of counts. Rows are drawn on the "True" axis and
            columns on the "Predicted" axis.
        title: Title shown above the plot.
        labels: Tick labels, one per class, in matrix order. The number of
            ticks follows ``len(labels)``, so matrices with more than two
            classes can be labelled by passing one label per class.
    """
    # One tick per label: a fixed [0, 1] would make matplotlib reject any
    # label sequence whose length is not exactly two.
    tick_positions = np.arange(len(labels))

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation="nearest")
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    # Write each count in its cell; imshow puts columns on x and rows on y.
    for (i, j), v in np.ndenumerate(cm):
        ax.text(j, i, int(v), ha="center", va="center")

    # Attach the colorbar to this figure/axes explicitly rather than relying
    # on pyplot's "current figure" state.
    fig.colorbar(im, ax=ax)
    plt.show()


In [None]:
# Binary tasks to evaluate: task name -> (human label column, model prediction column).
tasks = {
    "injury":              ("human_injury",              "mistral_injury"),
    "surgery":             ("human_surgery",             "mistral_surgery"),
    "partial":             ("human_partial",             "mistral_partial"),
    "associated_injuries": ("human_associated_injuries", "mistral_associated_injuries"),
}

# Metric keys copied into the summary table, in display order.
summary_columns = (
    "n_samples",
    "accuracy",
    "precision",
    "recall_sensitivity",
    "specificity",
    "f1",
    "tp",
    "fp",
    "tn",
    "fn",
)

summary_rows = []

for name, (y_col, yhat_col) in tasks.items():
    print(f"\n=== {name.upper()} ===")

    m, df_used = binary_metrics(df, y_col, yhat_col)

    print(f"Samples used (non-skipped): {m['n_samples']}")
    print(f"Accuracy:            {m['accuracy']:.4f}")
    print(f"Precision:           {m['precision']:.4f}")
    print(f"Recall (Sensitivity):{m['recall_sensitivity']:.4f}")
    print(f"Specificity:         {m['specificity']:.4f}")
    print(f"F1-score:            {m['f1']:.4f}")
    print(f"TP: {m['tp']} | FP: {m['fp']} | TN: {m['tn']} | FN: {m['fn']}")

    # The en dash is written as an escape so the title cannot be corrupted
    # again if the file is re-saved with a different encoding.
    plot_confusion_matrix(m["confusion_matrix"], title=f"{name} \u2013 confusion matrix")

    summary_rows.append({"task": name, **{col: m[col] for col in summary_columns}})

# One row per task; left as the last expression so the notebook displays it.
summary_df = pd.DataFrame(summary_rows)
summary_df


In [None]:
# Keep only rows whose predicted text class is one of the five class ids,
# stored either as int or as str; anything else (e.g. skipped rows) is
# dropped. Normally no row should be filtered out here.
valid_text_labels = [*range(5), *map(str, range(5))]
valid_mask_text = df["mistral_text_class"].isin(valid_text_labels)
df_text = df.loc[valid_mask_text].copy()

# Cast both label columns to int so human and model labels are comparable.
y_true_text, y_pred_text = (
    df_text[col].astype(int)
    for col in ("human_text_class", "mistral_text_class")
)

print("=== TEXT CLASSIFICATION ===")
print(f"Samples used (non-skipped): {len(df_text)}")
print("Overall accuracy:", accuracy_score(y_true_text, y_pred_text))

# Per-class precision / recall / F1, reported with three decimals.
print("\nClassification report (per class):\n")
text_report = classification_report(y_true_text, y_pred_text, digits=3)
print(text_report)


In [None]:
# Re-score the follow-up tasks on the rows where the human label confirms
# an injury (human_injury == 1).
df_verified = df.loc[df["human_injury"].eq(1)].copy()

print(f"Number of rows with human_injury == 1: {len(df_verified)}")

# Task name -> (human label column, model prediction column).
tasks_verified = dict(
    surgery_given_injury=("human_surgery", "mistral_surgery"),
    partial_given_injury=("human_partial", "mistral_partial"),
    assoc_injuries_given_injury=("human_associated_injuries", "mistral_associated_injuries"),
)

rows_verified = []
for name, (y_col, yhat_col) in tasks_verified.items():
    print(f"\n=== {name.upper()} (subset: human_injury == 1) ===")
    m, df_used = binary_metrics(df_verified, y_col, yhat_col)

    report_lines = [
        f"Samples used:        {m['n_samples']}",
        f"Accuracy:            {m['accuracy']:.4f}",
        f"Precision:           {m['precision']:.4f}",
        f"Recall (Sensitivity):{m['recall_sensitivity']:.4f}",
        f"Specificity:         {m['specificity']:.4f}",
        f"F1-score:            {m['f1']:.4f}",
        f"TP: {m['tp']} | FP: {m['fp']} | TN: {m['tn']} | FN: {m['fn']}",
    ]
    print("\n".join(report_lines))

    # Keep every scalar metric; the matrix itself does not belong in a table row.
    row = {"task": name}
    row.update((k, v) for k, v in m.items() if k != "confusion_matrix")
    rows_verified.append(row)

# Left as the last expression so the notebook displays the summary table.
pd.DataFrame(rows_verified)
